# RUN THIS FILE ON GOOGLE COLAB

In [None]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from scipy.sparse import hstack
import os
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the dataset CSV inside the Colab runtime (upload the file there before running).
DATA_PATH = '/content/code_classification_dataset_v3.csv'

In [None]:
import ast

df = pd.read_csv(DATA_PATH)
# The tag column is stored as a stringified Python list. ast.literal_eval only
# parses literals, so a malformed or malicious cell cannot execute arbitrary
# code the way eval() would.
df["new_tags"] = df["new_tags"].apply(ast.literal_eval)

# Combine description + source code + notes into a single text field per row;
# missing cells are treated as empty strings.
df["full_text"] = df["prob_desc_description"].fillna("") + " " + df["source_code"].fillna("") + " " + df['prob_desc_notes'].fillna("")


In [None]:
import os

from huggingface_hub import login

# SECURITY: a personal access token used to be hard-coded here. Anything that was
# committed in source must be considered leaked -- revoke it in the Hugging Face
# account settings and create a new one.
# The token is now read from the HF_TOKEN environment variable; when the variable
# is unset, `token=None` makes `login` ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hub checkpoint to evaluate. The commented-out line is an alternative checkpoint;
# swap the two assignments to evaluate it instead.
# model_name = "Mallard74/bert-xcode-tags-classification"
model_name = "Mallard74/codebert-xcode-tags-classification"
# Tokenizer and classification model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference mode (as the last expression of the cell, this also displays the model)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
def preprocess_text(row):
    """Build the model input string for one dataset row.

    Concatenates the problem description, the notes and the source code
    (in that order), separated by single spaces.

    Args:
        row: Mapping-like object exposing ``.get`` (a ``dict`` or a pandas
            ``Series``) with the keys ``prob_desc_description``,
            ``prob_desc_notes`` and ``source_code``. Keys may be absent.

    Returns:
        The concatenated text. Absent keys and missing values (``None``/NaN)
        contribute an empty string.
    """
    fields = ("prob_desc_description", "prob_desc_notes", "source_code")
    parts = []
    for field in fields:
        value = row.get(field, "")
        # Empty CSV cells are loaded as NaN; str(NaN) would inject the literal
        # word "nan" into the text, so map missing values to "" instead.
        parts.append("" if value is None or pd.isna(value) else str(value))
    return " ".join(parts)

# One concatenated input string per row (axis=1 passes each row to preprocess_text).
df["model_input"] = df.apply(preprocess_text, axis=1)

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class TagDataset(Dataset):
    """Map-style dataset that hands out raw text samples by position."""

    def __init__(self, texts):
        """Keep a reference to ``texts``; nothing is copied or tokenized here."""
        self.texts = texts

    def __len__(self):
        """Return the number of stored samples."""
        n_samples = len(self.texts)
        return n_samples

    def __getitem__(self, idx):
        """Return the sample stored at ``idx``, unchanged."""
        sample = self.texts[idx]
        return sample

def collate_fn(batch_texts):
    """Tokenize one batch of raw strings with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens, padded (``padding=True``)
    and returned as PyTorch tensors.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(batch_texts, **tokenizer_kwargs)

# Wrap the prepared input strings in a dataset and batch them; tokenization
# happens per batch inside collate_fn.
texts = df["model_input"].tolist()
dataset = TagDataset(texts)
dataloader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)  # lower batch_size if the runtime still crashes

In [None]:
# Show the checkpoint's index -> tag-name mapping; it is used further down to
# turn prediction column indices into tag names.
model.config.id2label

{0: 'games',
 1: 'geometry',
 2: 'graphs',
 3: 'math',
 4: 'number theory',
 5: 'probabilities',
 6: 'strings',
 7: 'trees'}

In [None]:
from tqdm import tqdm
# Use the GPU when the runtime provides one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Per-sample probability vectors, accumulated batch by batch in dataloader order.
all_preds = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(dataloader):
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Independent sigmoid per label; move to CPU/NumPy so results can be
        # collected in a plain Python list.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        all_preds.extend(probs)

100%|██████████| 1236/1236 [02:35<00:00,  7.97it/s]


In [None]:
# A tag is predicted for a sample when its probability is strictly above this cut-off.
threshold = 0.5
id2label = model.config.id2label

predicted_labels = []
for row in all_preds:
    row_tags = [id2label[i] for i, prob in enumerate(row) if prob > threshold]
    predicted_labels.append(row_tags)

In [None]:
# Evaluate only on the tags the checkpoint knows about (assumes the dataset uses
# the same tag spelling as the model's label names).
focus_tags = list(model.config.id2label.values())
mlb = MultiLabelBinarizer(classes=focus_tags)

# Drop every ground-truth tag outside focus_tags, then binarize to a 0/1 matrix.
filtered_true_tags = df["new_tags"].apply(
    lambda tags: [t for t in tags if t in focus_tags]
)
y_true = mlb.fit_transform(filtered_true_tags)

In [None]:
# Binarize the predicted tag lists with the same binarizer (same column order as y_true).
y_pred = mlb.transform(predicted_labels)

In [None]:
# Per-tag precision/recall/F1 plus the overall Hamming loss of the transformer predictions.
# zero_division=0 scores tags that are never predicted as 0 explicitly, matching the
# report call used in the grid search and not relying on warnings being silenced globally.
print(classification_report(y_true, y_pred, target_names=focus_tags, zero_division=0))
print(f"Hamming Loss: {hamming_loss(y_true, y_pred):.4f}")

               precision    recall  f1-score   support

        games       0.00      0.00      0.00       105
     geometry       0.69      0.05      0.10       166
       graphs       0.57      0.21      0.30       542
         math       0.38      0.85      0.53      1408
number theory       0.00      0.00      0.00       350
probabilities       0.00      0.00      0.00        92
      strings       0.58      0.76      0.66       422
        trees       0.84      0.29      0.43       324

    micro avg       0.43      0.51      0.47      3409
    macro avg       0.38      0.27      0.25      3409
 weighted avg       0.44      0.51      0.39      3409
  samples avg       0.34      0.30      0.31      3409

Hamming Loss: 0.0996


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

# Targets for the TF-IDF baseline: restrict every sample's tags to the eight
# focus tags and binarize them into a 0/1 indicator matrix.
focus_tags = [
    'games',
    'geometry',
    'graphs',
    'math',
    'number theory',
    'probabilities',
    'strings',
    'trees',
]
texts = df["model_input"]


def _keep_focus_tags(tags):
    """Return the tags of one sample that belong to ``focus_tags``, order preserved."""
    return [t for t in tags if t in focus_tags]


y_raw = df["new_tags"].apply(_keep_focus_tags)
mlb = MultiLabelBinarizer(classes=focus_tags)
y = mlb.fit_transform(y_raw)

# TF-IDF vectorizer settings to try (hand-picked, not a full cartesian grid).
tfidf_configs = [
    dict(max_features=20000, min_df=1, ngram_range=(1, 1)),
    dict(max_features=20000, min_df=3, ngram_range=(1, 2)),
    dict(max_features=20000, min_df=3, ngram_range=(1, 3)),
    dict(max_features=10000, min_df=1, ngram_range=(1, 1)),
    dict(max_features=10000, min_df=3, ngram_range=(1, 2)),
    dict(max_features=10000, min_df=3, ngram_range=(1, 3)),
    dict(max_features=50000, min_df=1, ngram_range=(1, 1)),
    dict(max_features=50000, min_df=3, ngram_range=(1, 2)),
]

# Logistic Regression settings to try. Entries without "solver"/"penalty" fall
# back to the estimator's defaults for those arguments.
lr_params = [
    dict(C=1.0, class_weight="balanced", solver="liblinear", max_iter=1000),
    dict(C=1.0, class_weight=None, solver="liblinear", max_iter=1000),
    dict(C=1.0, class_weight="balanced", solver="liblinear", max_iter=1000, penalty="l1"),
    dict(C=0.5, class_weight="balanced", solver="liblinear", max_iter=1000, penalty="l1"),
    dict(C=1.0, class_weight="balanced", max_iter=1000),
    dict(C=1.0, class_weight=None, max_iter=1000),
    dict(C=0.5, class_weight="balanced", max_iter=1000),
]


# === CV ===
# Score every (TF-IDF config, LR config) pair with 3-fold cross-validation and
# collect mean/std of weighted F1 and Hamming loss in `results`, in the same
# order as product(tfidf_configs, lr_params).
kf = KFold(n_splits=3, shuffle=True, random_state=42)
results = []

print("🔁 Starting grid search over TF-IDF + LR params...\n")

# With a fixed random_state the folds depend only on the number of samples, so
# compute them once and reuse the same splits for every configuration.
fold_indices = list(kf.split(texts))

with tqdm(total=len(tfidf_configs) * len(lr_params), desc="Grid Search") as progress:
    for tfidf_cfg in tfidf_configs:
        # Fit the vectorizer on the training part of each fold only. Fitting it
        # on the full corpus would leak validation vocabulary/IDF statistics
        # into training and make the CV scores optimistic.
        # The vectorized folds are reused for all LR configs of this TF-IDF config.
        fold_data = []
        for train_idx, val_idx in fold_indices:
            vectorizer = TfidfVectorizer(**tfidf_cfg)
            X_train = vectorizer.fit_transform(texts.iloc[train_idx])
            X_val = vectorizer.transform(texts.iloc[val_idx])
            fold_data.append((X_train, y[train_idx], X_val, y[val_idx]))

        # `lr_cfg` (not `lr_params`) so the list of LR configs is not overwritten
        # by the loop variable and the cell stays re-runnable.
        for lr_cfg in lr_params:
            f1s, hammings = [], []

            for X_train, y_train, X_val, y_val in fold_data:
                # Named `clf`/`y_val_pred` so the transformer `model` and its
                # `y_pred` from the earlier cells are not clobbered.
                clf = OneVsRestClassifier(LogisticRegression(**lr_cfg))
                clf.fit(X_train, y_train)
                y_val_pred = clf.predict(X_val)

                report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
                f1s.append(report["weighted avg"]["f1-score"])
                hammings.append(hamming_loss(y_val, y_val_pred))

            results.append({
                "tfidf_params": tfidf_cfg,
                "lr_params": lr_cfg,
                "f1_mean": np.mean(f1s),
                "f1_std": np.std(f1s),
                "hamming_mean": np.mean(hammings),
                "hamming_std": np.std(hammings),
            })
            progress.update(1)

# === SORT RESULTS ===
# Rank configurations by mean weighted F1 (best first) and show the winner.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="f1_mean", ascending=False)
best_config = results_df.iloc[0]
print("\n🏆 Best Config:\n", best_config)

🔁 Starting grid search over TF-IDF + LR params...



Grid Search: 100%|██████████| 56/56 [08:00<00:00,  8.58s/it]


🏆 Best Config:
 tfidf_params    {'max_features': 20000, 'min_df': 3, 'ngram_ra...
lr_params       {'C': 1.0, 'class_weight': 'balanced', 'max_it...
f1_mean                                                  0.573491
f1_std                                                    0.01437
hamming_mean                                             0.079969
hamming_std                                               0.00257
Name: 18, dtype: object





In [None]:
# Show the full Logistic Regression settings of the top-ranked configuration
# (the summary printed above truncates the dict).
results_df.iloc[0]['lr_params']

{'C': 1.0, 'class_weight': 'balanced', 'max_iter': 1000}