In [2]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from scipy.sparse import hstack
import os
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Paths
DATA_PATH = "../data/code_classification_dataset_v3.csv"
MODEL_DIR = "../outputs"
MODEL_FILE = os.path.join(MODEL_DIR, "model_description_only.pkl")
VECTORIZER_FILE = os.path.join(MODEL_DIR, "vectorizer_description_only.pkl")
LABEL_BINARIZER_FILE = os.path.join(MODEL_DIR, "label_binarizer.pkl")


In [4]:
# Ensure output directory exists
os.makedirs(MODEL_DIR, exist_ok=True)

In [5]:
# 1. Load data
df = pd.read_csv(DATA_PATH)
df["new_tags"] = df["new_tags"].apply(eval)  # Convert stringified list to list

# 2. Combine description + code
df["full_text"] = df["prob_desc_description"].fillna("") + " "  + df['prob_desc_notes'].fillna("") + " " + df["source_code"].fillna("")

# 3. Smart features to include
smart_features = [
    "difficulty", "time_limit_sec", "memory_limit_mb",
    "num_imports", "has_recursion", "uses_modulo",
    "uses_bitwise_ops", "uses_dfs_bfs",
    "has_graph_keywords", "has_dp_keywords", "has_equation"
]

# 2. Feature selection
numerical_features = [
    "difficulty", "time_limit_sec", "memory_limit_mb",
    "num_imports", "desc_length_words", "code_length_lines",
    "has_recursion", "uses_modulo", "uses_bitwise_ops",
    "uses_dfs_bfs", "has_graph_keywords", "has_dp_keywords", "has_equation"
]

# BASELINE

Considers just the problem description as feature

In [6]:
# 2. Prepare input/output
# X = df["prob_desc_description"].fillna("")
# X = df[numerical_features].fillna(0)
X = df["full_text"]

# X_text = df["full_text"]
# X_smart = df[numerical_features].fillna(0)
# y_raw = df["new_tags"]
# X = df["full_text"]
y_raw = df["new_tags"]
y_raw[0]

['geometry', 'brute force']

In [7]:
X[0]

'Iahub has drawn a set of n points in the cartesian plane which he calls "special points". A quadrilateral is a simple polygon without self-intersections with four sides (also called edges) and four vertices (also called corners). Please note that a quadrilateral doesn\'t have to be convex. A special quadrilateral is one which has all four vertices in the set of special points. Given the set of special points, please calculate the maximal area of a special quadrilateral.  NoteIn the test example we can choose first 4 points to be the vertices of the quadrilateral. They form a square by side 4, so the area is 4¬∑4\u2009=\u200916. # calculate convex of polygon v.\n# v is list of complexes stand for points.\ndef convex(v, eps=1e-8):\n\n    # fetch the seed point\n    v.sort(key=lambda x:(x.real,x.imag))\n    v = v[0:1] + sorted(v[1:], key=lambda x:(x-v[0]).imag/abs(x-v[0]))\n\n    n = 1\n    for i in range(2, len(v)):\n        while n > 1 and ((v[n]-v[n-1])*(v[i]-v[n]).conjugate()).imag>-

In [8]:
# 3. Binarize labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)
y[0]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [9]:
# 4. Train/test split

# 3. Scale numeric columns (only non-binary)
# numeric_cols = ["difficulty", "time_limit_sec", "memory_limit_mb", "num_imports", "desc_length_words", "code_length_lines"]
# X[numeric_cols] = StandardScaler().fit_transform(X[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_text_train, X_text_test, X_smart_train, X_smart_test, y_train, y_test = train_test_split(
#     X_text, X_smart, y, test_size=0.2, random_state=42
# )

In [8]:
# 5. TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 3), min_df=3)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# 6. TF-IDF vectorization
# vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
# X_text_train_tfidf = vectorizer.fit_transform(X_text_train)
# X_text_test_tfidf = vectorizer.transform(X_text_test)

# # # 7. Standardize smart features
# scaler = StandardScaler()
# numeric_cols = ["difficulty", "time_limit_sec", "memory_limit_mb", "num_imports", "desc_length_words", "code_length_lines"]
# X_smart_train[numeric_cols] = scaler.fit_transform(X_smart_train[numeric_cols])
# X_smart_test[numeric_cols] = scaler.transform(X_smart_test[numeric_cols])

# # # 8. Combine TF-IDF + smart features
# X_train_combined = hstack([X_text_train_tfidf, X_smart_train])
# X_test_combined = hstack([X_text_test_tfidf, X_smart_test])

In [10]:
# X_train_tfidf.shape, X_test_tfidf.shape, y_train.shape, y_test.shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3952,), (989,), (3952, 26), (989, 26))

In [9]:
# 6. Train model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
# model = OneVsRestClassifier(LGBMClassifier(n_estimators=200, random_state=42))
model.fit(X_train_tfidf, y_train)
# model.fit(X_train_combined, y_train)
# model.fit(X_train, y_train)

In [10]:
# 7. Evaluate
y_pred = model.predict(X_test_tfidf)
# y_pred = model.predict(X_test_combined)
# y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

                         precision    recall  f1-score   support

          binary search       0.30      0.31      0.31       118
               bitmasks       0.38      0.51      0.44        43
            brute force       0.28      0.38      0.33       156
          combinatorics       0.47      0.47      0.47        57
constructive algorithms       0.40      0.53      0.46       219
        data structures       0.37      0.56      0.45       128
        dfs and similar       0.58      0.71      0.64       112
     divide and conquer       0.12      0.24      0.16        17
                     dp       0.50      0.49      0.49       176
                    dsu       0.22      0.42      0.29        31
                  games       0.55      0.81      0.65        21
               geometry       0.56      0.68      0.61        34
                 graphs       0.59      0.65      0.61       133
                 greedy       0.56      0.61      0.59       354
                hashing 

In [11]:
focus_tags = [
    'math', 'graphs', 'strings', 'number theory',
    'trees', 'geometry', 'games', 'probabilities'
]

# 1. Filter to focus tag indices
focus_indices = [i for i, tag in enumerate(mlb.classes_) if tag in focus_tags]

# 2. Slice true and predicted labels
y_true_focus = y_test[:, focus_indices]
y_pred_focus = y_pred[:, focus_indices]

# 3. Classification report for focus tags
print("üìä Classification Report ‚Äî Focus Tags Only:")
print(classification_report(y_true_focus, y_pred_focus, target_names=focus_tags))

# 4. Hamming Loss (all tags ‚Äî global view)
hloss = hamming_loss(y_test, y_pred)
print(f"\n‚ö†Ô∏è Hamming Loss (All Tags): {hloss:.4f}")

üìä Classification Report ‚Äî Focus Tags Only:
               precision    recall  f1-score   support

         math       0.55      0.81      0.65        21
       graphs       0.56      0.68      0.61        34
      strings       0.59      0.65      0.61       133
number theory       0.51      0.60      0.55       267
        trees       0.51      0.66      0.58        65
     geometry       0.71      0.55      0.62        22
        games       0.61      0.78      0.68        87
probabilities       0.49      0.75      0.59        59

    micro avg       0.54      0.66      0.59       688
    macro avg       0.56      0.68      0.61       688
 weighted avg       0.54      0.66      0.59       688
  samples avg       0.34      0.36      0.33       688


‚ö†Ô∏è Hamming Loss (All Tags): 0.1229


# Improvements on BASELINE

## Using Semantic Embedding

In [20]:
df["text_for_embedding"] = (
    df["prob_desc_description"].fillna("") + " " +
    df["prob_desc_notes"].fillna("") + " " #+
    # df["source_code"].fillna("")
)

In [21]:
from sentence_transformers import SentenceTransformer

model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

X_embed = embedder.encode(df["text_for_embedding"].tolist(), show_progress_bar=True)

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:25<00:00,  6.00it/s]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_embed, y, test_size=0.2, random_state=42)

model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight="balanced"))
model.fit(X_train, y_train)

In [23]:
# 7. Evaluate
# y_pred = model.predict(X_test_tfidf)
# y_pred = model.predict(X_test_combined)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

                         precision    recall  f1-score   support

          binary search       0.19      0.57      0.28       118
               bitmasks       0.15      0.53      0.24        43
            brute force       0.19      0.49      0.27       156
          combinatorics       0.13      0.49      0.20        57
constructive algorithms       0.36      0.58      0.44       219
        data structures       0.21      0.60      0.31       128
        dfs and similar       0.34      0.63      0.44       112
     divide and conquer       0.07      0.71      0.13        17
                     dp       0.26      0.56      0.35       176
                    dsu       0.08      0.52      0.14        31
                  games       0.30      0.81      0.44        21
               geometry       0.27      0.82      0.41        34
                 graphs       0.38      0.63      0.48       133
                 greedy       0.50      0.67      0.57       354
                hashing 

In [24]:
focus_tags = [
    'math', 'graphs', 'strings', 'number theory',
    'trees', 'geometry', 'games', 'probabilities'
]

# 1. Filter to focus tag indices
focus_indices = [i for i, tag in enumerate(mlb.classes_) if tag in focus_tags]

# 2. Slice true and predicted labels
y_true_focus = y_test[:, focus_indices]
y_pred_focus = y_pred[:, focus_indices]

# 3. Classification report for focus tags
print("üìä Classification Report ‚Äî Focus Tags Only:")
print(classification_report(y_true_focus, y_pred_focus, target_names=focus_tags))

# 4. Hamming Loss (all tags ‚Äî global view)
hloss = hamming_loss(y_test, y_pred)
print(f"\n‚ö†Ô∏è Hamming Loss (All Tags): {hloss:.4f}")

üìä Classification Report ‚Äî Focus Tags Only:
               precision    recall  f1-score   support

         math       0.30      0.81      0.44        21
       graphs       0.27      0.82      0.41        34
      strings       0.38      0.63      0.48       133
number theory       0.44      0.67      0.53       267
        trees       0.23      0.58      0.33        65
     geometry       0.19      0.73      0.30        22
        games       0.54      0.82      0.65        87
probabilities       0.39      0.75      0.51        59

    micro avg       0.37      0.69      0.48       688
    macro avg       0.34      0.73      0.46       688
 weighted avg       0.40      0.69      0.50       688
  samples avg       0.31      0.38      0.32       688


‚ö†Ô∏è Hamming Loss (All Tags): 0.2208


### Trying CodeBERT

In [6]:
from huggingface_hub import login
login(token="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")

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "Mallard74/bert-xcode-tags-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference mode

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
def preprocess_text(row):
    return (
        str(row.get("prob_desc_description", "")) + " " +
        str(row.get("prob_desc_notes", "")) + " " +
        str(row.get("source_code", ""))
    )

df["model_input"] = df.apply(preprocess_text, axis=1)

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch

class TagDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def collate_fn(batch_texts):
    return tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

texts = df["model_input"].tolist()
dataset = TagDataset(texts)
dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn)  # reduce batch_size if still crashing

In [11]:
model.config.id2label

{0: 'games',
 1: 'geometry',
 2: 'graphs',
 3: 'math',
 4: 'number theory',
 5: 'probabilities',
 6: 'strings',
 7: 'trees'}

In [12]:
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run batched inference
all_preds = []

with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        all_preds.extend(probs)

  0%|          | 0/4941 [00:00<?, ?it/s]

: 

In [None]:
all_preds