# Import libraries

In [128]:
import os
import json
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib

from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

# Import Data

In [129]:
data_list=[]

folder_path='code_classification_dataset'
for filename in os.listdir(folder_path):
    filepath=os.path.join(folder_path, filename)
    with open(filepath, 'r') as f:
        data=json.load(f)
        data_list.append(data)

df=pd.DataFrame(data_list)
df.head()

Unnamed: 0,prob_desc_time_limit,prob_desc_sample_outputs,src_uid,prob_desc_notes,prob_desc_description,prob_desc_output_spec,prob_desc_input_spec,prob_desc_output_to,prob_desc_input_from,lang,...,difficulty,file_name,code_uid,prob_desc_memory_limit,prob_desc_sample_inputs,exec_outcome,source_code,prob_desc_created_at,tags,hidden_unit_tests
0,1 second,"[""16.000000""]",bb3fc45f903588baf131016bea175a9f,NoteIn the test example we can choose first 4 ...,Iahub has drawn a set of n points in the carte...,Output a single real number — the maximal area...,The first line contains integer n (4 ≤ n ≤ 300...,standard output,standard input,Python 3,...,2100.0,train_048.jsonl,b9da6adb7ff5f00a9127e39974a16d61,256 megabytes,"[""5\n0 0\n0 4\n4 0\n4 4\n2 3""]",PASSED,# calculate convex of polygon v.\n# v is list ...,1377876600,"[geometry, brute force]",
1,2 seconds,"[""1"", ""-1"", ""2"", ""-1"", ""0""]",7898b8258297a6cde8fecb1079172e10,"NoteIn the first test, here is an example plac...",A monopole magnet is a magnet that only has on...,"Output a single integer, the minimum possible ...",The first line contains two integers $$$n$$$ a...,standard output,standard input,PyPy 2,...,2000.0,train_007.jsonl,09b419a2ea7207df6e9f6f972fa83d11,256 megabytes,"[""3 3\n.#.\n###\n##."", ""4 2\n##\n.#\n.#\n##"", ...",PASSED,\n\nfrom math import factorial as fac\nfrom co...,1588775700,"[dsu, constructive algorithms, dfs and similar]",
2,1 second,"[""14"", ""9"", ""3""]",7b12845f668e28b7f18019d5ab5eaec7,NoteIn the first example the optimal sequence ...,Polycarp has prepared $$$n$$$ competitive prog...,Print one integer — the maximum number of prob...,The first line of the input contains one integ...,standard output,standard input,PyPy 3,...,1800.0,train_037.jsonl,99ac0b5c4a37e839f43173d671845752,256 megabytes,"[""18\n2 1 2 10 2 10 10 2 2 1 10 10 10 10 1 1 1...",PASSED,from collections import *\nfrom math import *\...,1542378900,"[sortings, greedy]",
3,4 seconds,"[""3"", ""0"", ""21""]",3875486b0178bf04b3463a15f36f8169,NoteIn the first example three operations are ...,Monocarp has arranged $$$n$$$ colored marbles ...,Print the minimum number of operations Monocar...,The first line contains one integer $$$n$$$ $$...,standard output,standard input,PyPy 3,...,2200.0,train_077.jsonl,13d679e5837ff7ae9285a59d62b8c9a4,256 megabytes,"[""7\n3 4 2 3 4 2 2"", ""5\n20 1 14 10 2"", ""13\n5...",PASSED,# Num of the element\nimport sys\ninput = sys....,1568543700,"[dp, bitmasks]",
4,1 second,"[""3 abc\n2 bc\n1 c\n0 \n1 d"", ""18 abbcd...tw\n...",7d6faccc88a6839822fa0c0ec8c00251,NoteConsider the first example. The longest s...,Some time ago Lesha found an entertaining stri...,In $$$|s|$$$ lines print the lengths of the an...,The only line contains the string $$$s$$$ ($$$...,standard output,standard input,Python 3,...,2700.0,train_011.jsonl,00750c7d58b33d3ac03785c4a39a4ff9,256 megabytes,"[""abcdd"", ""abbcdddeaaffdfouurtytwoo""]",PASSED,s = input().strip();N = len(s)\nif len(s) == 1...,1601827500,"[dp, implementation, greedy, strings]",


# Preprocessing

In [130]:
# Delete all tags we aren't trying to predict
tags_to_keep=['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
df['tags']=df['tags'].apply(lambda taglist: [tag for tag in taglist if tag in tags_to_keep])

# Multilabel binarization
mlb=MultiLabelBinarizer(classes=tags_to_keep)
y=pd.DataFrame(mlb.fit_transform(df['tags']), columns=mlb.classes_)

# Putting all the text in lowercase
df["prob_desc_description"].str.lower()
df["source_code"].str.lower()

# Deleting rows with no value or -1 in 'difficulty'
valid_rows=df['difficulty'].notna()&(df['difficulty']!=-1)

df=df[valid_rows].reset_index(drop=True)
y=y[valid_rows].reset_index(drop=True)

In [131]:
# Identification of the examples with no tags
no_tag_mask=(y.sum(axis=1)==0)

# Undersampling examples with none of the eight tags (to reduce class imbalance)
no_tag_df=df[no_tag_mask].sample(frac=0.4, random_state=42)
no_tag_y=y[no_tag_mask].loc[no_tag_df.index]

with_tag_df=df[~no_tag_mask]
with_tag_y=y[~no_tag_mask]

# Concatenation of the dataframes with and without tags
df=pd.concat([with_tag_df, no_tag_df])
y=pd.concat([with_tag_y, no_tag_y])

# Shuffling of the rows so we don't have all no tag rows at the end
shuffled_idx=df.sample(frac=1, random_state=42).index
df=df.loc[shuffled_idx].reset_index(drop=True)
y=y.loc[shuffled_idx].reset_index(drop=True)

In [132]:
# Vectorization of prob_desc_description with TF-IDF
tfidf_desc=TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_desc=tfidf_desc.fit_transform(df['prob_desc_description'])

# Vectorization of source_code with TF-IDF
tfidf_code=TfidfVectorizer(max_features=10000, ngram_range=(1,2), token_pattern=r'\b\w+\b')
X_code=tfidf_code.fit_transform(df['source_code'])

# Adding difficulty column
X_difficulty=df['difficulty'].values.reshape(-1,1)
scaler=StandardScaler()
X_difficulty=scaler.fit_transform(X_difficulty)

# Concatenation of the three matrices
X=hstack([X_desc, X_code, X_difficulty])

# Splitting training and testing dataset

In [133]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training

In [134]:
# Formatting for XGBoost
X_train_dense=X_train.toarray()
X_test_dense=X_test.toarray()

# Computing scale_pos_weights parameter for class imbalance
tag_count={}
for tag in y.columns:
    tag_count[tag]=y[tag].sum()
scale_pos_weights={tag: (y.shape[0] - count) / count for tag, count in tag_count.items()}

# Training XGBoost on every tag
models={}
y_pred_proba=np.zeros(y_test.shape)

for i, tag in enumerate(y.columns):
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_estimators=250,
        max_depth=6,
        learning_rate=0.1,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weights[tag] 
    )
    model.fit(X_train_dense, y_train.iloc[:, i])
    y_pred_proba[:, i]=model.predict_proba(X_test_dense)[:, 1]
    models[tag]=model


# Finding optimal threshold 

In [135]:
best_f1_score=0
best_threshold=0
for threshold in np.arange(0.2,0.8,0.05):
    y_pred=(y_pred_proba>=threshold).astype(int)
    f1_micro=f1_score(y_test, y_pred, average='micro')
    if f1_micro>=best_f1_score:
        best_f1_score=f1_micro
        best_threshold=threshold

print(f"Best threshold: {best_threshold:.2f} | F1-score micro: {best_f1_score:.2f}")

Best threshold: 0.30 | F1-score micro: 0.69


# Computing metrics for the model

In [136]:
for i,tag in enumerate(y.columns):  
    y_pred_tag=(y_pred_proba[:, i]>=best_threshold).astype(int)
    f1=f1_score(y_test.iloc[:, i], y_pred_tag)
    print(f"F1-score for tag {tag}: {f1:.2f}")

roc_auc_micro=roc_auc_score(y_test, y_pred_proba, average='micro')
print(f"ROC AUC micro: {roc_auc_micro:.2f}")
print(f"F1-score micro: {best_f1_score:.2f}")

F1-score for tag math: 0.70
F1-score for tag graphs: 0.67
F1-score for tag strings: 0.72
F1-score for tag number theory: 0.64
F1-score for tag trees: 0.69
F1-score for tag geometry: 0.63
F1-score for tag games: 0.84
F1-score for tag probabilities: 0.48
ROC AUC micro: 0.94
F1-score micro: 0.69


# Saving models

In [139]:
# Saving vectorizers and scaler
joblib.dump(tfidf_desc, 'saved_models/tfidf_desc.joblib')
joblib.dump(tfidf_code, 'saved_models/tfidf_code.joblib')
joblib.dump(scaler, 'saved_models/scaler.joblib')

# Saving XGBoost models 
for tag, model in models.items():
    joblib.dump(model, f'saved_models/xgb_model_{tag}.joblib')
