In [1]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [6]:
LABELS_TEST_PATH = '../data/DM2023_train_labels.tsv'
data = pd.read_csv(LABELS_TEST_PATH, sep='\t', header=None, names=['id', 'target', 'similar'], nrows=2000)

data['target'] = data['target'].apply(ast.literal_eval)
data['similar'] = data['similar'].apply(ast.literal_eval)

data.head()

Unnamed: 0,id,target,similar
0,1416989.txt,"[K.4.3, I.2.9, I.6.5, J.1]","[[I.2], [J.1], [K.5.2, J.4, K.4.m], [J.1, H.3...."
1,580106.txt,"[H.3.5, D.3.2, I.7.2]","[[D.3.4, D.3.2, H.3.5, I.7.2], [D.3.2, H.3.4, ..."
2,1516665.txt,"[H.2.8, J.3]","[[I.6.5, I.2.4, I.5.3, J.3], [H.3.5, H.3.4, J...."
3,873827.txt,"[J.1, I.2.8, I.6.3]","[[F.2.2, J.1, J.6], [I.2.8, F.2.2, I.6.5], [F...."
4,1755942.txt,[D.4.6],"[[K.6.5, C.2.0, C.5.3, H.4.3], [D.4.6], [D.2.1..."


In [7]:
all_similar_codes = [code for sublist in data['similar'].tolist() for item in sublist for code in item]
all_target_codes = [code for sublist in data['target'].tolist() for code in sublist]

all_codes = list(set(all_similar_codes + all_target_codes))

mlb = MultiLabelBinarizer(classes=all_codes)

similar_encoded = mlb.fit_transform(data['similar'].apply(lambda x: [code for sublist in x for code in sublist]))

target_encoded = mlb.transform(data['target'])

X = pd.DataFrame(similar_encoded, columns=mlb.classes_)
y = pd.DataFrame(target_encoded, columns=mlb.classes_)

X.head(), y.head()

(   I.2  G.2.3  K.6.0  B.0  K.8.0  B.1.4  I.6.m  B.4  I.4.1  D.1.4  ...  H.5.m  \
 0    1      0      0    0      0      0      0    0      0      0  ...      0   
 1    0      0      1    0      0      0      0    0      0      0  ...      0   
 2    0      0      0    0      0      0      0    0      0      0  ...      0   
 3    0      0      0    0      0      0      0    0      0      0  ...      0   
 4    0      0      0    0      0      0      0    0      0      0  ...      0   
 
    H.3.3  C.2.m  F.2.1  G.1.1  B.8.m  D.4.2  I.6.7  I.2.10  I.5.2  
 0      1      0      0      0      0      0      0       0      0  
 1      0      1      0      0      0      0      0       0      0  
 2      1      0      0      0      0      0      0       0      0  
 3      0      0      1      0      0      0      0       0      0  
 4      1      0      0      0      0      1      0       0      0  
 
 [5 rows x 354 columns],
    I.2  G.2.3  K.6.0  B.0  K.8.0  B.1.4  I.6.m  B.4  I.4.1  D.1.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1600, 354), (400, 354), (1600, 354), (400, 354))

In [10]:
pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])

# Define hyperparameters to search
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Randomized search for hyperparameter optimization
clf = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=5, verbose=2, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Hamming Loss: {hamming}')
# clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# hamming = hamming_loss(y_test, y_pred)

# print(f'Accuracy: {accuracy}')
# print(f'Hamming Loss: {hamming}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, clf__n_estimators=100; total time=  10.2s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, clf__n_estimators=100; total time=  10.4s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, clf__n_estimators=100; total time=  10.6s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, clf__n_estimators=100; total time=  10.7s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, clf__n_estimators=100; total time=  11.4s
[CV] END clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=100; total time=  16.3s
[CV] END clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=100; total time=  16.6s
[CV] END clf__max_depth=None, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_



[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=200; total time=  27.0s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=200; total time=  26.3s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, clf__n_estimators=200; total time=  27.2s
[CV] END clf__max_depth=None, clf__min_samples_leaf=2, clf__min_samples_split=2, clf__n_estimators=200; total time=  30.3s
[CV] END clf__max_depth=None, clf__min_samples_leaf=2, clf__min_samples_split=2, clf__n_estimators=200; total time=  31.3s
[CV] END clf__max_depth=None, clf__min_samples_leaf=2, clf__min_samples_split=2, clf__n_estimators=200; total time=  32.0s
[CV] END clf__max_depth=None, clf__min_samples_leaf=2, clf__min_samples_split=2, clf__n_estimators=200; total time=  29.3s
[CV] END clf__max_depth=20, clf__min_samples_leaf=2, clf__min_samples_split=10, clf__n_estimators=200; total time=  29.0s
[CV] END clf__max_depth