In [1]:
import pandas as pd
import numpy as np

from dsgd.DSClassifierMultiQ import DSClassifierMultiQ

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


In [2]:
df = pd.read_csv("https://huggingface.co/datasets/furrutiav/sac_nllf/raw/main/train.csv")
df = df.drop(columns=['index'])
print("Dataframe shape: ", df.shape)
df.head()
test = pd.read_csv("https://huggingface.co/datasets/furrutiav/sac_nllf/raw/main/test.csv")
test = test.drop(columns=['index'])
print("Dataframe shape: ", test.shape)

Dataframe shape:  (1400, 27)
Dataframe shape:  (400, 27)


In [3]:
y = df['label']
X = df.drop(columns=['label'])
y = y.to_numpy()
X = X.to_numpy()    
y_test = test['label']
X_test = test.drop(columns=['label'])
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [4]:
DSCs = {}
time = {}
accuracys = {}
learning_rates = [0.0001, 0.001,]
batch_sizes = [2000, 3000, 4000]
min_dl = [1e-7, 1e-6, 1e-5,]
num_rules = [3, 5, 7,]
df_results = pd.DataFrame(columns=['lr', 'bs', 'mdl', 'nr', 'accuracy', 'f1_micro', 'f1_macro'])
results = []
for lr in learning_rates:
    for bs in batch_sizes:
        for mdl in min_dl:
            for nr in num_rules:
                DSC = DSClassifierMultiQ(2, min_iter=20, max_iter=500, debug_mode=True, lr=lr, batch_size=bs,
                                lossfn="MSE", num_workers=1, min_dloss=mdl, precompute_rules=True)
                print(f"Training DSC with lr={lr}, bs={bs}, mdl={mdl}, nr={nr}")
                losses, epoch, dt = DSC.fit(X, y, add_single_rules=True,
                            single_rules_breaks=nr, add_mult_rules=False,
                                column_names=df.columns[:-1], print_every_epochs=1, print_final_model=False)
                y_pred = DSC.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')
                print(f"Accuracy: {acc}, F1 micro: {f1_micro}, F1 macro: {f1_macro}")
                
                new_row = pd.DataFrame({
                    'lr': [lr], 'bs': [bs], 'mdl': [mdl], 'nr': [nr],
                    'accuracy': [acc], 'f1_micro': [f1_micro], 'f1_macro': [f1_macro]
                })
                df_results = pd.concat([df_results, new_row])
                

df_results.to_csv("results2.csv", index=False)

Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=3
Optimization started
Processing epoch	500	0.2204	
Training time: 1219.24s, epochs: 500

Least training loss reached: 0.220
Accuracy: 0.665, F1 micro: 0.665, F1 macro: 0.664320248503219
Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=5
Optimization started
Processing epoch	1	1.0000	

  df_results = pd.concat([df_results, new_row])


Processing epoch	500	0.2086	
Training time: 1176.80s, epochs: 500

Least training loss reached: 0.209
Accuracy: 0.64, F1 micro: 0.64, F1 macro: 0.6395584591124127
Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=7
Optimization started
Processing epoch	500	0.2081	
Training time: 1178.64s, epochs: 500

Least training loss reached: 0.208
Accuracy: 0.655, F1 micro: 0.655, F1 macro: 0.6539532084555781
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=3
Optimization started
Processing epoch	500	0.2186	
Training time: 1178.71s, epochs: 500

Least training loss reached: 0.219
Accuracy: 0.665, F1 micro: 0.665, F1 macro: 0.664698228405565
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=5
Optimization started
Processing epoch	500	0.2118	
Training time: 1179.86s, epochs: 500

Least training loss reached: 0.212
Accuracy: 0.66, F1 micro: 0.66, F1 macro: 0.6591478696741855
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=7
Optimization started
Processing epoch	500	0.2179	
Training time: 1

In [5]:
df_results = pd.read_csv('results2.csv')
df_results = df_results.sort_values(by='accuracy', ascending=False)
df_results.head(5)

Unnamed: 0,lr,bs,mdl,nr,accuracy,f1_micro,f1_macro
17,0.0001,3000,1e-05,7,0.675,0.675,0.672635
7,0.0001,2000,1e-05,5,0.6725,0.6725,0.671907
24,0.0001,4000,1e-05,3,0.67,0.67,0.669173
15,0.0001,3000,1e-05,3,0.67,0.67,0.666321
0,0.0001,2000,1e-07,3,0.665,0.665,0.66432


In [8]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
rf.fit(X, y)
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {acc}, F1 micro: {f1_micro}, F1 macro: {f1_macro}")

Accuracy: 0.675, F1 micro: 0.675, F1 macro: 0.6710193339406822
