In [5]:
import pandas as pd
import numpy as np

from dsgd.DSClassifierMultiQ import DSClassifierMultiQ

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


In [6]:
df = pd.read_csv("https://huggingface.co/datasets/furrutiav/sac_nllf/raw/main/train.csv")
df = df.drop(columns=['index', "b1(N)", "b2(N)", "b3(N)", "b4(N)", "b5(N)", "b6(N)", "b7(N)", "b8(N)", "b9(N)", "b10(N)", "b11(N)", "b12(N)", "b13(N)"])
print("Dataframe shape: ", df.shape)
df.head()
test = pd.read_csv("https://huggingface.co/datasets/furrutiav/sac_nllf/raw/main/test.csv")
test = test.drop(columns=['index', "b1(N)", "b2(N)", "b3(N)", "b4(N)", "b5(N)", "b6(N)", "b7(N)", "b8(N)", "b9(N)", "b10(N)", "b11(N)", "b12(N)", "b13(N)"])
print("Dataframe shape: ", test.shape)

Dataframe shape:  (1400, 14)
Dataframe shape:  (400, 14)


In [7]:
y = df['label']
X = df.drop(columns=['label'])
y = y.to_numpy()
X = X.to_numpy()    
y_test = test['label']
X_test = test.drop(columns=['label'])
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [8]:
DSCs = {}
time = {}
accuracys = {}
learning_rates = [0.0001,]
batch_sizes = [2000, 3000, 4000]
min_dl = [1e-7, 1e-6, 1e-5,]
num_rules = [3, 5, 7,]
df_results = pd.DataFrame(columns=['lr', 'bs', 'mdl', 'nr', 'accuracy', 'f1_micro', 'f1_macro'])
results = []
for lr in learning_rates:
    for bs in batch_sizes:
        for mdl in min_dl:
            for nr in num_rules:
                DSC = DSClassifierMultiQ(2, min_iter=20, max_iter=500, debug_mode=True, lr=lr, batch_size=bs,
                                lossfn="MSE", num_workers=1, min_dloss=mdl, precompute_rules=True)
                print(f"Training DSC with lr={lr}, bs={bs}, mdl={mdl}, nr={nr}")
                losses, epoch, dt = DSC.fit(X, y, add_single_rules=True,
                            single_rules_breaks=nr, add_mult_rules=False,
                                column_names=df.columns[:-1], print_every_epochs=1, print_final_model=False)
                y_pred = DSC.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')
                print(f"Accuracy: {acc}, F1 micro: {f1_micro}, F1 macro: {f1_macro}")
                new_row = pd.DataFrame({
                    'lr': [lr], 'bs': [bs], 'mdl': [mdl], 'nr': [nr],
                    'accuracy': [acc], 'f1_micro': [f1_micro], 'f1_macro': [f1_macro]
                })
                df_results = pd.concat([df_results, new_row])
                results.append({
                    'lr': lr, 'bs': bs, 'mdl': mdl, 'nr': nr,
                    'accuracy': acc, 'f1_micro': f1_micro, 'f1_macro': f1_macro
                })
                

df_results.to_csv("results_pos.csv", index=False)

Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=3
Optimization started
Processing epoch	500	0.2377	
Training time: 1213.97s, epochs: 500

Least training loss reached: 0.238
Accuracy: 0.62, F1 micro: 0.62, F1 macro: 0.6190476190476191
Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=5
Optimization started
Processing epoch	1	1.0000	

  df_results = pd.concat([df_results, new_row])


Processing epoch	500	0.2319	
Training time: 1177.36s, epochs: 500

Least training loss reached: 0.232
Accuracy: 0.6175, F1 micro: 0.6175, F1 macro: 0.6125969298939706
Training DSC with lr=0.0001, bs=2000, mdl=1e-07, nr=7
Optimization started
Processing epoch	500	0.2227	
Training time: 1202.16s, epochs: 500

Least training loss reached: 0.223
Accuracy: 0.675, F1 micro: 0.675, F1 macro: 0.6713769306605324
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=3
Optimization started
Processing epoch	500	0.2289	
Training time: 1203.10s, epochs: 500

Least training loss reached: 0.229
Accuracy: 0.64, F1 micro: 0.64, F1 macro: 0.6382273138378052
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=5
Optimization started
Processing epoch	500	0.2186	
Training time: 1208.55s, epochs: 500

Least training loss reached: 0.219
Accuracy: 0.655, F1 micro: 0.655, F1 macro: 0.6544471153846154
Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=7
Optimization started
Processing epoch	500	0.2187	
Training ti

In [9]:
df_results = pd.read_csv('results_pos.csv')
df_results = df_results.sort_values(by='accuracy', ascending=False)
df_results.head(5)

Unnamed: 0,lr,bs,mdl,nr,accuracy,f1_micro,f1_macro
5,0.0001,2000,1e-06,7,0.6975,0.6975,0.694597
8,0.0001,2000,1e-05,7,0.6875,0.6875,0.68717
20,0.0001,4000,1e-07,7,0.6775,0.6775,0.670328
26,0.0001,4000,1e-05,7,0.675,0.675,0.672346
2,0.0001,2000,1e-07,7,0.675,0.675,0.671377


In [12]:
# corremos los 5 mejores modelos y vemos las reglas que generan
dscs = []
for i in range(5):
    lr = df_results.iloc[i]['lr']
    bs = int(df_results.iloc[i]['bs'])
    mdl = df_results.iloc[i]['mdl']
    nr = int(df_results.iloc[i]['nr'])
    DSC = DSClassifierMultiQ(2, min_iter=20, max_iter=500, debug_mode=True, lr=lr, batch_size=bs,
                    lossfn="MSE", num_workers=1, min_dloss=mdl, precompute_rules=True)
    dscs.append(DSC)
    print(f"Training DSC with lr={lr}, bs={bs}, mdl={mdl}, nr={nr}")
    losses, epoch, dt = DSC.fit(X, y, add_single_rules=True,
                single_rules_breaks=nr, add_mult_rules=False,
                    column_names=df.columns[:-1], print_every_epochs=1, print_final_model=False)
    y_pred = DSC.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print(f"Accuracy: {acc}, F1 micro: {f1_micro}, F1 macro: {f1_macro}")
    print("Rules:")
    DSC.print_most_important_rules(classes=["0",  "1"], threshold=0.19)

Training DSC with lr=0.0001, bs=2000, mdl=1e-06, nr=7
Optimization started
Processing epoch	500	0.2222	
Training time: 1167.56s, epochs: 500

Least training loss reached: 0.222
Accuracy: 0.67, F1 micro: 0.67, F1 macro: 0.6691729323308271
Rules:


Most important rules for class 0

	[0.233] R54: 0.827 < b8(Y) < 1.022
			0: 0.233	1: 0.000	Unc: 0.767

	[0.224] R73: 0.255 < b3(Y) < 0.438
			0: 0.224	1: 0.000	Unc: 0.776

	[0.218] R52: 0.552 < b8(Y) < 0.682
			0: 0.218	1: 0.000	Unc: 0.782

	[0.211] R0: b2(Y) < 0.599
			0: 0.211	1: 0.000	Unc: 0.789

	[0.201] R51: 0.421 < b8(Y) < 0.552
			0: 0.201	1: 0.000	Unc: 0.799

	[0.196] R63: b4(Y) > 1.061
			0: 0.192	1: 0.008	Unc: 0.800

	[0.195] R19: 0.718 < b6(Y) < 0.824
			0: 0.192	1: 0.008	Unc: 0.801

	[0.193] R99: 0.169 < b13(Y) < 0.299
			0: 0.187	1: 0.013	Unc: 0.801

	[0.192] R98: 0.023 < b13(Y) < 0.169
			0: 0.185	1: 0.015	Unc: 0.801

Most important rules for class 1

	[0.200] R87: b12(Y) > 0.694
			0: 0.000	1: 0.200	Unc: 0.800

	[0.195] R91: 0.1

In [14]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
rf.fit(X, y)
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {acc}, F1 micro: {f1_micro}, F1 macro: {f1_macro}")

Accuracy: 0.67, F1 micro: 0.67, F1 macro: 0.6669946265042004
