In [48]:
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
csv_root_path = "../datasets/intermediate-features/"

In [56]:
def calculate_scores(targets, predictions):
    accuracy = accuracy_score(targets, predictions)
    print(f"Accuracy: {accuracy}")
    balanced_accuracy = balanced_accuracy_score(targets, predictions)
    print(f"Balanced accuracy: {balanced_accuracy}")
    f1_weighted = f1_score(targets, predictions, average="weighted")
    print(f"F1-weighted: {f1_weighted}")
    f1_micro = f1_score(targets, predictions, average="micro")
    print(f"F1-micro: {f1_micro}")
    f1_macro = f1_score(targets, predictions, average="macro")
    print(f"F1-macro: {f1_macro}")
    # TODO: add to log file

In [5]:
csv_paths = os.listdir(path=csv_root_path)
df_comparison = pd.DataFrame()

for i, path in enumerate(csv_paths):
    df_full = pd.read_csv(os.path.join(csv_root_path, path), index_col=0)
    df_comparison[f"target_{i}"] = df_full["target"]
    df_comparison[f"set_{i}"] = df_full["set"]

In [6]:
df_comparison[df_comparison.columns[0::2]].T.duplicated()  # verify if split is the same
# first row must be False, all other True

target_0     False
target_1      True
target_2      True
target_3      True
target_4      True
target_5      True
target_6      True
target_7      True
target_8      True
target_9      True
target_10     True
target_11     True
target_12     True
target_13     True
target_14     True
target_15     True
dtype: bool

In [7]:
df_comparison[df_comparison.columns[1::2]].T.duplicated()  # verify if split is the same
# first row must be False, all other True

set_0     False
set_1      True
set_2      True
set_3      True
set_4      True
set_5      True
set_6      True
set_7      True
set_8      True
set_9      True
set_10     True
set_11     True
set_12     True
set_13     True
set_14     True
set_15     True
dtype: bool

In [9]:
csv_path = os.path.join(csv_root_path, csv_paths[0])

df_full = pd.read_csv(csv_path, index_col=0)
df_train = df_full[df_full["set"] == "train"]
df_valid = df_full[df_full["set"] == "valid"]
df_test = df_full[df_full["set"] == "test"]

In [27]:
df_train_features = df_train.loc[:, ~df_train.columns.isin(["target", "set"])]
df_valid_features = df_valid.loc[:, ~df_valid.columns.isin(["target", "set"])]
df_test_features = df_test.loc[:, ~df_test.columns.isin(["target", "set"])]

scaler = StandardScaler()
train_features = scaler.fit_transform(df_train_features)
valid_features = scaler.transform(df_valid_features)
test_features = scaler.transform(df_test_features)

In [40]:
train_targets = df_train["target"].to_numpy()
valid_targets = df_valid["target"].to_numpy()
test_targets = df_test["target"].to_numpy()

In [57]:
model_lr = LogisticRegression(max_iter=10000)
model_lr.fit(train_features, train_targets)

valid_pred = model_lr.predict(valid_features)
calculate_scores(valid_targets, valid_pred)
print("---")
test_pred = model_lr.predict(test_features)
calculate_scores(test_targets, test_pred)

Accuracy: 0.49707602339181284
Balanced accuracy: 0.18841269841269842
F1-weighted: 0.47049228480976374
F1-micro: 0.49707602339181284
F1-macro: 0.18185394182561992
---
Accuracy: 0.4941860465116279
Balanced accuracy: 0.19741223202289854
F1-weighted: 0.47738929595412566
F1-micro: 0.4941860465116279
F1-macro: 0.18899783705839945


In [None]:
def calculate_scores(targets, predictions):
    accuracy = accuracy_score(targets, predictions)
    print(f"Accuracy: {accuracy}")
    balanced_accuracy = balanced_accuracy_score(targets, predictions)
    print(f"Balanced accuracy: {balanced_accuracy}")
    f1_weighted = f1_score(targets, predictions, average="weighted")
    print(f"F1-weighted: {f1_weighted}")
    f1_micro = f1_score(targets, predictions, average="micro")
    print(f"F1-micro: {f1_micro}")
    f1_macro = f1_score(targets, predictions, average="macro")
    print(f"F1-macro: {f1_macro}")
    # TODO: add to log file

In [69]:
scores = {}
max_neighbors = 10
for k in range(1, max_neighbors):
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(train_features, train_targets)
    valid_pred = model_knn.predict(valid_features)
    score = f1_score(df_valid_targets, valid_pred, average="weighted")
    scores[k] = score
    print(f"Score with {i} neighbors: {score}")

Score with 15 neighbors: 0.525850771507081
Score with 15 neighbors: 0.49388110677755936
Score with 15 neighbors: 0.47750100441944543
Score with 15 neighbors: 0.4862665435518621
Score with 15 neighbors: 0.48095075916434227
Score with 15 neighbors: 0.46612882571391984
Score with 15 neighbors: 0.4672006102212051
Score with 15 neighbors: 0.4672006102212051
Score with 15 neighbors: 0.4672006102212051


In [74]:
best_k = max(scores, key=scores.get)
model_knn = KNeighborsClassifier(n_neighbors=best_k)
model_knn.fit(train_features, train_targets)
test_pred = model_knn.predict(test_features)
calculate_scores(test_targets, test_pred)

Accuracy: 0.6046511627906976
Balanced accuracy: 0.2996058403500427
F1-weighted: 0.5548930992384169
F1-micro: 0.6046511627906976
F1-macro: 0.3106667970652645
