In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the embeddings and accounts CSV files from the shared ../datasets folder.
datasets_dir = os.path.join("..", "datasets")
embeddings_df = pd.read_csv(os.path.join(datasets_dir, "embeddings.csv"))
accounts_df = pd.read_csv(os.path.join(datasets_dir, "accounts_processed.csv"))

In [None]:
# Lower-case the usernames; this column is the key of the merge performed later.
# `.str.lower()` is vectorised and keeps missing values as NaN, whereas
# `apply(lambda s: s.lower())` raises AttributeError on a NaN username.
accounts_df['username'] = accounts_df['username'].str.lower()

# The preview goes last: a notebook cell only renders its final expression,
# so a `head()` placed before the assignment is never displayed.
accounts_df.head(5)

In [None]:
# Preview the first five rows of the embeddings table.
embeddings_df.head(n=5)

In [None]:
# Store the merge key with the generic object dtype.
# NOTE(review): usernames in accounts_df are lower-cased before the merge but these
# are not — confirm the embeddings file already uses lower-case usernames.
username_column = embeddings_df['username']
embeddings_df['username'] = username_column.astype("object")

In [None]:
# Keep only the usernames present in both tables (inner join on `username`).
full_df = embeddings_df.merge(accounts_df, how='inner', on='username')

In [None]:
# Non-null count per column of the embeddings table (to compare with the merged table).
embeddings_df.count(axis=0)

In [None]:
# Non-null count per column of the merged table.
full_df.count(axis=0)


In [None]:
# List only the usernames that appear more than once in the merged table.
# Comparing the whole value_counts() result with 1 prints a True/False flag for
# every username, which hides the few duplicates inside a long, truncated output.
username_counts = full_df['username'].value_counts()
username_counts[username_counts > 1]


In [None]:
# Show every merged row belonging to the username 'z_frankiewicz'.
is_inspected_user = full_df['username'].eq('z_frankiewicz')
full_df.loc[is_inspected_user]


In [None]:
# Index label of the row that is removed from `full_df` in a later cell.
# NOTE(review): presumably one of the rows shown for 'z_frankiewicz' above — confirm
# the label still points at the intended row whenever the input CSVs change.
index_to_be_deleted: int = 545

In [None]:
# Remove the row whose index label is stored in `index_to_be_deleted` instead of
# repeating the literal here. `errors="ignore"` keeps the cell re-runnable: once the
# row is gone, running the cell again is a no-op rather than a KeyError.
full_df = full_df.drop(index=index_to_be_deleted, errors="ignore")


In [None]:
# Non-null count per column after the row removal.
full_df.count(axis=0)

In [None]:
# Keep the embedding, the username and the three label columns used below.
training_columns = ['embedding', 'username', 'pozycja', 'coalition', 'party']
df_to_training = full_df[training_columns]

In [None]:
# Class distribution of the `coalition` label.
coalition_column = df_to_training['coalition']
coalition_column.value_counts()

In [None]:
# Class distribution of the `party` label.
party_column = df_to_training['party']
party_column.value_counts()

In [None]:
# Class distribution of the `pozycja` label.
position_column = df_to_training['pozycja']
position_column.value_counts()

In [None]:
# Number of components expected in each embedding vector; used when the
# serialized `embedding` column is parsed into a numeric array.
embedding_size: int = 768

In [None]:
def _encode_label_column(column):
    """Prepare one label column for training.

    Returns a tuple of:
      - the number of distinct values in the column,
      - the labels as a NumPy array,
      - a OneHotEncoder fitted on those labels,
      - the dense one-hot matrix produced by that encoder.
    """
    class_count = len(column.unique())
    label_array = column.to_numpy()
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
    # The encoder expects a 2-D input, hence the single-column reshape.
    one_hot_matrix = one_hot_encoder.fit_transform(label_array.reshape(-1, 1)).toarray()
    return class_count, label_array, one_hot_encoder, one_hot_matrix


(parties_number_of_classes,
 party_labels,
 party_encoder,
 party_encoded) = _encode_label_column(df_to_training['party'])

(coalitions_number_of_classes,
 coalition_labels,
 coalition_encoder,
 coalition_encoded) = _encode_label_column(df_to_training['coalition'])

(positions_number_of_classes,
 position_labels,
 position_encoder,
 position_encoded) = _encode_label_column(df_to_training['pozycja'])



In [None]:
def parse_embedding(text, expected_size):
    """Convert one serialized embedding string into a 1-D float array.

    Parameters
    ----------
    text : str
        Whitespace-separated numbers wrapped in one leading and one trailing
        character (presumably '[' and ']' — confirm against the CSV).
    expected_size : int
        Number of values the vector must contain.

    Returns
    -------
    numpy.ndarray
        Array of shape (expected_size,).

    Raises
    ------
    ValueError
        If the string does not contain exactly `expected_size` values.
    """
    # Drop the first/last character and the line breaks, exactly as before.
    payload = text[1:-1].replace("\n", "")
    # Read every value instead of passing `count=`: with a fixed count a row that
    # holds the wrong number of values cannot be detected after parsing.
    vector = np.fromstring(payload, sep=" ")
    if vector.size != expected_size:
        raise ValueError(
            f"Expected {expected_size} values in embedding, got {vector.size}")
    return vector


# One row per account, one column per embedding component.
features = np.array([parse_embedding(embedding, embedding_size)
                     for embedding in df_to_training['embedding']])

In [None]:
# Two rescaled variants of the raw feature matrix; each scaler object stays fitted
# on `features` so it can be reused afterwards.
# NOTE(review): both scalers are fitted on all rows before the cross-validated
# evaluation below, so validation rows presumably influence the scaling — confirm
# this is acceptable.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

minmax_scaler = MinMaxScaler()
features_min_max = minmax_scaler.fit_transform(features)

In [None]:
# Exhaustive KNN grid search: every label set x feature scaling x hyper-parameter
# combination is evaluated with `train_model_kfold` and the mean train/validation
# F1-scores are collected into one table that is written to knn.csv.
# NOTE(review): `train_model_kfold` and `PATH_TO_RESULTS` are not defined in any cell
# visible in this review — they must exist before this cell is run.
label_sets = [(coalition_labels, coalition_encoder, "coalitions"),
              (party_labels, party_encoder, "parties"),
              (position_labels, position_encoder, "positions")]
feature_sets = [("scaled", features_scaled),
                ("original", features),
                ("minmax", features_min_max)]
neighbor_options = [5, 10, 20, 30, 40, 50]
weight_options = ['uniform', 'distance']
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the same
# metric as 'euclidean', so those two settings should yield identical scores.
distance_options = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

# Column order of the saved CSV.
result_columns = ["label_name", "feature_type", "distance", "n_neighbours",
                  "weights", "train_f1_score", "val_f1_score"]

# One dict per evaluated configuration; turned into a DataFrame once at the end.
records = []
for labels, encoder, labels_name in label_sets:
    for name, feature_matrix in feature_sets:
        for neighbors in neighbor_options:
            for weights in weight_options:
                for distance in distance_options:
                    knn = KNeighborsClassifier(n_neighbors=neighbors, weights=weights, metric=distance)
                    mean_train_f1_score, mean_val_f1_score = train_model_kfold(
                        knn, 10, feature_matrix, labels, encoder)
                    print(f"Training F1-score: {mean_train_f1_score}, validation F1-score: {mean_val_f1_score},"
                          f"features - {name}, labels name - {labels_name}, n_neighbours={neighbors}, weights={weights}, distance={distance}")
                    records.append({"label_name": labels_name,
                                    "feature_type": name,
                                    "distance": distance,
                                    "n_neighbours": neighbors,
                                    "weights": weights,
                                    "train_f1_score": mean_train_f1_score,
                                    "val_f1_score": mean_val_f1_score})

results = pd.DataFrame(records, columns=result_columns)

results.to_csv(
    os.path.join(PATH_TO_RESULTS, "knn.csv"),
    index=False)