In [1]:
import pandas as pd
import os
import sklearn
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.svm import SVC
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from tensorflow.python.keras.callbacks import EarlyStopping

In [2]:
# Both input tables are read from the sibling "datasets" directory.
datasets_dir = os.path.join("..", "datasets")
embeddings_df = pd.read_csv(os.path.join(datasets_dir, "embeddings.csv"))
accounts_df = pd.read_csv(os.path.join(datasets_dir, "accounts_processed.csv"))

In [3]:
# Lower-case the account usernames (presumably so they line up with the
# `username` keys of the embeddings table used in the merge further down).
# `.str.lower()` is vectorised and leaves missing values as NaN instead of
# raising AttributeError the way `apply(lambda s: s.lower())` would.
accounts_df['username'] = accounts_df['username'].str.lower()

# The preview must be the last expression of the cell to be rendered, and it
# should show the normalised data rather than the pre-normalisation state.
accounts_df.head(5)

In [4]:
# Preview the first rows of the embeddings table (head() defaults to five rows).
embeddings_df.head()

Unnamed: 0,username,embedding
0,achybicka,[-4.93041947e-02 -5.91295818e-03 -8.47795755e-...
1,ac_sobol,[-8.40702951e-02 6.32756064e-03 -9.36329067e-...
2,adambielan,[-6.17371537e-02 -1.24293072e-02 -7.66848996e-...
3,adamgaweda,[ 2.38768775e-02 1.51185095e-02 -8.82171020e-...
4,adamowiczpawel,[-4.28744927e-02 1.10798636e-02 -9.27639380e-...


In [5]:
# Store the merge key as a plain object column before joining on it.
embeddings_df['username'] = embeddings_df['username'].astype("object")

In [6]:
# Inner join: keep only users present in both the embeddings and accounts tables.
full_df = embeddings_df.merge(accounts_df, how='inner', on='username')

In [7]:
# Non-null cell count per column of the embeddings table.
embeddings_df.count(axis=0)

username     548
embedding    548
dtype: int64

In [8]:
# Non-null cell count per column after the merge; comparing it with the
# embeddings count shows whether the inner join dropped any rows.
full_df.count(axis=0)


username         548
embedding        548
Unnamed: 0       548
pozycja          548
coalition        548
party            548
name             548
link do konta    548
tweets_count     548
dtype: int64

In [11]:
# Keep only the feature column, the user key and the three label columns.
training_columns = ['embedding', 'username', 'pozycja', 'coalition', 'party']
df_to_training = full_df[training_columns]

In [12]:
# Class balance of the coalition label (missing values excluded, the default).
df_to_training['coalition'].value_counts(dropna=True)

Zjednoczona Prawica    236
KO                     177
Lewica                  54
niez.                   37
PSL-Kukiz15             33
Konfederacja            11
Name: coalition, dtype: int64

In [13]:
# Class balance of the party label (missing values excluded, the default).
df_to_training['party'].value_counts(dropna=True)

PiS                             199
Platforma Obywatelska           149
niez.                            55
SLD                              29
PSL                              26
Solidarna Polska                 17
Wiosna                           17
Porozumienie                     16
Nowoczesna                        7
Kukiz15                           6
Razem                             6
KORWiN                            4
Ruch Narodowy                     4
Partia Zieloni                    3
Bezpartyjni Samorządowcy          3
Konfederacja                      2
Inicjatywa Polska                 2
Polska Partia Socjalistyczna      1
Konfederacja Korony Polskiej      1
Teraz!                            1
Name: party, dtype: int64

In [14]:
# Class balance of the position ("pozycja") label (missing values excluded, the default).
# NOTE(review): the recorded output lists "Polska 2050" among the positions,
# which looks like a party name rather than an office - confirm the source data.
df_to_training['pozycja'].value_counts(dropna=True)

Sejm                     373
Senat                     49
Europoseł                 46
Prezydent miasta          45
Marszałek Województwa     11
Wojewoda                  11
Polska 2050               10
Prezydent Polski           2
Premier                    1
Name: pozycja, dtype: int64

In [None]:
# Number of float components parsed from each stored embedding string
# (passed as `count` to np.fromstring when the features are decoded).
embedding_size = 768

In [None]:
def _fit_one_hot(label_series):
    """Prepare one label column for training.

    Returns a 4-tuple: the labels as a NumPy array, the number of distinct
    values in the column, a OneHotEncoder fitted on those labels, and the
    dense one-hot matrix produced by that encoder.
    """
    n_classes = len(label_series.unique())
    values = label_series.to_numpy()
    encoder = OneHotEncoder(handle_unknown='ignore')
    # The encoder expects a 2-D column; its sparse result is densified.
    one_hot = encoder.fit_transform(values.reshape(-1, 1)).toarray()
    return values, n_classes, encoder, one_hot


(party_labels,
 parties_number_of_classes,
 party_encoder,
 party_encoded) = _fit_one_hot(df_to_training['party'])

(coalition_labels,
 coalitions_number_of_classes,
 coalition_encoder,
 coalition_encoded) = _fit_one_hot(df_to_training['coalition'])

(position_labels,
 positions_number_of_classes,
 position_encoder,
 position_encoded) = _fit_one_hot(df_to_training['pozycja'])


In [None]:
def _parse_embedding(raw):
    """Turn one stored "[v1 v2 ...]" string into a float vector of `embedding_size` values."""
    # Drop the surrounding brackets and any embedded line breaks before parsing.
    inner = raw[1:-1].replace("\n", "")
    return np.fromstring(inner, count=embedding_size, sep=" ")


# One row per user, one column per embedding component.
features = np.array([_parse_embedding(raw) for raw in df_to_training['embedding']])

In [None]:
# Two alternative rescalings of the raw embeddings; the fitted scalers are
# kept so the same transformation can be re-applied later.
# NOTE(review): both scalers are fitted on the full feature matrix, i.e. before
# the cross-validation split, so validation folds contribute to the scaling
# statistics - confirm this is acceptable for the reported scores.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

minmax_scaler = MinMaxScaler()
features_min_max = minmax_scaler.fit_transform(features)

In [None]:
def train_model_kfold(sklearn_model, splits: int, features_arr, labels, labels_encoder):
    """Cross-validate a classifier and report its mean macro F1 scores.

    The data is split with a shuffled StratifiedKFold (random_state=42). For
    every fold the model is re-fitted on the training part and scored with the
    macro-averaged F1 on both the training and the validation part.

    Parameters:
        sklearn_model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        splits: number of folds.
        features_arr: array of samples, indexable by an integer index array.
        labels: 1-D array of raw labels, aligned with ``features_arr``.
        labels_encoder: fitted encoder whose ``transform`` output supports
            ``.toarray()``; the position of the largest entry in each encoded
            row is used as the integer class id.

    Returns:
        Tuple ``(mean training F1, mean validation F1)`` over all folds.
    """

    def _class_ids(raw_labels):
        # One-hot encode, then collapse each row to the index of its hot column.
        one_hot = labels_encoder.transform(raw_labels.reshape(-1, 1)).toarray()
        return np.argmax(one_hot, axis=1)

    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    fold_scores = []

    for fit_idx, holdout_idx in folds.split(features_arr, labels):
        x_fit, x_holdout = features_arr[fit_idx], features_arr[holdout_idx]
        y_fit = _class_ids(labels[fit_idx])
        y_holdout = _class_ids(labels[holdout_idx])

        sklearn_model.fit(x_fit, y_fit)

        fit_score = f1_score(y_fit, sklearn_model.predict(x_fit), average='macro')
        holdout_score = f1_score(y_holdout, sklearn_model.predict(x_holdout), average='macro')
        fold_scores.append((fit_score, holdout_score))

    train_part = [pair[0] for pair in fold_scores]
    val_part = [pair[1] for pair in fold_scores]
    return np.mean(train_part), np.mean(val_part)

In [None]:
# Directory (relative to the notebook's parent folder) that receives the result CSVs.
PATH_TO_RESULTS = os.path.join(os.pardir, "reports", "classification")

In [None]:
# Grid search: logistic regression over every label set, feature scaling,
# regularisation strength C and stopping tolerance, scored with 10-fold CV.

# Fail fast: make sure the output directory exists *before* the long sweep,
# otherwise the final to_csv() would raise and all results would be lost.
os.makedirs(PATH_TO_RESULTS, exist_ok=True)

label_sets = [(coalition_labels, coalition_encoder, "coalitions"),
              (party_labels, party_encoder, "parties"),
              (position_labels, position_encoder, "positions")]
feature_sets = [("scaled", features_scaled),
                ("original", features),
                ("minmax", features_min_max)]

# One dict per evaluated configuration; turned into a DataFrame once at the end.
records = []

for labels_arr, encoder, labels_name in label_sets:
    for name, feature in feature_sets:
        for c in [0.1, 0.25, 0.5, 0.75, 1.0]:
            for tol in [1e-4, 1e-3, 1e-2, 1e-1]:
                lr_model = LogisticRegression(penalty='l2', max_iter=100000, C=c, tol=tol)
                mean_train_f1_score, mean_val_f1_score = train_model_kfold(lr_model, 10, feature, labels_arr, encoder)
                print(f"Training F1-score: {mean_train_f1_score}, validation F1-score: {mean_val_f1_score},"
                      f"features - {name}, labels name - {labels_name}, C={c}, tol={tol}")
                records.append({"label_name": labels_name,
                                "feature_type": name,
                                "C": c,
                                "tol": tol,
                                "train_f1_score": mean_train_f1_score,
                                "val_f1_score": mean_val_f1_score})

# Explicit column list pins the CSV layout regardless of dict ordering.
results = pd.DataFrame(records,
                       columns=["label_name", "feature_type", "C", "tol",
                                "train_f1_score", "val_f1_score"])

results.to_csv(
    os.path.join(PATH_TO_RESULTS, "logistic_regression.csv"),
    index=False)


In [None]:
# Grid search: decision trees over every label set, feature scaling, split
# criterion, min_samples_split and min_samples_leaf, scored with 10-fold CV.

# Fail fast: make sure the output directory exists *before* the long sweep,
# otherwise the final to_csv() would raise and all results would be lost.
os.makedirs(PATH_TO_RESULTS, exist_ok=True)

label_sets = [(coalition_labels, coalition_encoder, "coalitions"),
              (party_labels, party_encoder, "parties"),
              (position_labels, position_encoder, "positions")]
feature_sets = [("scaled", features_scaled),
                ("original", features),
                ("minmax", features_min_max)]

# One dict per evaluated configuration; turned into a DataFrame once at the end.
records = []

for labels_arr, encoder, labels_name in label_sets:
    for name, feature in feature_sets:
        for crit in ['gini', 'entropy']:
            for mss in range(2, 20):
                for msl in range(2, 20):
                    # A fixed random_state makes the tree's tie-breaking between
                    # equally good splits repeatable, so the saved scores can be
                    # reproduced on a re-run.
                    dec_tree = DecisionTreeClassifier(criterion=crit,
                                                      min_samples_leaf=msl,
                                                      min_samples_split=mss,
                                                      random_state=42)
                    mean_train_f1_score, mean_val_f1_score = train_model_kfold(dec_tree, 10, feature, labels_arr, encoder)
                    print(f"Training F1-score: {mean_train_f1_score}, validation F1-score: {mean_val_f1_score},"
                          f"features - {name}, labels name - {labels_name}, criterion={crit}, mss={mss}, msl={msl}")
                    records.append({"label_name": labels_name,
                                    "feature_type": name,
                                    "min_samples_leaf": msl,
                                    "min_samples_split": mss,
                                    "criterion": crit,
                                    "train_f1_score": mean_train_f1_score,
                                    "val_f1_score": mean_val_f1_score})

# Explicit column list pins the CSV layout regardless of dict ordering.
results = pd.DataFrame(records,
                       columns=["label_name", "feature_type", "min_samples_leaf",
                                "min_samples_split", "criterion",
                                "train_f1_score", "val_f1_score"])

results.to_csv(
    os.path.join(PATH_TO_RESULTS, "decision_tree.csv"),
    index=False)

In [None]:
# Grid search: support vector classifiers over every label set, feature
# scaling, C, stopping tolerance and kernel, scored with 10-fold CV.

# Fail fast: make sure the output directory exists *before* the long sweep,
# otherwise the final to_csv() would raise and all results would be lost.
os.makedirs(PATH_TO_RESULTS, exist_ok=True)

label_sets = [(coalition_labels, coalition_encoder, "coalitions"),
              (party_labels, party_encoder, "parties"),
              (position_labels, position_encoder, "positions")]
feature_sets = [("scaled", features_scaled),
                ("original", features),
                ("minmax", features_min_max)]

# One dict per evaluated configuration; turned into a DataFrame once at the end.
records = []

for labels_arr, encoder, labels_name in label_sets:
    for name, feature in feature_sets:
        for c in [0.1, 0.25, 0.5, 0.75, 1.0]:
            for tol in [1e-4, 1e-3, 1e-2, 1e-1]:
                for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
                    svm = SVC(kernel=kernel, C=c, tol=tol)
                    mean_train_f1_score, mean_val_f1_score = train_model_kfold(svm, 10, feature, labels_arr, encoder)
                    print(f"Training F1-score: {mean_train_f1_score}, validation F1-score: {mean_val_f1_score},"
                          f"features - {name}, labels name - {labels_name}, C={c}, tol={tol}, kernel={kernel}")
                    records.append({"label_name": labels_name,
                                    "feature_type": name,
                                    "kernel": kernel,
                                    "C": c,
                                    "tol": tol,
                                    "train_f1_score": mean_train_f1_score,
                                    "val_f1_score": mean_val_f1_score})

# Explicit column list pins the CSV layout regardless of dict ordering.
results = pd.DataFrame(records,
                       columns=["label_name", "feature_type", "kernel", "C", "tol",
                                "train_f1_score", "val_f1_score"])

results.to_csv(
    os.path.join(PATH_TO_RESULTS, "svm.csv"),
    index=False)

In [None]:
# Grid search: k-nearest-neighbours over every label set, feature scaling,
# neighbour count, weighting scheme and distance metric, scored with 10-fold CV.

# Fail fast: make sure the output directory exists *before* the long sweep,
# otherwise the final to_csv() would raise and all results would be lost.
os.makedirs(PATH_TO_RESULTS, exist_ok=True)

label_sets = [(coalition_labels, coalition_encoder, "coalitions"),
              (party_labels, party_encoder, "parties"),
              (position_labels, position_encoder, "positions")]
feature_sets = [("scaled", features_scaled),
                ("original", features),
                ("minmax", features_min_max)]

# One dict per evaluated configuration; turned into a DataFrame once at the end.
records = []

for labels_arr, encoder, labels_name in label_sets:
    for name, feature in feature_sets:
        for neighbors in [5, 10, 20, 30, 40, 50]:
            for weights in ['uniform', 'distance']:
                # NOTE(review): with the classifier's default p=2, 'minkowski'
                # is the same distance as 'euclidean', so those two grid
                # entries should score identically. Kept so the CSV layout
                # and row count stay unchanged.
                for distance in ['euclidean', 'manhattan', 'chebyshev', 'minkowski']:
                    knn = KNeighborsClassifier(n_neighbors=neighbors, weights=weights, metric=distance)
                    mean_train_f1_score, mean_val_f1_score = train_model_kfold(knn, 10, feature, labels_arr, encoder)
                    print(f"Training F1-score: {mean_train_f1_score}, validation F1-score: {mean_val_f1_score},"
                          f"features - {name}, labels name - {labels_name}, n_neighbours={neighbors}, weights={weights}, distance={distance}")
                    records.append({"label_name": labels_name,
                                    "feature_type": name,
                                    "distance": distance,
                                    "n_neighbours": neighbors,
                                    "weights": weights,
                                    "train_f1_score": mean_train_f1_score,
                                    "val_f1_score": mean_val_f1_score})

# Explicit column list pins the CSV layout regardless of dict ordering.
results = pd.DataFrame(records,
                       columns=["label_name", "feature_type", "distance",
                                "n_neighbours", "weights",
                                "train_f1_score", "val_f1_score"])

results.to_csv(
    os.path.join(PATH_TO_RESULTS, "knn.csv"),
    index=False)