## About this notebook:
A manually annotated dataset of 100 positive and 100 negative examples for the hypothesis "This text is racist", taken from the VNN forum, is used to train Logistic Regression and Random Forest models.

---

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from umap import UMAP
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib

In [None]:
# Directory holding the annotated data (the trained models are saved here as well).
data_folder = Path('/mnt/c/Yose/Data/vnn_data/active_learning/')

# Tab-separated file with the labelled chunks; its first column becomes the index.
labeled_tsv = data_folder / 'df_labeled_racism.tsv'
df = pd.read_csv(labeled_tsv, sep='\t', index_col=0)

### Read in relevant columns from df into X (embeddings) and y (labels) -> train|val|test split - 60|20|20

In [None]:
# Features: the (string-encoded) chunk embeddings; target: the annotated label column.
X = df['chunk_embedding']
y = df['racist_text']
X.shape, y.shape

In [None]:
# Turn each embedding from its string form into a float vector and stack the vectors
# into one 2-D array (one row per text chunk). The first and last character of each
# string are dropped before parsing (presumably the surrounding brackets).
# Reading from df instead of from X keeps the cell idempotent: re-running it no longer
# fails because X has already been replaced by an array.
X = np.array([np.fromstring(emb[1:-1], sep=' ') for emb in df['chunk_embedding']])

# A ragged or mis-parsed embedding would not produce a clean (n_samples, n_dims) matrix.
assert X.ndim == 2 and X.shape[0] == len(df), f'unexpected embedding matrix shape: {X.shape}'

In [None]:
# 60|20|20 split: hold out 20 % for testing, then take 25 % of the remaining 80 %
# (= 20 % of all data) for validation.
# Stratify on the label so that each of the small splits keeps the class balance.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)

X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

---
# Visualization

In [None]:
# Project the embeddings down to three dimensions for plotting.
umap_params = {
    'n_neighbors': 20,
    'n_components': 3,
    'min_dist': 0.05,
    'metric': 'cosine',
    'random_state': 42,  # UMAP is stochastic; fix the seed so the layout is reproducible
}

X_umap = UMAP(**umap_params).fit_transform(X)

In [None]:
# Store the three UMAP coordinates as columns so plotly can address them by name.
for axis_pos, axis_col in enumerate(('umap_x', 'umap_y', 'umap_z')):
    df[axis_col] = X_umap[:, axis_pos]

In [None]:
# Add the row index to the hover tooltip and hide the raw UMAP coordinates.
hover_fields = {"index": df.index, "umap_x": False, "umap_y": False, "umap_z": False}

# 3-D scatter of the projected embeddings, coloured by the annotated label.
fig = px.scatter_3d(
    df,
    x="umap_x", y="umap_y", z="umap_z",
    color="racist_text",
    title="200 annotated embeddings with hypotesis 'This text is racist'",
    hover_data=hover_fields,
    width=1000, height=500,
)
fig.show()

---
## Train Logistic regression model
Embeddings are already normalized and do not need scaling.

In [None]:
# Hyperparameter tuning of the elastic-net mix ('l1_ratio') and the inverse
# regularisation strength ('C') with 5-fold cross-validation on the training split,
# scored by F1.
param_grid = {"l1_ratio": [0, 0.1, 0.5, 1], "C": [0.01, 0.05, 0.08, 0.1, 0.12, 0.15, 0.3]}
# The 'saga' solver shuffles the data, so it is seeded to make the search reproducible.
model_lr = GridSearchCV(
    LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="f1",
)
model_lr.fit(X_train, y_train)
model_lr.best_params_

# Output: {'C': 0.1, 'l1_ratio': 0}

In [None]:
# Train the model with the chosen parameters: l1_ratio = 0 (<=> penalty = 'l2'), C = 0.1,
# then report accuracy on the train and validation splits.
# random_state seeds the data shuffling done by 'saga' so the scores are reproducible.

model_lr = LogisticRegression(penalty='l2', solver='saga', max_iter=10000, C=0.1, random_state=42)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_val)

print(f'Score train: {100 * model_lr.score(X_train, y_train):.2f} %')
print(f'Score val: {100 * model_lr.score(X_val, y_val):.2f} %')

# Output example: Score train: 97.50 %, Score val: 95.00 %

In [None]:
# Retrain on train+val with the parameters from model tuning and evaluate on the test data.
# random_state seeds the data shuffling done by 'saga' so the scores are reproducible.

model_lr = LogisticRegression(penalty='l2', solver='saga', max_iter=10000, C=0.1, random_state=42)
model_lr.fit(X_trainval, y_trainval)
y_pred_lr = model_lr.predict(X_test)

print(f'Score trainval: {100 * model_lr.score(X_trainval, y_trainval):.2f} %')
print(f'Score test: {100 * model_lr.score(X_test, y_test):.2f} %')

# Output example: Score trainval: 94.38 %, Score test: 95.00 %

### Evaluate Logistic Regression model

In [None]:
# Per-class precision / recall / F1 for the test-set predictions
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

# Confusion matrix for the same predictions, drawn as a plot
cm = confusion_matrix(y_test, y_pred_lr)
ConfusionMatrixDisplay(cm).plot();

In [None]:
# Collect the index labels of the test rows where the prediction disagrees with the
# label, split into false positives (label 0, predicted 1) and false negatives
# (label 1, predicted 0), and print the corresponding text chunks.

pd.set_option('display.max_colwidth', None)  # do not truncate the text column

# array -> Series so the predictions share y_test's index
y_pred_lr_series = pd.Series(y_pred_lr, index=y_test.index)

# Boolean masks replace the per-label `y_test[index]` lookups: plain [] on a Series is
# label-based or positional depending on the index type, whereas masks are unambiguous.
false_pos = y_test.index[(y_test == 0) & (y_pred_lr_series == 1)].tolist()
false_neg = y_test.index[(y_test == 1) & (y_pred_lr_series == 0)].tolist()

print(f"False positives: \n{df.loc[false_pos, ['article_id', 'chunk_index', 'text_chunk', 'racist_text']]}")
print(f"False negatives: \n{df.loc[false_neg, ['article_id', 'chunk_index', 'text_chunk', 'racist_text']]}")


### Train LogReg model on all data and save

In [None]:
# Train the final model on all annotated data (it is saved in the next cell).
# random_state seeds the data shuffling done by 'saga' so the saved model is reproducible.
model_lr = LogisticRegression(penalty='l2', solver='saga', max_iter=10000, C=0.1, random_state=42)
model_lr.fit(X, y);

In [None]:
# Persist the fitted Logistic Regression model next to the data.
lr_model_path = data_folder / 'model_supervised_lr.pkl'
joblib.dump(model_lr, lr_model_path)

---
## Train Random Forest model
Random Forest does not require scaling

In [None]:
# Tune the number of trees and the maximum tree depth with 5-fold cross-validation
# on the training split, scored by F1.
param_grid = {"n_estimators": [100, 120, 150, 200], "max_depth": [2, 3, 4, 5]}
# Random forests are stochastic (bootstrap samples, feature sub-sampling); seed the
# estimator so the search result is reproducible.
model_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=1,
    scoring="f1",
)
model_rf.fit(X_train, y_train)
model_rf.best_params_

# Output: {'max_depth': 5, 'n_estimators': 120}

In [None]:
# To reduce the risk of overfitting, the chosen parameters use a lower max_depth and a
# higher n_estimators than the GridSearchCV result.
# Report accuracy on the train and validation splits; random_state makes the forest
# (and therefore the scores) reproducible.
model_rf = RandomForestClassifier(n_estimators=150, max_depth=3, random_state=42)
model_rf.fit(X_train, y_train)
y1_pred_rf = model_rf.predict(X_val)  # validation-set predictions

print(f'Score train: {100 * model_rf.score(X_train, y_train):.2f} %')
print(f'Score val: {100 * model_rf.score(X_val, y_val):.2f} %')

# Output example: Score train: 100.00 %, Score val: 95.00 %

In [None]:
# Retrain on train+val with the parameters from model tuning and evaluate on the test data.
# random_state makes the forest (and therefore the scores) reproducible.
model_rf = RandomForestClassifier(n_estimators=150, max_depth=3, random_state=42)
model_rf.fit(X_trainval, y_trainval)
y_pred_rf = model_rf.predict(X_test)

print(f'Score trainval: {100 * model_rf.score(X_trainval, y_trainval):.2f} %')
print(f'Score test: {100 * model_rf.score(X_test, y_test):.2f} %')

# Output example: Score trainval: 100.00 %, Score test: 95.00 %

### Evaluate Random Forest model

In [None]:
# Per-class precision / recall / F1 for the test-set predictions
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

# Confusion matrix for the same predictions, drawn as a plot
cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(cm).plot();

In [None]:
# Collect the index labels of the test rows where the prediction disagrees with the
# label, split into false positives (label 0, predicted 1) and false negatives
# (label 1, predicted 0), and print the corresponding text chunks.

pd.set_option('display.max_colwidth', None)  # do not truncate the text column

# array -> Series so the predictions share y_test's index
y_pred_rf_series = pd.Series(y_pred_rf, index=y_test.index)

# Boolean masks replace the per-label `y_test[index]` lookups: plain [] on a Series is
# label-based or positional depending on the index type, whereas masks are unambiguous.
false_pos = y_test.index[(y_test == 0) & (y_pred_rf_series == 1)].tolist()
false_neg = y_test.index[(y_test == 1) & (y_pred_rf_series == 0)].tolist()

print(f"False positives: \n{df.loc[false_pos, ['article_id', 'chunk_index', 'text_chunk', 'racist_text']]}")
print(f"False negatives: \n{df.loc[false_neg, ['article_id', 'chunk_index', 'text_chunk', 'racist_text']]}")

### Train Random Forest model on all data and save

In [None]:
# Train the final Random Forest on all annotated data (it is saved in the next cell).
# random_state makes the saved model reproducible across runs.
model_rf = RandomForestClassifier(n_estimators=150, max_depth=3, random_state=42)
model_rf.fit(X, y);

In [None]:
# Persist the fitted Random Forest model next to the data.
rf_model_path = data_folder / 'model_supervised_rf.pkl'
joblib.dump(model_rf, rf_model_path)