# Imports

In [467]:
import pandas as pd
import numpy as np
import optuna
import json
import pickle
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

# Load data

In [468]:
data = pd.read_csv('data/Ethos_Dataset_Binary.csv', on_bad_lines='skip', sep=';')

In [469]:
data.shape

(998, 2)

In [470]:
data.head(3)

Unnamed: 0,comment,isHate
0,comment you should know women sports are joke ...,1.0
1,comment you look like sloth with deeper down s...,1.0
2,comment you look like russian and speak like i...,1.0


In [471]:
data.isHate.unique()

array([1.        , 0.98387097, 0.98360656, 0.97826087, 0.97333333,
       0.96666667, 0.95454545, 0.94545455, 0.9375    , 0.90384615,
       0.85714286, 0.8490566 , 0.84615385, 0.83333333, 0.82142857,
       0.75      , 0.72222222, 0.67857143, 0.66666667, 0.60344828,
       0.53061224, 0.5       , 0.4       , 0.33333333, 0.30232558,
       0.296875  , 0.25      , 0.2       , 0.16666667, 0.16071429,
       0.15254237, 0.11111111, 0.10344828, 0.09090909, 0.03896104,
       0.03773585, 0.03174603, 0.03030303, 0.02985075, 0.02631579,
       0.01886792, 0.01639344, 0.        ])

In [472]:
data['isHate'] = data['isHate'].round(0)

In [473]:
data.isHate.unique()

array([1., 0.])

## train-test split

In [474]:
X = data['comment']
y = data['isHate']

In [475]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [476]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(698,) (300,) (698,) (300,)


# Feature extraction

In [477]:
word_vec = TfidfVectorizer(
    max_features=300,
    strip_accents='unicode',
    analyzer="word",
    stop_words="english",
    ngram_range=(1, 5),
)

In [478]:
X_train = word_vec.fit_transform(X_train)
X_test = word_vec.transform(X_test)

# Model creation and evaluation

In [479]:
clf = LGBMClassifier(random_state=42, learning_rate=0.5) 

In [480]:
clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.5,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [481]:
clf.fit(X_train, y_train)

In [482]:
proba_train = clf.predict_proba(X_train)
proba_test = clf.predict_proba(X_test)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [483]:
metrics_train = {"accuracy": accuracy_score(y_train, pred_train),
                 "f1_macro": f1_score(y_train, pred_train, average='macro'),
                 "roc_auc": roc_auc_score(y_train, proba_train[:, 1]),
                }

metrics_test = {"accuracy": accuracy_score(y_test, pred_test),
                "f1_macro": f1_score(y_test, pred_test, average='macro'),
                "roc_auc": roc_auc_score(y_test, proba_test[:, 1]),
                }

In [484]:
metrics_before_tuning = pd.DataFrame([metrics_train, metrics_test], index=['train', 'test']).T

In [485]:
metrics_before_tuning

Unnamed: 0,train,test
accuracy,0.921203,0.8
f1_macro,0.910336,0.779239
roc_auc,0.972565,0.819483


## dvc artifacts

In [486]:
baseline_model_params = dict()
baseline_model_params['params'] = clf.get_params()
baseline_model_params['accuracy'] = accuracy_score(y_test, pred_test)
baseline_model_params['f1_macro'] = f1_score(y_test, pred_test, average='macro')
baseline_model_params['roc_auc'] = roc_auc_score(y_test, proba_test[:, 1])

with open('baseline_model_params.json', 'w') as file:
    json.dump(baseline_model_params, file) 

In [487]:
with open('LGBMC_baseline_model.pkl', mode='wb') as file:
        pickle.dump(clf, file)

# Hyperparameter optimization

In [488]:
def objective(trial):
    
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "objective": "binary",
        "boosting_type": "dart",
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }
    
    model = LGBMClassifier(**param_grid)  
    
    model.fit(X_train, y_train)
    
    pred_test = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred_test)
    
    return accuracy

In [489]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print('Number of finished trials:', len(study.trials))

[32m[I 2023-06-10 10:58:00,997][0m A new study created in memory with name: no-name-bdc54109-faca-48bc-85cf-0ce7f040b131[0m
[32m[I 2023-06-10 10:58:02,856][0m Trial 0 finished with value: 0.8066666666666666 and parameters: {'n_estimators': 1000, 'learning_rate': 0.2486955111283389, 'num_leaves': 1280, 'max_depth': 9}. Best is trial 0 with value: 0.8066666666666666.[0m
[32m[I 2023-06-10 10:58:04,597][0m Trial 1 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 1000, 'learning_rate': 0.02301986381315008, 'num_leaves': 900, 'max_depth': 9}. Best is trial 1 with value: 0.8233333333333334.[0m
[32m[I 2023-06-10 10:58:06,113][0m Trial 2 finished with value: 0.8066666666666666 and parameters: {'n_estimators': 1000, 'learning_rate': 0.2122263275522174, 'num_leaves': 760, 'max_depth': 8}. Best is trial 1 with value: 0.8233333333333334.[0m
[32m[I 2023-06-10 10:58:06,886][0m Trial 3 finished with value: 0.83 and parameters: {'n_estimators': 1000, 'learning_rate

Number of finished trials: 20


In [490]:
print('Best trial:', study.best_trial.params)

Best trial: {'n_estimators': 1000, 'learning_rate': 0.08951996820130269, 'num_leaves': 80, 'max_depth': 5}


# Model with best params

In [491]:
clf_best = LGBMClassifier(**study.best_trial.params)

In [492]:
clf_best.fit(X_train, y_train)

In [493]:
proba_train = clf_best.predict_proba(X_train)
proba_test = clf_best.predict_proba(X_test)
pred_train = clf_best.predict(X_train)
pred_test = clf_best.predict(X_test)

In [494]:
metrics_train = {"accuracy": accuracy_score(y_train, pred_train),
                 "f1_macro": f1_score(y_train, pred_train, average='macro'),
                 "roc_auc": roc_auc_score(y_train, proba_train[:, 1]),
                }

metrics_test = {"accuracy": accuracy_score(y_test, pred_test),
                "f1_macro": f1_score(y_test, pred_test, average='macro'),
                "roc_auc": roc_auc_score(y_test, proba_test[:, 1]),
                }

In [495]:
metrics_after_tuning = pd.DataFrame([metrics_train, metrics_test], index=['train', 'test']).T

In [496]:
metrics_after_tuning

Unnamed: 0,train,test
accuracy,0.918338,0.803333
f1_macro,0.906863,0.781414
roc_auc,0.96556,0.842506


## dvc artifacts

In [497]:
tuned_model_params = dict()
tuned_model_params['params'] = clf_best.get_params()
tuned_model_params['accuracy'] = accuracy_score(y_test, pred_test)
tuned_model_params['f1_macro'] = f1_score(y_test, pred_test, average='macro')
tuned_model_params['roc_auc'] = roc_auc_score(y_test, proba_test[:, 1])

with open('tuned_model_params.json', 'w') as file:
    json.dump(tuned_model_params, file) 

In [498]:
with open('LGBMC_tuned_model.pkl', mode='wb') as file:
        pickle.dump(clf_best, file)