# Modeling & Training

## XGBoost classifier

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MaxAbsScaler,FunctionTransformer,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate,GridSearchCV,KFold,train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
# Load the cleaned dataset and replace every missing value with an empty
# string, so no NaN is left in the frame.
path = os.path.join("..", "..", "data", "clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1").fillna("")

In [3]:
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
# Use the text as the only feature and drop every row whose class is -1
# (presumably the marker for unlabeled samples -- TODO confirm).
y = df["class"]
X = df.loc[y != -1, "content"]
y = y.loc[y != -1]

In [5]:
def results_to_df(results):
    """Summarise a ``cv_results_``-style mapping as a DataFrame.

    Parameters
    ----------
    results : mapping
        Must contain every entry named in ``wanted`` below; a missing entry
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One column per selected entry, in the order listed in ``wanted``.
    """
    wanted = (
        "mean_fit_time",
        "mean_score_time",
        "params",
        "mean_test_accuracy",
        "mean_test_recall",
        "mean_test_precision",
        "mean_test_f1_score",
    )
    return pd.DataFrame({name: results[name] for name in wanted})

In [6]:
def evaluate_cv(models, metrics, cv, X, y):
    """Cross-validate every model and tabulate its mean timings and scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_validate``.
    metrics : dict
        Maps a metric name to a scorer; forwarded as ``scoring``.
    cv
        Cross-validation strategy forwarded to ``cross_validate``.
    X, y
        Features and targets forwarded to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by its name, with the columns
        ``fit_time``, ``score_time`` and one column per metric name. Each
        cell is the mean over the cross-validation folds.
    """
    metric_names = list(metrics)
    columns = ["fit_time", "score_time"] + metric_names
    # cross_validate stores each scorer's results under "test_<name>". Looking
    # the entries up by key keeps every score under the right column instead
    # of relying on the iteration order of the returned dict.
    result_keys = ["fit_time", "score_time"] + ["test_" + name for name in metric_names]

    rows = []
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv, scoring=metrics)
        rows.append([results[key].mean() for key in result_keys])

    return pd.DataFrame(data=rows, index=list(models), columns=columns)

In [7]:
# Hyper-parameter grid handed to GridSearchCV as ``param_grid``
# (7 x 3 x 3 = 63 combinations).
params = dict(
    scale_pos_weight=[1, 4, 6, 10, 50, 75, 99],
    max_depth=[5, 10, 20],
    learning_rate=[0.001, 0.005, 0.01],
)

In [8]:
# Scorers computed in every cross-validation run, keyed by the name under
# which their results are reported.
scoring = {
    name: make_scorer(metric)
    for name, metric in (
        ("accuracy", accuracy_score),
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
}

In [9]:
models = {}

In [10]:
# Bag-of-words pipeline: CountVectorizer features feed a grid search over the
# XGBClassifier hyper-parameters in `params`; the search is refit on the
# combination with the best mean F1 score.
# NOTE(review): the vectorizer is fitted on all of X before GridSearchCV splits
# the data, so every validation fold has already contributed to the vocabulary.
# Searching over a Pipeline(vectorizer, classifier) would avoid that, at the
# cost of changing how the results are accessed further down.
models["cv"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("estimator", GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=params,
        # Fixed seed: shuffled folds are otherwise different on every run,
        # which makes the selected parameters irreproducible.
        cv=KFold(shuffle=True, random_state=42),
        scoring=scoring,
        refit="f1_score",
    )),
])

In [11]:
models["cv"].fit(X, y)

In [12]:
results_to_df(models["cv"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.697138,0.014071,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.846725,0.805840,0.741730,0.732728
1,0.953054,0.012966,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.817992,0.983379,0.615931,0.757276
2,0.699128,0.013887,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.822919,0.976599,0.625067,0.761399
3,0.713316,0.012484,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.818953,0.979807,0.618750,0.757857
4,0.614879,0.011805,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.815012,0.979807,0.612582,0.753519
...,...,...,...,...,...,...,...
58,5.697664,0.013649,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.897108,0.931575,0.764711,0.839319
59,2.779513,0.013164,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.880291,0.925141,0.732903,0.816841
60,4.655739,0.020364,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.877325,0.931757,0.723595,0.813977
61,4.177692,0.017281,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.871394,0.935393,0.711143,0.807431


In [13]:
# TF-IDF pipeline: TfidfVectorizer features feed a grid search over the
# XGBClassifier hyper-parameters in `params`; the search is refit on the
# combination with the best mean F1 score.
# The first step is still named "cv" although it holds a TfidfVectorizer; the
# name is kept so the step names of the pipeline do not change.
# NOTE(review): the vectorizer is fitted on all of X before GridSearchCV splits
# the data, so every validation fold has already contributed to the vocabulary
# and IDF weights.
models["tfidf"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("estimator", GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=params,
        # Fixed seed: shuffled folds are otherwise different on every run,
        # which makes the selected parameters irreproducible.
        cv=KFold(shuffle=True, random_state=42),
        scoring=scoring,
        refit="f1_score",
    )),
])

In [14]:
models["tfidf"].fit(X, y)

In [15]:
results_to_df(models["tfidf"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.967166,0.013808,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.865478,0.720818,0.796725,0.753923
1,0.989497,0.012931,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.826869,0.969346,0.631593,0.764242
2,1.075257,0.012698,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.826845,0.975695,0.632732,0.766542
3,1.081417,0.013195,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.830830,0.989590,0.632423,0.771542
4,1.141887,0.016478,"{'learning_rate': 0.001, 'max_depth': 5, 'scal...",0.819948,0.992980,0.619076,0.761921
...,...,...,...,...,...,...,...
58,3.281051,0.011980,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.909008,0.937436,0.786256,0.855106
59,3.308904,0.011969,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.902083,0.930450,0.773936,0.844650
60,3.109631,0.011973,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.907043,0.955060,0.775031,0.855032
61,2.921145,0.011915,"{'learning_rate': 0.01, 'max_depth': 20, 'scal...",0.904073,0.941691,0.774950,0.849220


- Choosing the best model.

In [16]:
models["cv"]["estimator"].best_params_

{'learning_rate': 0.01, 'max_depth': 20, 'scale_pos_weight': 1}

In [17]:
models["tfidf"]["estimator"].best_params_

{'learning_rate': 0.01, 'max_depth': 10, 'scale_pos_weight': 6}

In [18]:
_models = {}

In [19]:
# Plain bag-of-words pipeline (no search) configured with the best parameters
# found by the corresponding grid search.
_models["cv"] = Pipeline([
    ("cv", CountVectorizer()),
    ("estimator", XGBClassifier(**models["cv"].named_steps["estimator"].best_params_)),
])

In [20]:
# Plain TF-IDF pipeline (no search) configured with the best parameters found
# by the corresponding grid search.
_models["tfidf"] = Pipeline([
    ("cv", TfidfVectorizer()),
    ("estimator", XGBClassifier(**models["tfidf"].named_steps["estimator"].best_params_)),
])

In [21]:
# Compare the two tuned pipelines with cross-validation.
# The splitter is seeded because evaluate_cv splits the data once per model:
# an unseeded shuffled KFold would give each model a different set of folds,
# so their scores would not be directly comparable.
evaluation_df = evaluate_cv(
    _models,
    scoring,
    cv=KFold(shuffle=True, random_state=42),
    X=X,
    y=y,
)

In [22]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv,1.706859,0.030936,0.904073,0.841432,0.805186,0.883931
tfidf,1.795834,0.03175,0.892147,0.831968,0.762019,0.918193


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

- Best feature-extraction technique: bag of words (`CountVectorizer`).
- Cross-validated F1-score of about 0.84.

<div id="save_the_best_model" >
    <h3>Save the best model to the disk</h3>
</div>

In [23]:
# Pick the pipeline with the highest cross-validated F1 score.
best_name = evaluation_df["f1_score"].idxmax()
best_model = _models[best_name]
# cross_validate only fits clones of the estimators, so the pipelines stored in
# `_models` are still unfitted. Train the winner on all labeled data before
# persisting it; otherwise the saved file cannot predict.
best_model.fit(X, y)

model_path = os.path.join("..","..","models","ssl","xgboost.joblib")
# Make sure the target directory exists so the dump cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(value=best_model, filename=model_path)

['../../models/ssl/xgboost.joblib']