# Modeling & Training

## Decision tree classifier

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MaxAbsScaler,FunctionTransformer,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate,GridSearchCV,KFold,train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
path = os.path.join("..","..","data","clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1")
df.fillna("",inplace=True)

In [3]:
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
X = df["content"]
y = df["class"]
X = X[y != -1]
y = y[y != -1]

In [5]:
def results_to_df(results):
    _results = {}
    keys = ["mean_fit_time","mean_score_time","params","mean_test_accuracy","mean_test_recall","mean_test_precision","mean_test_f1_score"]
    for key in keys:
        _results[key] = results[key]
    return pd.DataFrame(_results)

In [6]:
scoring = {
    "accuracy":make_scorer(accuracy_score),
    "f1_score":make_scorer(f1_score),
    "precision":make_scorer(precision_score),
    "recall":make_scorer(recall_score)
}

In [7]:
params = {
    "estimator__min_samples_split": [2,5,10,20],
    "estimator__min_impurity_decrease":[0.0,0.005,0.01],
    "estimator__max_depth":[10,20,50,100,None]
}

In [8]:
def evaluate_cv(models,metrics,cv,X,y):

    df = []
    index = models.keys()
    columns = ["fit_time","score_time"]
    columns = columns + list(metrics.keys())
    
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv,scoring=metrics)
        scores = []
        for score in results.values():
            scores.append(score.mean())
        df.append(scores)

    df = pd.DataFrame(data=df,index=index,columns=columns)
    return df

In [9]:
models = {}

In [10]:
models["cv_smote"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("estimator",DecisionTreeClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True))
    )
])

In [11]:
models["cv_smote"].fit(X, y)

In [12]:
results_to_df(models["cv_smote"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.115828,0.009659,"{'estimator__max_depth': 10, 'estimator__min_i...",0.886241,0.887134,0.760333,0.81733
1,0.104551,0.009624,"{'estimator__max_depth': 10, 'estimator__min_i...",0.884266,0.886313,0.757278,0.814652
2,0.102307,0.009474,"{'estimator__max_depth': 10, 'estimator__min_i...",0.892182,0.896662,0.77036,0.82679
3,0.097577,0.009497,"{'estimator__max_depth': 10, 'estimator__min_i...",0.888226,0.88939,0.763715,0.820296
4,0.096489,0.009607,"{'estimator__max_depth': 10, 'estimator__min_i...",0.885256,0.875814,0.763884,0.814272
5,0.093243,0.009462,"{'estimator__max_depth': 10, 'estimator__min_i...",0.888226,0.876225,0.771607,0.818146
6,0.092459,0.009465,"{'estimator__max_depth': 10, 'estimator__min_i...",0.886246,0.878891,0.764021,0.815724
7,0.088965,0.00944,"{'estimator__max_depth': 10, 'estimator__min_i...",0.889216,0.878891,0.771886,0.819709
8,0.084552,0.009441,"{'estimator__max_depth': 10, 'estimator__min_i...",0.868444,0.87929,0.725147,0.792586
9,0.083034,0.009437,"{'estimator__max_depth': 10, 'estimator__min_i...",0.867454,0.875653,0.724252,0.790593


In [13]:
models["tfidf_smote"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("estimator",DecisionTreeClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True))
    )
])

In [14]:
models["tfidf_smote"].fit(X, y)

In [15]:
results_to_df(models["tfidf_smote"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.771004,0.00964,"{'estimator__max_depth': 10, 'estimator__min_i...",0.891172,0.916549,0.755453,0.828094
1,0.729157,0.009367,"{'estimator__max_depth': 10, 'estimator__min_i...",0.892167,0.920058,0.756226,0.829984
2,0.734218,0.009893,"{'estimator__max_depth': 10, 'estimator__min_i...",0.891177,0.916549,0.755331,0.828003
3,0.735287,0.009541,"{'estimator__max_depth': 10, 'estimator__min_i...",0.896127,0.920058,0.764384,0.834703
4,0.736061,0.009396,"{'estimator__max_depth': 10, 'estimator__min_i...",0.894152,0.916779,0.761688,0.831821
5,0.72747,0.009366,"{'estimator__max_depth': 10, 'estimator__min_i...",0.891182,0.916779,0.755649,0.828306
6,0.729338,0.009329,"{'estimator__max_depth': 10, 'estimator__min_i...",0.891182,0.916779,0.755649,0.828306
7,0.729681,0.00941,"{'estimator__max_depth': 10, 'estimator__min_i...",0.892167,0.916779,0.757796,0.829589
8,0.725667,0.009382,"{'estimator__max_depth': 10, 'estimator__min_i...",0.886231,0.910698,0.747403,0.820822
9,0.721726,0.009356,"{'estimator__max_depth': 10, 'estimator__min_i...",0.886231,0.910698,0.747403,0.820822


In [16]:
models["cv"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("estimator",DecisionTreeClassifier(class_weight="balanced"))
        ]),
        param_grid=params,
        scoring=scoring,
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True))
    )
])

In [17]:
models["cv"].fit(X, y)

In [18]:
results_to_df(models["cv"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.084464,0.009316,"{'estimator__max_depth': 10, 'estimator__min_i...",0.878301,0.954787,0.716854,0.817813
1,0.084734,0.00942,"{'estimator__max_depth': 10, 'estimator__min_i...",0.87533,0.951338,0.71279,0.813726
2,0.081491,0.009298,"{'estimator__max_depth': 10, 'estimator__min_i...",0.875326,0.961683,0.709357,0.815447
3,0.078187,0.009307,"{'estimator__max_depth': 10, 'estimator__min_i...",0.878301,0.951338,0.718321,0.817205
4,0.085207,0.009638,"{'estimator__max_depth': 10, 'estimator__min_i...",0.87434,0.940994,0.714166,0.810412
5,0.08312,0.00933,"{'estimator__max_depth': 10, 'estimator__min_i...",0.878296,0.944442,0.720703,0.815839
6,0.081231,0.009335,"{'estimator__max_depth': 10, 'estimator__min_i...",0.881252,0.944442,0.727099,0.819622
7,0.075844,0.009295,"{'estimator__max_depth': 10, 'estimator__min_i...",0.882242,0.94789,0.727723,0.821559
8,0.072868,0.009339,"{'estimator__max_depth': 10, 'estimator__min_i...",0.860503,0.94789,0.6869,0.795262
9,0.073284,0.009676,"{'estimator__max_depth': 10, 'estimator__min_i...",0.861493,0.94789,0.688343,0.796326


In [19]:
models["tfidf"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("estimator",DecisionTreeClassifier(class_weight="balanced"))
        ]),
        param_grid=params,
        scoring=scoring,
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True))
    )
])

In [20]:
models["tfidf"].fit(X, y)

In [21]:
results_to_df(models["tfidf"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.116225,0.009355,"{'estimator__max_depth': 10, 'estimator__min_i...",0.892177,0.9267,0.755673,0.83201
1,0.114933,0.009536,"{'estimator__max_depth': 10, 'estimator__min_i...",0.887236,0.932552,0.744497,0.82648
2,0.110143,0.009284,"{'estimator__max_depth': 10, 'estimator__min_i...",0.888221,0.932872,0.745595,0.827832
3,0.105295,0.009302,"{'estimator__max_depth': 10, 'estimator__min_i...",0.889206,0.92594,0.750553,0.827889
4,0.113058,0.009303,"{'estimator__max_depth': 10, 'estimator__min_i...",0.890201,0.932992,0.75072,0.830789
5,0.11309,0.009314,"{'estimator__max_depth': 10, 'estimator__min_i...",0.891196,0.929043,0.753018,0.830695
6,0.108373,0.009315,"{'estimator__max_depth': 10, 'estimator__min_i...",0.881295,0.923102,0.734615,0.817564
7,0.106605,0.009685,"{'estimator__max_depth': 10, 'estimator__min_i...",0.889211,0.92603,0.7512,0.828391
8,0.103002,0.009336,"{'estimator__max_depth': 10, 'estimator__min_i...",0.880305,0.925655,0.733425,0.816831
9,0.104577,0.009373,"{'estimator__max_depth': 10, 'estimator__min_i...",0.882286,0.922377,0.737792,0.818601


- Choosing the best model overall.

In [22]:
def get_best_params(pipeline):
    params = pipeline["grid_search"].best_params_
    result = {}
    for key in params.keys():
        result[key.split("__")[1]] = params[key]
    return result

In [23]:
for model in models.values():
    print(get_best_params(model))

{'max_depth': 20, 'min_impurity_decrease': 0.0, 'min_samples_split': 2}
{'max_depth': 50, 'min_impurity_decrease': 0.0, 'min_samples_split': 2}
{'max_depth': 100, 'min_impurity_decrease': 0.0, 'min_samples_split': 10}
{'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 10}


In [24]:
_models = {}

In [25]:
_models["cv_smote"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("oversampler", SMOTE(random_state=48)),
    ("estimator",DecisionTreeClassifier(**get_best_params(models["cv_smote"])))
])

In [26]:
_models["tfidf_smote"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("oversampler", SMOTE(random_state=48)),
    ("estimator",DecisionTreeClassifier(**get_best_params(models["tfidf_smote"])))
])

In [27]:
_models["cv"] = Pipeline(steps=[
    ("ct", CountVectorizer()),
    ("estimator",DecisionTreeClassifier(**get_best_params(models["cv"])))
])

In [28]:
_models["tfidf"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("estimator",DecisionTreeClassifier(**get_best_params(models["tfidf"])))
])

In [29]:
evaluation_df = evaluate_cv(_models,scoring,cv=KFold(shuffle=True),X=X,y=y)

In [30]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv_smote,0.242604,0.024164,0.901097,0.836348,0.797112,0.883853
tfidf_smote,1.131949,0.025333,0.887207,0.803059,0.801169,0.818317
cv,0.232857,0.023859,0.900083,0.825263,0.831019,0.823231
tfidf,0.227081,0.025194,0.876369,0.772343,0.826395,0.727991


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

- Bag of words (CountVectorizer) as feature extraction technique.
- with over sampling.

<div id="save_the_best_model" >
    <h3>Save the best model to the disk</h3>
</div>

In [31]:
dump(value=_models[evaluation_df.index[evaluation_df["f1_score"].argmax()]],
     filename=os.path.join("..","..","models","ssl","dtree.joblib"))

['../../models/ssl/dtree.joblib']