# Modeling & Training

## Support Vector Machine

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MaxAbsScaler,FunctionTransformer,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate,GridSearchCV,KFold,train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
# Read the cleaned dataset (two directories up, under data/) and replace
# every missing value with an empty string in the same expression.
path = os.path.join("..", "..", "data", "clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1").fillna("")

In [3]:
# List the columns available in the loaded frame.
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
# Keep only the rows whose class is not -1
# (presumably -1 marks unlabeled rows — TODO confirm against the data prep step).
is_labelled = df["class"] != -1
X = df.loc[is_labelled, "content"]
y = df.loc[is_labelled, "class"]

In [5]:
def results_to_df(results):
    """Project a ``cv_results_``-style mapping onto the columns reported here.

    Parameters
    ----------
    results : mapping
        Must contain the timing, ``params`` and ``mean_test_*`` keys listed
        below; a missing key raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One column per selected key, in the order listed below.
    """
    wanted = (
        "mean_fit_time",
        "mean_score_time",
        "params",
        "mean_test_accuracy",
        "mean_test_recall",
        "mean_test_precision",
        "mean_test_f1_score",
    )
    return pd.DataFrame({column: results[column] for column in wanted})

In [6]:
# One scorer per reported metric. The dictionary keys are the metric names
# that show up as "mean_test_<key>" in the grid-search result tables.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ("accuracy", accuracy_score),
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
}

In [7]:
# Hyper-parameter grid for the pipeline step named "estimator" (the SVC):
# every combination of regularisation strength C and kernel is tried.
params = dict(
    estimator__C=[0.1, 1, 10, 20],
    estimator__kernel=["sigmoid", "rbf", "linear"],
)

In [8]:
def evaluate_cv(models,metrics,cv,X,y):
    """Cross-validate every model and tabulate the mean of each timing/metric.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_validate``.
    metrics : dict
        Maps a metric name to a scorer; forwarded as ``scoring``.
    cv : object
        Cross-validation strategy, forwarded unchanged to ``cross_validate``.
    X, y : array-like
        Features and targets, forwarded unchanged to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by its name) with the columns
        ``fit_time``, ``score_time`` followed by the metric names, each cell
        holding the mean over the folds.
    """
    metric_names = list(metrics.keys())
    columns = ["fit_time","score_time"] + metric_names
    # cross_validate reports each metric under the key "test_<metric name>".
    result_keys = ["fit_time","score_time"] + ["test_" + name for name in metric_names]

    rows = []
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv,scoring=metrics)
        # Look every column up by key instead of trusting the ordering of the
        # returned dict, so a value can never land under the wrong header.
        rows.append([results[key].mean() for key in result_keys])

    return pd.DataFrame(data=rows,index=list(models.keys()),columns=columns)

In [9]:
# Registry of the grid-searched pipelines, keyed by feature-extraction variant.
models = dict()

In [10]:
# Bag-of-words counts -> grid search over the SVC hyper-parameters in `params`.
# NOTE(review): the vectorizer sits outside GridSearchCV, so its vocabulary is
# learned on all of X before the folds are split — confirm this is acceptable.
models["cv"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",MaxAbsScaler()),
            # Seeded so the probability estimates do not vary between runs.
            ("estimator",SVC(probability=True, random_state=48))
        ]),
        param_grid=params,
        scoring=scoring,
        # Seeded shuffle: fold assignment, and therefore the reported scores
        # and the selected parameters, are reproducible on re-run.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [11]:
# Fit the count vectorizer, then run the SVC grid search on the vectorised text.
models["cv"].fit(X, y)

In [12]:
# Mean fit/score time and mean test metrics for each candidate of the "cv" grid search.
results_to_df(models["cv"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,4.82024,0.163197,"{'estimator__C': 0.1, 'estimator__kernel': 'si...",0.533161,0.99661,0.387259,0.554947
1,4.883171,0.157122,"{'estimator__C': 0.1, 'estimator__kernel': 'rbf'}",0.451046,0.332655,0.208401,0.25391
2,3.273671,0.107121,"{'estimator__C': 0.1, 'estimator__kernel': 'li...",0.807092,0.914168,0.613629,0.732031
3,3.554674,0.116493,"{'estimator__C': 1, 'estimator__kernel': 'sigm...",0.780364,0.957658,0.575156,0.716605
4,4.221417,0.136251,"{'estimator__C': 1, 'estimator__kernel': 'rbf'}",0.738819,1.0,0.526001,0.687934
5,2.115513,0.067412,"{'estimator__C': 1, 'estimator__kernel': 'line...",0.870424,0.851926,0.738286,0.790024
6,2.223319,0.067425,"{'estimator__C': 10, 'estimator__kernel': 'sig...",0.844711,0.925876,0.66368,0.772996
7,3.917315,0.098734,"{'estimator__C': 10, 'estimator__kernel': 'rbf'}",0.835824,0.968763,0.641808,0.771256
8,1.751606,0.057008,"{'estimator__C': 10, 'estimator__kernel': 'lin...",0.869419,0.821976,0.751189,0.783323
9,1.924583,0.059185,"{'estimator__C': 20, 'estimator__kernel': 'sig...",0.840731,0.88534,0.670256,0.762625


In [13]:
# TF-IDF features -> grid search over the SVC hyper-parameters in `params`.
# NOTE(review): the vectorizer sits outside GridSearchCV, so its vocabulary and
# IDF weights are learned on all of X before the folds are split — confirm
# this is acceptable.
models["tfidf"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",MaxAbsScaler()),
            # Seeded so the probability estimates do not vary between runs.
            ("estimator",SVC(probability=True, random_state=48))
        ]),
        param_grid=params,
        scoring=scoring,
        # Seeded shuffle: fold assignment, and therefore the reported scores
        # and the selected parameters, are reproducible on re-run.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [14]:
# Fit the TF-IDF vectorizer, then run the SVC grid search on the vectorised text.
models["tfidf"].fit(X, y)

In [15]:
# Mean fit/score time and mean test metrics for each candidate of the "tfidf" grid search.
results_to_df(models["tfidf"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,6.355462,0.17264,"{'estimator__C': 0.1, 'estimator__kernel': 'si...",0.92682,0.954866,0.820804,0.881946
1,5.411173,0.136601,"{'estimator__C': 0.1, 'estimator__kernel': 'rbf'}",0.679535,0.905886,0.470513,0.618336
2,3.342667,0.089281,"{'estimator__C': 0.1, 'estimator__kernel': 'li...",0.952509,0.873834,0.959101,0.914041
3,3.419078,0.082954,"{'estimator__C': 1, 'estimator__kernel': 'sigm...",0.962415,0.943167,0.925842,0.933938
4,5.17029,0.118506,"{'estimator__C': 1, 'estimator__kernel': 'rbf'}",0.79624,0.956492,0.589732,0.729279
5,3.281315,0.087989,"{'estimator__C': 1, 'estimator__kernel': 'line...",0.956475,0.886058,0.959556,0.921037
6,2.922892,0.066823,"{'estimator__C': 10, 'estimator__kernel': 'sig...",0.961415,0.913354,0.951755,0.931442
7,5.870245,0.117137,"{'estimator__C': 10, 'estimator__kernel': 'rbf'}",0.808111,0.96307,0.604255,0.742233
8,3.346905,0.087654,"{'estimator__C': 10, 'estimator__kernel': 'lin...",0.956475,0.886058,0.959556,0.921037
9,3.000023,0.071425,"{'estimator__C': 20, 'estimator__kernel': 'sig...",0.963396,0.920022,0.952219,0.935088


In [16]:
# Bag-of-words counts reduced to 100 SVD components -> SVC grid search.
# NOTE(review): the vectorizer and the SVD sit outside GridSearchCV, so both
# are fitted on all of X before the folds are split — confirm this is acceptable.
models["cv_svd"] = Pipeline(steps=[
    ("cv",CountVectorizer()),
    # Seeded so the projection is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",StandardScaler()),
            # Seeded so the probability estimates do not vary between runs.
            ("estimator",SVC(probability=True, random_state=48))
        ]),
        param_grid=params,
        scoring=scoring,
        # Seeded shuffle: fold assignment, and therefore the reported scores
        # and the selected parameters, are reproducible on re-run.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [17]:
# Fit the count vectorizer and the SVD projection, then run the SVC grid search.
models["cv_svd"].fit(X, y)

In [18]:
# Mean fit/score time and mean test metrics for each candidate of the "cv_svd" grid search.
results_to_df(models["cv_svd"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.65295,0.028198,"{'estimator__C': 0.1, 'estimator__kernel': 'si...",0.855592,0.865021,0.704166,0.773879
1,0.533853,0.032884,"{'estimator__C': 0.1, 'estimator__kernel': 'rbf'}",0.688392,0.834796,0.476849,0.606894
2,0.203788,0.012623,"{'estimator__C': 0.1, 'estimator__kernel': 'li...",0.881295,0.9606,0.719059,0.821779
3,0.463042,0.02188,"{'estimator__C': 1, 'estimator__kernel': 'sigm...",0.820982,0.944408,0.623817,0.751216
4,0.368635,0.023486,"{'estimator__C': 1, 'estimator__kernel': 'rbf'}",0.868449,0.889088,0.719709,0.795178
5,0.200874,0.010802,"{'estimator__C': 1, 'estimator__kernel': 'line...",0.910979,0.93086,0.792864,0.855819
6,0.255699,0.014654,"{'estimator__C': 10, 'estimator__kernel': 'sig...",0.835785,0.849482,0.669895,0.74736
7,0.251455,0.017562,"{'estimator__C': 10, 'estimator__kernel': 'rbf'}",0.886256,0.863773,0.767276,0.812537
8,0.41569,0.01005,"{'estimator__C': 10, 'estimator__kernel': 'lin...",0.913954,0.910549,0.810656,0.856983
9,0.229454,0.013844,"{'estimator__C': 20, 'estimator__kernel': 'sig...",0.820948,0.801105,0.654791,0.720072


In [19]:
# TF-IDF features reduced to 100 SVD components -> SVC grid search.
# NOTE(review): the vectorizer and the SVD sit outside GridSearchCV, so both
# are fitted on all of X before the folds are split — confirm this is acceptable.
models["tfidf_svd"] = Pipeline(steps=[
    ("cv",TfidfVectorizer()),
    # Seeded so the projection is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",StandardScaler()),
            # Seeded so the probability estimates do not vary between runs.
            ("estimator",SVC(probability=True, random_state=48))
        ]),
        param_grid=params,
        scoring=scoring,
        # Seeded shuffle: fold assignment, and therefore the reported scores
        # and the selected parameters, are reproducible on re-run.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [20]:
# Fit the TF-IDF vectorizer and the SVD projection, then run the SVC grid search.
models["tfidf_svd"].fit(X, y)

In [21]:
# Mean fit/score time and mean test metrics for each candidate of the "tfidf_svd" grid search.
results_to_df(models["tfidf_svd"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.602991,0.025603,"{'estimator__C': 0.1, 'estimator__kernel': 'si...",0.917885,0.9838,0.786476,0.873834
1,0.42401,0.026027,"{'estimator__C': 0.1, 'estimator__kernel': 'rbf'}",0.882286,0.740128,0.835257,0.78378
2,0.176817,0.010661,"{'estimator__C': 0.1, 'estimator__kernel': 'li...",0.949539,0.962368,0.876803,0.91719
3,0.320403,0.015864,"{'estimator__C': 1, 'estimator__kernel': 'sigm...",0.909004,0.958302,0.777492,0.857534
4,0.299555,0.019203,"{'estimator__C': 1, 'estimator__kernel': 'rbf'}",0.95745,0.956157,0.904061,0.929261
5,0.30128,0.010404,"{'estimator__C': 1, 'estimator__kernel': 'line...",0.938668,0.935125,0.864969,0.897974
6,0.197239,0.012456,"{'estimator__C': 10, 'estimator__kernel': 'sig...",0.885241,0.86374,0.769855,0.81292
7,0.260134,0.016996,"{'estimator__C': 10, 'estimator__kernel': 'rbf'}",0.968336,0.945901,0.945741,0.945738
8,0.491726,0.009656,"{'estimator__C': 10, 'estimator__kernel': 'lin...",0.924826,0.897833,0.851201,0.873334
9,0.195166,0.012295,"{'estimator__C': 20, 'estimator__kernel': 'sig...",0.856538,0.857633,0.713436,0.775388


- Choosing the best model overall.

In [22]:
def get_best_params(pipeline):
    """Return the grid search's best parameters without their step prefix.

    ``pipeline["grid_search"].best_params_`` is read, and each key of the
    form ``"<step>__<param>"`` is reduced to the segment that follows the
    first ``"__"`` (e.g. ``"estimator__C"`` becomes ``"C"``).

    Returns
    -------
    dict
        Stripped parameter name -> selected value.
    """
    best = pipeline["grid_search"].best_params_
    return {name.split("__")[1]: value for name, value in best.items()}

In [23]:
# Print the winning SVC settings of each grid search, in the order the
# pipelines were added to `models`.
for fitted_pipeline in models.values():
    best_settings = get_best_params(fitted_pipeline)
    print(best_settings)

{'C': 1, 'kernel': 'linear'}
{'C': 20, 'kernel': 'sigmoid'}
{'C': 10, 'kernel': 'linear'}
{'C': 10, 'kernel': 'rbf'}


In [24]:
_models = {}

In [25]:
# End-to-end pipeline for the count-vectoriser variant, using the SVC
# parameters selected by the corresponding grid search.
_models["cv"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    # NOTE(review): FunctionTransformer() is given no function here — presumably
    # an identity placeholder; confirm whether this step is still needed.
    ("ft", FunctionTransformer()),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",MaxAbsScaler()),
    # Seeded so the probability estimates do not vary between runs.
    ("estimator",SVC(probability=True,random_state=48,**get_best_params(models["cv"])))
])

In [26]:
# End-to-end pipeline for the count-vectoriser + SVD variant, using the SVC
# parameters selected by the corresponding grid search.
_models["cv_svd"] = Pipeline(steps=[
    ("cv",CountVectorizer()),
    # Seeded so the projection is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",StandardScaler()),
    # Seeded so the probability estimates do not vary between runs.
    ("estimator",SVC(probability=True,random_state=48,**get_best_params(models["cv_svd"])))
])

In [27]:
# End-to-end pipeline for the TF-IDF variant, using the SVC parameters
# selected by the corresponding grid search.
_models["tfidf"] = Pipeline(steps=[
    ("cv", TfidfVectorizer()),
    # NOTE(review): FunctionTransformer() is given no function here — presumably
    # an identity placeholder; confirm whether this step is still needed.
    ("ft", FunctionTransformer()),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",MaxAbsScaler()),
    # Seeded so the probability estimates do not vary between runs.
    ("estimator",SVC(probability=True,random_state=48,**get_best_params(models["tfidf"])))
])

In [28]:
# End-to-end pipeline for the TF-IDF + SVD variant, using the SVC parameters
# selected by the corresponding grid search.
_models["tfidf_svd"] = Pipeline(steps=[
    ("cv",TfidfVectorizer()),
    # Seeded so the projection is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",StandardScaler()),
    # Seeded so the probability estimates do not vary between runs.
    ("estimator",SVC(probability=True,random_state=48,**get_best_params(models["tfidf_svd"])))
])

In [29]:
# Cross-validate the final pipelines end-to-end (vectoriser included).
# The shuffle is seeded so the comparison table is reproducible on re-run.
evaluation_df = evaluate_cv(_models,scoring,cv=KFold(shuffle=True, random_state=48),X=X,y=y)

In [30]:
# Mean cross-validated timings and scores, one row per final pipeline.
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv,2.227077,0.079038,0.860518,0.77302,0.729154,0.828816
cv_svd,1.069855,0.028762,0.932737,0.885551,0.85324,0.921114
tfidf,3.015046,0.079291,0.964386,0.937951,0.935885,0.941993
tfidf_svd,0.830321,0.037185,0.926786,0.886562,0.800334,0.996552


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

- Best configuration: TF-IDF as the feature-extraction technique,
- without dimensionality reduction,
- reaching a cross-validated F1-score of 0.94.

<div id="save_the_best_model" >
    <h3>Save the best model to the disk</h3>
</div>

In [31]:
# Pick the pipeline with the highest mean F1 score in the evaluation table.
best_name = evaluation_df["f1_score"].idxmax()

# cross_validate() only fits clones of the estimators it is given, so the
# pipelines stored in `_models` are still unfitted at this point. Fit the
# winner on all labelled data so the persisted artifact can predict directly.
best_model = _models[best_name].fit(X, y)

model_path = os.path.join("..","..","models","ssl","svm.joblib")
# Create the target directory if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(value=best_model,filename=model_path)

['../../models/ssl/svm.joblib']