# Modeling & Training

## Logistic Regression

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_validate,GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
path = os.path.join("..","..","data","clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1")
df.fillna("",inplace=True)

In [3]:
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
X = df["content"]
y = df["class"]
X = X[y != -1]
y = y[y != -1]

In [5]:
params = {
    "class_weight": [
        { 0:0.05,1:0.95 },
        { 0:0.1,1:0.9},
        { 0:0.15,1:0.85 },
        { 0: 0.2,1: 0.8 },
        { 0: 0.3,1: 0.7},
        { 0: 0.35,1: 0.65},
        { 0: 0.4,1: 0.6},
        { 0: 0.45,1: 0.55},
        { 0: 0.5,1: 0.5}
    ],
    "tol": [0.001,None]
}

In [6]:
scoring = {
    "accuracy":make_scorer(accuracy_score),
    "f1_score":make_scorer(f1_score),
    "precision":make_scorer(precision_score),
    "recall":make_scorer(recall_score)
}

In [7]:
models = {}

In [8]:
def results_to_df(results):
    _results = {}
    keys = ["mean_fit_time","mean_score_time","params","mean_test_accuracy","mean_test_recall","mean_test_precision","mean_test_f1_score"]
    for key in keys:
        _results[key] = results[key]
    return pd.DataFrame(_results)

In [9]:
def create_model(estimator,feature_extractor,with_svd,resampler=None):
    
    steps = [("feature_extractor",feature_extractor)]

    if with_svd:
        steps.append(("dr",TruncatedSVD(n_components=100)))
        steps.append(("scaler",StandardScaler()))
    else:
        steps.append(("scaler",MaxAbsScaler()))

    if resampler is not None:
        steps.append(("resampler",resampler))

    steps.append(("estimator",estimator))

    model = Pipeline(steps=steps)

    return model

In [10]:
def evaluate_cv(models,metrics,cv,X,y):

    df = []
    index = models.keys()
    columns = ["fit_time","score_time"]
    columns = columns + list(metrics.keys())
    
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv,scoring=metrics)
        scores = []
        for score in results.values():
            scores.append(score.mean())
        df.append(scores)

    df = pd.DataFrame(data=df,index=index,columns=columns)
    return df

In [11]:
estimator = GridSearchCV(
    SGDClassifier(max_iter=200,loss="log_loss"),
    param_grid=params,
    scoring=scoring,
    refit="f1_score",
    cv=KFold(n_splits=5, shuffle=True)
)

- Using Bag of words
- No dimentiality reduction.
- MaxAbsScaler
- try different class weights to handle the problem of imbalanced data.

In [12]:
models["cv"] = create_model(estimator,feature_extractor=CountVectorizer(),with_svd=False)

In [13]:
models["cv"].fit(X, y)

In [14]:
results_to_df(models["cv"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.012118,0.018155,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': 0....",0.908999,0.9783,0.76786,0.860258
1,0.066456,0.008495,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': None}",0.918895,0.981809,0.786519,0.873022
2,0.005153,0.008413,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': 0.001}",0.940638,0.960366,0.850431,0.901559
3,0.066542,0.008465,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': None}",0.945579,0.920364,0.892514,0.905866
4,0.005119,0.00888,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': 0....",0.934692,0.924654,0.859758,0.890329
5,0.066123,0.00847,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': None}",0.926786,0.827873,0.909199,0.865847
6,0.004901,0.00836,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': 0.001}",0.927781,0.867938,0.879663,0.872207
7,0.065902,0.008451,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': None}",0.91887,0.783971,0.922898,0.847134
8,0.004799,0.008341,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': 0.001}",0.925806,0.813901,0.920729,0.862418
9,0.065851,0.00839,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': None}",0.909969,0.729746,0.946217,0.823564


- Using Bag of words
- Using dimentiality reduction.
- Standard Scaler.
- try different class weights to handle the problem of imbalanced data.

In [15]:
models["cv_svd"] = create_model(estimator,feature_extractor=CountVectorizer(),with_svd=True)

In [16]:
models["cv_svd"].fit(X,y)

In [17]:
results_to_df(models["cv_svd"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.051056,0.023136,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': 0....",0.865483,0.960823,0.691303,0.803664
1,0.131078,0.019205,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': None}",0.858572,0.959588,0.680724,0.796021
2,0.042059,0.019204,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': 0.001}",0.899107,0.955939,0.75679,0.844457
3,0.130226,0.019359,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': None}",0.889226,0.959588,0.735603,0.832481
4,0.040819,0.019136,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': 0....",0.910003,0.955884,0.780866,0.859443
5,0.1307,0.018653,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': None}",0.905048,0.959218,0.767918,0.852775
6,0.036818,0.019039,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': 0.001}",0.923831,0.959643,0.811683,0.878934
7,0.131166,0.019728,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': None}",0.915924,0.956141,0.794081,0.867201
8,0.030558,0.018802,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': 0.001}",0.926796,0.939306,0.829043,0.879833
9,0.13024,0.018498,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': None}",0.928771,0.949583,0.82906,0.884522


- Tfidf Vectorizer.
- no dimentiality reduction.
- MaxAbsScaler
- trying different weights.

In [18]:
models["tfidf"] = create_model(estimator,feature_extractor=TfidfVectorizer(),with_svd=False)

In [19]:
models["tfidf"].fit(X, y)

In [20]:
results_to_df(models["tfidf"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.004887,0.008282,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': 0....",0.941652,0.996,0.831727,0.906153
1,0.067051,0.008342,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': None}",0.943618,0.996,0.836658,0.909105
2,0.004758,0.008239,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': 0.001}",0.953514,0.985443,0.869411,0.923367
3,0.068265,0.008417,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': None}",0.96241,0.992721,0.888932,0.937735
4,0.004672,0.008208,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': 0....",0.95945,0.982908,0.887183,0.93251
5,0.06712,0.008433,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': None}",0.970326,0.979651,0.920981,0.949308
6,0.004674,0.008208,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': 0.001}",0.966371,0.971294,0.915273,0.942309
7,0.066224,0.008315,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': None}",0.970331,0.97208,0.925829,0.948231
8,0.004732,0.008257,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': 0.001}",0.968356,0.968444,0.922457,0.944796
9,0.067617,0.009299,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': None}",0.968346,0.955796,0.934475,0.944822


- Tfidf Vectorizer.
- dimentiality reduction.
- StandardScaler.
- trying different weights.

In [21]:
models["tfidf_svd"] = create_model(estimator,feature_extractor=TfidfVectorizer(),with_svd=True)

In [22]:
models["tfidf_svd"].fit(X, y)

In [23]:
results_to_df(models["tfidf_svd"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.037372,0.019194,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': 0....",0.941633,0.977251,0.844374,0.905277
1,0.129826,0.019055,"{'class_weight': {0: 0.05, 1: 0.95}, 'tol': None}",0.937677,0.980641,0.832152,0.899307
2,0.031984,0.019234,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': 0.001}",0.943613,0.962372,0.858562,0.906311
3,0.129864,0.018903,"{'class_weight': {0: 0.1, 1: 0.9}, 'tol': None}",0.943613,0.973861,0.851878,0.907169
4,0.033301,0.019013,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': 0....",0.944613,0.942418,0.873606,0.905582
5,0.129912,0.018885,"{'class_weight': {0: 0.15, 1: 0.85}, 'tol': None}",0.945593,0.966076,0.861121,0.909252
6,0.02775,0.019389,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': 0.001}",0.946598,0.945045,0.87635,0.908818
7,0.130185,0.01905,"{'class_weight': {0: 0.2, 1: 0.8}, 'tol': None}",0.947574,0.959727,0.870858,0.911969
8,0.033103,0.019059,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': 0.001}",0.950554,0.927539,0.900912,0.913348
9,0.129488,0.018975,"{'class_weight': {0: 0.3, 1: 0.7}, 'tol': None}",0.949563,0.952848,0.8804,0.914397


- now we gonna try oversampling with the different preprocessing methods,here we'll use tow oversampling techniques.

In [24]:
_models = {}

In [25]:
_models["cv_smote"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=CountVectorizer(),
    with_svd=False,
    resampler=SMOTE(random_state=42)
)

In [26]:
_models["cv_svd_smote"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=CountVectorizer(),
    with_svd=True,
    resampler=SMOTE(random_state=42)
)

In [27]:
_models["tfidf_smote"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=TfidfVectorizer(),
    with_svd=False,
    resampler=SMOTE(random_state=42)
)

In [28]:
_models["tfidf_svd_smote"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=TfidfVectorizer(),
    with_svd=True,
    resampler=SMOTE(random_state=42)
)

In [29]:
_models["cv_adasyn"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=CountVectorizer(),
    with_svd=False,
    resampler=ADASYN(random_state=42)
)

In [30]:
_models["cv_svd_adasyn"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=CountVectorizer(),
    with_svd=True,
    resampler=ADASYN(random_state=42)
)

In [31]:
_models["tfidf_adasyn"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=TfidfVectorizer(),
    with_svd=False,
    resampler=ADASYN(random_state=42)
)

In [32]:
_models["tfidf_svd_adasyn"] = create_model(
    estimator=SGDClassifier(loss="log_loss"),
    feature_extractor=TfidfVectorizer(),
    with_svd=True,
    resampler=ADASYN(random_state=42)
)

In [33]:
evaluation_df = evaluate_cv(_models,scoring,cv=KFold(n_splits=5, shuffle=True),X=X, y=y)

In [34]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv_smote,0.689846,0.028392,0.930766,0.876901,0.891362,0.864426
cv_svd_smote,0.626957,0.058319,0.934717,0.888196,0.876796,0.902672
tfidf_smote,0.661829,0.025657,0.946583,0.904258,0.907784,0.903008
tfidf_svd_smote,0.630237,0.057573,0.940653,0.892291,0.934138,0.85866
cv_adasyn,1.58308,0.030326,0.939692,0.893557,0.907933,0.882805
cv_svd_adasyn,0.634134,0.054128,0.926786,0.876577,0.846727,0.910059
tfidf_adasyn,1.543348,0.027635,0.944628,0.905224,0.912161,0.899192
tfidf_svd_adasyn,0.624815,0.061283,0.934732,0.878411,0.918302,0.842699


- choosing the best model overall

In [35]:
__models = {}

In [36]:
__models["cv"] = create_model(
    SGDClassifier(**models["cv"]["estimator"].best_params_,loss="log_loss"),
    CountVectorizer(),
    with_svd=False,
    resampler=None
)

In [37]:
__models["cv_svd"] = create_model(
    SGDClassifier(**models["cv_svd"]["estimator"].best_params_,loss="log_loss"),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [38]:
__models["tfidf"] = create_model(
    SGDClassifier(**models["tfidf"]["estimator"].best_params_,loss="log_loss"),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [39]:
__models["tfidf_svd"] = create_model(
    SGDClassifier(**models["tfidf_svd"]["estimator"].best_params_,loss="log_loss"),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [40]:
__models["with_oversampling"] = _models[evaluation_df.index[evaluation_df["f1_score"].argmax()]]

In [41]:
evaluation_df = evaluate_cv(__models,scoring,cv=KFold(n_splits=5, shuffle=True),X=X, y=y)

In [42]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv,0.410599,0.024542,0.907009,0.819674,0.922455,0.738126
cv_svd,0.98877,0.037401,0.930742,0.887524,0.834392,0.948252
tfidf,0.999447,0.037407,0.92185,0.87303,0.821693,0.934193
tfidf_svd,1.013939,0.037433,0.922841,0.874014,0.824758,0.931623
with_oversampling,1.545952,0.027103,0.936707,0.890522,0.8804,0.903999


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

- After trying several feature extraction and balancing techniques the best model the one with:
    - Tfidf as its feature extraction technique.
    - with dimentiality reduction.
    - oversampling with SMOTE
    - an f1 score of 0.89.

<div id="save_the_best_model" >
    <h3>Save the best model</h3>
</div>

In [43]:
dump(value=__models[evaluation_df.index[evaluation_df["f1_score"].argmax()]],filename=os.path.join("..","..","models","ssl","sgd.joblib"))

['../../models/ssl/sgd.joblib']