# Modeling & Training

## Logistic Regression

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_validate,GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline as SKPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
path = os.path.join("..","..","data","clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1")
df.fillna("",inplace=True)

In [3]:
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
X = df["content"]
y = df["class"]
X = X[y != -1]
y = y[y != -1]

In [5]:
params = {
    "class_weight": [
        { 0:0.05,1:0.95 },
        { 0:0.1,1:0.9},
        { 0:0.15,1:0.85 },
        { 0: 0.2,1: 0.8 },
        { 0: 0.3,1: 0.7},
        { 0: 0.35,1: 0.65},
        { 0: 0.4,1: 0.6},
        { 0: 0.45,1: 0.55},
        { 0: 0.5,1: 0.5}
    ]
}

In [6]:
scoring = {
    "accuracy":make_scorer(accuracy_score),
    "f1_score":make_scorer(f1_score),
    "precision":make_scorer(precision_score),
    "recall":make_scorer(recall_score)
}

In [7]:
models = {}

In [8]:
def results_to_df(results):
    _results = {}
    keys = ["mean_fit_time","mean_score_time","params","mean_test_accuracy","mean_test_recall","mean_test_precision","mean_test_f1_score"]
    for key in keys:
        _results[key] = results[key]
    return pd.DataFrame(_results)

In [17]:
def create_model(estimator,feature_extractor,with_svd,resampler=None):
    
    steps = [("feature_extractor",feature_extractor)]

    if with_svd:
        steps.append(("dr",TruncatedSVD(n_components=100)))
        steps.append(("scaler",StandardScaler()))
    else:
        steps.append(("scaler",MaxAbsScaler()))

    if resampler is not None:
        steps.append(("resampler",resampler))

    steps.append(("estimator",estimator))

    model = Pipeline(steps=steps)

    return model

In [10]:
def evaluate_cv(models,metrics,cv,X,y):

    df = []
    index = models.keys()
    columns = ["fit_time","score_time"]
    columns = columns + list(metrics.keys())
    
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv,scoring=metrics)
        scores = []
        for score in results.values():
            scores.append(score.mean())
        df.append(scores)

    df = pd.DataFrame(data=df,index=index,columns=columns)
    return df

In [11]:
estimator = GridSearchCV(
    LogisticRegression(max_iter=200),
    param_grid=params,
    scoring=scoring,
    refit="f1_score",
    cv=KFold(n_splits=5, shuffle=True)
)

- Using Bag of words
- No dimentiality reduction.
- MaxAbsScaler
- try different class weights to handle the problem of imbalanced data.

In [12]:
models["cv"] = create_model(estimator,feature_extractor=CountVectorizer(),with_svd=False)

In [13]:
models["cv"].fit(X, y)

In [14]:
results_to_df(models["cv"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.245881,0.02502,"{'class_weight': {0: 0.05, 1: 0.95}}",0.583578,1.0,0.408355,0.577508
1,0.24389,0.029362,"{'class_weight': {0: 0.1, 1: 0.9}}",0.857572,0.993522,0.672054,0.797806
2,0.213413,0.021452,"{'class_weight': {0: 0.15, 1: 0.85}}",0.947578,0.953495,0.873491,0.910249
3,0.207196,0.025998,"{'class_weight': {0: 0.2, 1: 0.8}}",0.92088,0.81059,0.911922,0.854124
4,0.210212,0.01772,"{'class_weight': {0: 0.3, 1: 0.7}}",0.886261,0.670723,0.926181,0.772284
5,0.182735,0.018266,"{'class_weight': {0: 0.35, 1: 0.65}}",0.874394,0.623264,0.931013,0.740524
6,0.186102,0.015621,"{'class_weight': {0: 0.4, 1: 0.6}}",0.863498,0.572181,0.940615,0.706592
7,0.189277,0.02366,"{'class_weight': {0: 0.45, 1: 0.55}}",0.852607,0.528299,0.941541,0.67323
8,0.1781,0.019523,"{'class_weight': {0: 0.5, 1: 0.5}}",0.844691,0.498075,0.949772,0.648507


- Using Bag of words
- Using dimentiality reduction.
- Standard Scaler.
- try different class weights to handle the problem of imbalanced data.

In [18]:
models["cv_svd"] = create_model(estimator,feature_extractor=CountVectorizer(),with_svd=True)

In [19]:
models["cv_svd"].fit(X,y)

In [20]:
results_to_df(models["cv_svd"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.025452,0.019473,"{'class_weight': {0: 0.05, 1: 0.95}}",0.737926,0.972737,0.525688,0.680223
1,0.020105,0.019231,"{'class_weight': {0: 0.1, 1: 0.9}}",0.815095,0.95987,0.61865,0.74825
2,0.021213,0.019313,"{'class_weight': {0: 0.15, 1: 0.85}}",0.854631,0.956422,0.677991,0.789951
3,0.02102,0.019288,"{'class_weight': {0: 0.2, 1: 0.8}}",0.872428,0.950452,0.709216,0.809218
4,0.024733,0.026302,"{'class_weight': {0: 0.3, 1: 0.7}}",0.904082,0.946285,0.771544,0.848066
5,0.02971,0.018962,"{'class_weight': {0: 0.35, 1: 0.65}}",0.914944,0.933418,0.802623,0.861627
6,0.023535,0.019484,"{'class_weight': {0: 0.4, 1: 0.6}}",0.924831,0.926522,0.831011,0.874752
7,0.021097,0.01969,"{'class_weight': {0: 0.45, 1: 0.55}}",0.924821,0.906988,0.844073,0.872995
8,0.022458,0.019232,"{'class_weight': {0: 0.5, 1: 0.5}}",0.916895,0.866529,0.850199,0.856479


- Tfidf Vectorizer.
- no dimentiality reduction.
- MaxAbsScaler
- trying different weights.

In [21]:
models["tfidf"] = create_model(estimator,feature_extractor=TfidfVectorizer(),with_svd=False)

In [22]:
models["tfidf"].fit(X, y)

In [23]:
results_to_df(models["tfidf"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.117262,0.014064,"{'class_weight': {0: 0.05, 1: 0.95}}",0.794254,1.0,0.58433,0.734775
1,0.161831,0.023083,"{'class_weight': {0: 0.1, 1: 0.9}}",0.900102,1.0,0.744595,0.851157
2,0.138685,0.015393,"{'class_weight': {0: 0.15, 1: 0.85}}",0.937682,1.0,0.823664,0.901663
3,0.107142,0.012078,"{'class_weight': {0: 0.2, 1: 0.8}}",0.957465,0.993506,0.875211,0.929389
4,0.12934,0.012852,"{'class_weight': {0: 0.3, 1: 0.7}}",0.95846,0.930678,0.927432,0.928306
5,0.150454,0.019764,"{'class_weight': {0: 0.35, 1: 0.65}}",0.943623,0.852768,0.948611,0.895701
6,0.171396,0.019398,"{'class_weight': {0: 0.4, 1: 0.6}}",0.929776,0.783627,0.962945,0.861616
7,0.216722,0.024316,"{'class_weight': {0: 0.45, 1: 0.55}}",0.91791,0.736676,0.967371,0.834249
8,0.183782,0.024394,"{'class_weight': {0: 0.5, 1: 0.5}}",0.907028,0.694928,0.97116,0.808016


- Tfidf Vectorizer.
- dimentiality reduction.
- StandardScaler.
- trying different weights.

In [24]:
models["tfidf_svd"] = create_model(estimator,feature_extractor=TfidfVectorizer(),with_svd=True)

In [25]:
models["tfidf_svd"].fit(X, y)

In [26]:
results_to_df(models["tfidf_svd"]["estimator"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.023043,0.023567,"{'class_weight': {0: 0.05, 1: 0.95}}",0.894147,0.989082,0.733839,0.842103
1,0.030809,0.019038,"{'class_weight': {0: 0.1, 1: 0.9}}",0.91887,0.986005,0.785241,0.873645
2,0.018907,0.019681,"{'class_weight': {0: 0.15, 1: 0.85}}",0.934697,0.986005,0.821502,0.895516
3,0.01912,0.019286,"{'class_weight': {0: 0.2, 1: 0.8}}",0.942608,0.976275,0.848451,0.906747
4,0.020523,0.019481,"{'class_weight': {0: 0.3, 1: 0.7}}",0.956465,0.973198,0.886022,0.926618
5,0.019832,0.019391,"{'class_weight': {0: 0.35, 1: 0.65}}",0.95745,0.96543,0.893723,0.927453
6,0.021851,0.01916,"{'class_weight': {0: 0.4, 1: 0.6}}",0.958445,0.958926,0.901856,0.928751
7,0.05234,0.021453,"{'class_weight': {0: 0.45, 1: 0.55}}",0.962406,0.958926,0.913469,0.935024
8,0.019263,0.019305,"{'class_weight': {0: 0.5, 1: 0.5}}",0.96043,0.951778,0.912917,0.931282


- now we gonna try oversampling with the different preprocessing methods,here we'll use tow oversampling techniques.

In [27]:
_models = {}

In [28]:
_models["cv_smote"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=CountVectorizer(),
    with_svd=False,
    resampler=SMOTE(random_state=42)
)

In [29]:
_models["cv_svd_smote"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=CountVectorizer(),
    with_svd=True,
    resampler=SMOTE(random_state=42)
)

In [30]:
_models["tfidf_smote"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=TfidfVectorizer(),
    with_svd=False,
    resampler=SMOTE(random_state=42)
)

In [31]:
_models["tfidf_svd_smote"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=TfidfVectorizer(),
    with_svd=True,
    resampler=SMOTE(random_state=42)
)

In [32]:
_models["cv_adasyn"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=CountVectorizer(),
    with_svd=False,
    resampler=ADASYN(random_state=42)
)

In [33]:
_models["cv_svd_adasyn"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=CountVectorizer(),
    with_svd=True,
    resampler=ADASYN(random_state=42)
)

In [34]:
_models["tfidf_adasyn"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=TfidfVectorizer(),
    with_svd=False,
    resampler=ADASYN(random_state=42)
)

In [35]:
_models["tfidf_svd_adasyn"] = create_model(
    estimator=LogisticRegression(),
    feature_extractor=TfidfVectorizer(),
    with_svd=True,
    resampler=ADASYN(random_state=42)
)

In [36]:
evaluation_df = evaluate_cv(_models,scoring,cv=KFold(n_splits=5, shuffle=True),X=X, y=y)

In [37]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv_smote,0.908211,0.084846,0.923865,0.858635,0.909464,0.820976
cv_svd_smote,0.676897,0.078691,0.91592,0.86639,0.803644,0.942691
tfidf_smote,0.832528,0.076019,0.971316,0.94994,0.942993,0.959118
tfidf_svd_smote,0.629916,0.079904,0.950524,0.917192,0.883949,0.953493
cv_adasyn,1.70613,0.059784,0.941667,0.896671,0.929897,0.86792
cv_svd_adasyn,0.647888,0.0814,0.909999,0.859313,0.78439,0.952489
tfidf_adasyn,1.726189,0.065272,0.970317,0.948619,0.942408,0.956119
tfidf_svd_adasyn,0.674651,0.078251,0.955499,0.924813,0.901873,0.951584


- choosing the best model overall

In [38]:
__models = {}

In [39]:
__models["cv"] = create_model(
    LogisticRegression(**models["cv"]["estimator"].best_params_),
    CountVectorizer(),
    with_svd=False,
    resampler=None
)

In [40]:
__models["cv_svd"] = create_model(
    LogisticRegression(**models["cv_svd"]["estimator"].best_params_),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [41]:
__models["tfidf"] = create_model(
    LogisticRegression(**models["tfidf"]["estimator"].best_params_),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [42]:
__models["tfidf_svd"] = create_model(
    LogisticRegression(**models["tfidf_svd"]["estimator"].best_params_),
    CountVectorizer(),
    with_svd=True,
    resampler=None
)

In [43]:
__models["with_oversampling"] = _models[evaluation_df.index[evaluation_df["f1_score"].argmax()]]

In [44]:
evaluation_df = evaluate_cv(__models,scoring,cv=KFold(n_splits=5, shuffle=True),X=X, y=y)

In [45]:
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv,0.260647,0.085632,0.879325,0.743623,0.955639,0.609669
cv_svd,0.643494,0.068994,0.935687,0.892471,0.866972,0.921234
tfidf,0.632549,0.075859,0.935683,0.890865,0.862758,0.923105
tfidf_svd,0.656115,0.075086,0.935731,0.889423,0.879993,0.89965
with_oversampling,0.820617,0.07803,0.96837,0.944252,0.935572,0.953707


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

- After trying several feature extraction and balancing techniques the best model the one with:
    - Tfidf as its feature extraction technique.
    - without dimentiality reduction.
    - using oversampling with SMOTE.
    - an f1_score of 0.94.

<div id="save_the_best_model" >
    <h3>Save the best model</h3>
</div>

In [46]:
dump(value=__models[evaluation_df.index[evaluation_df["f1_score"].argmax()]],filename=os.path.join("..","..","models","ssl","lr.joblib"))

['../../models/ssl/lr.joblib']