# Modeling & Training

## KNN

## Outline
- [Necessary packages](#necessary_packages)
- [Data Loading](#data_loading)
- [Modeling and training](#modeling_and_training)
- [Conclusion](#conclusion)
- [Save the best model](#save_the_best_model)

<div id="necessary_packages" >
    <h3>Necessary packages</h3>
</div>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MaxAbsScaler,FunctionTransformer,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate,GridSearchCV,KFold,train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,auc,confusion_matrix,make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from joblib import dump

<div id="data_loading" >
    <h3>Data Loading</h3>
</div>

In [2]:
# Read the cleaned dataset (path is relative to this notebook) and replace
# every missing value with an empty string.
path = os.path.join("..", "..", "data", "clean_df.csv")
df = pd.read_csv(path, encoding="iso-8859-1").fillna("")

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['class', 'content', 'urls_count', 'digits_count',
       'contains_currency_symbols', 'length'],
      dtype='object')

<div id="modeling_and_training" >
    <h3>Modeling and training</h3>
</div>

In [4]:
# Features are the raw text in "content"; the target is "class".
# Rows whose class equals -1 are dropped from both series.
# NOTE(review): -1 presumably marks unlabeled samples — confirm against the
# data-preparation step.
X, y = (df.loc[df["class"] != -1, column] for column in ("content", "class"))

In [5]:
def results_to_df(results):
    """Condense a grid-search results mapping into a summary DataFrame.

    Parameters
    ----------
    results : mapping
        Mapping that contains at least the timing, ``params`` and
        ``mean_test_*`` entries listed below (e.g. ``GridSearchCV.cv_results_``).

    Returns
    -------
    pandas.DataFrame
        One column per selected entry, in the order listed below.

    Raises
    ------
    KeyError
        If any of the selected entries is missing from ``results``.
    """
    selected = (
        "mean_fit_time",
        "mean_score_time",
        "params",
        "mean_test_accuracy",
        "mean_test_recall",
        "mean_test_precision",
        "mean_test_f1_score",
    )
    return pd.DataFrame({column: results[column] for column in selected})

In [6]:
# Metrics computed for every cross-validated run, keyed by the short name
# that later labels the result columns.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ("accuracy", accuracy_score),
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
}

In [7]:
# Hyper-parameter grid for the KNN classifier. The "estimator__" prefix
# addresses the pipeline step named "estimator".
params = dict(
    estimator__n_neighbors=list(range(3, 14, 2)),
    estimator__weights=["uniform", "distance"],
    estimator__p=[1, 2],
)

In [8]:
def evaluate_cv(models,metrics,cv,X,y):
    """Cross-validate several models and tabulate their mean timings and scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_validate``.
    metrics : dict
        Maps a metric name to a scorer; forwarded as ``scoring``.
    cv
        Cross-validation splitter (or fold count) forwarded to ``cross_validate``.
    X, y
        Samples and targets forwarded to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by its name) with the columns
        ``fit_time``, ``score_time`` and one column per metric name, each
        holding the mean over the folds.
    """
    metric_names = list(metrics)
    columns = ["fit_time", "score_time"] + metric_names
    # cross_validate reports each scorer under "test_<name>". Looking the
    # entries up by key (instead of relying on the dict's iteration order)
    # guarantees every value lands under the matching column label and that
    # any additional entries in the result are ignored.
    result_keys = ["fit_time", "score_time"] + [f"test_{name}" for name in metric_names]

    rows = []
    for model in models.values():
        results = cross_validate(model, X, y, cv=cv, scoring=metrics)
        rows.append([results[key].mean() for key in result_keys])

    return pd.DataFrame(data=rows, index=list(models), columns=columns)

In [9]:
# Registry of the grid-search pipelines, keyed by a short variant name.
models = {}

In [10]:
# Bag-of-words counts -> grid search over (SMOTE -> MaxAbs scaling -> KNN).
# NOTE(review): the vectoriser sits outside GridSearchCV, so it is fitted on
# all of X before the folds are split; the validation folds therefore share
# the vocabulary learned from the whole dataset.
models["knn_cv"] = Pipeline(steps=[
    ("cv", CountVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",MaxAbsScaler()),
            ("estimator",KNeighborsClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        # Seed the shuffled split so the fold assignment, and therefore the
        # selected hyper-parameters, are reproducible across runs.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [11]:
# Fit the vectoriser and run the grid search on the selected samples.
models["knn_cv"].fit(X, y)

In [12]:
# Tabulate timings and mean test metrics for each parameter combination.
results_to_df(models["knn_cv"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.045533,0.313029,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.492586,0.992904,0.359799,0.52722
1,0.029085,0.138798,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.511389,0.992904,0.368559,0.536695
2,0.028889,0.749595,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.591484,0.940235,0.40612,0.566687
3,0.028884,0.756656,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.60138,0.940235,0.412195,0.572681
4,0.028808,0.142731,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.465873,1.0,0.348719,0.516149
5,0.028858,0.137203,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.486656,1.0,0.35775,0.526137
6,0.029551,0.750797,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.541038,0.970234,0.380763,0.546004
7,0.02876,0.743561,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.546978,0.970234,0.383797,0.549218
8,0.028706,0.142799,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.451027,1.0,0.342684,0.509378
9,0.028816,0.137244,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.466863,1.0,0.349099,0.516603


In [13]:
# TF-IDF features -> grid search over (SMOTE -> MaxAbs scaling -> KNN).
# NOTE(review): the vectoriser sits outside GridSearchCV, so it is fitted on
# all of X before the folds are split.
models["knn_tfidf"] = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",MaxAbsScaler()),
            ("estimator",KNeighborsClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        # Seed the shuffled split so the fold assignment, and therefore the
        # selected hyper-parameters, are reproducible across runs.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [14]:
# Fit the TF-IDF vectoriser and run the grid search on the selected samples.
models["knn_tfidf"].fit(X, y)

In [15]:
# Tabulate timings and mean test metrics for each parameter combination.
results_to_df(models["knn_tfidf"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.56345,0.071266,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.470912,0.803509,0.330261,0.402967
1,0.565015,0.066573,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.471897,0.803509,0.330606,0.403365
2,0.565444,0.839705,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.549998,0.961404,0.403822,0.559699
3,0.619416,0.92767,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.561845,0.961404,0.409504,0.565734
4,0.667552,0.079979,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.470892,0.803509,0.462586,0.402173
5,0.57058,0.066903,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.472867,0.803509,0.46334,0.403017
6,0.565977,0.840972,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.517368,0.989474,0.384363,0.549113
7,0.564104,0.834849,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.528245,0.989474,0.389063,0.554235
8,0.564088,0.072924,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.473862,0.810526,0.41339,0.415832
9,0.564403,0.066731,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.476828,0.810526,0.414496,0.417083


In [16]:
# Bag-of-words counts -> 100-component truncated SVD -> grid search over
# (SMOTE -> standard scaling -> KNN).
# NOTE(review): the vectoriser and the SVD sit outside GridSearchCV, so both
# are fitted on all of X before the folds are split.
models["knn_cv_svd"] = Pipeline(steps=[
    ("cv",CountVectorizer()),
    # Seeded so the reduced feature space is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",StandardScaler()),
            ("estimator",KNeighborsClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        # Seed the shuffled split so the fold assignment, and therefore the
        # selected hyper-parameters, are reproducible across runs.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [17]:
# Fit the vectoriser and the SVD, then run the grid search on the selected samples.
models["knn_cv_svd"].fit(X, y)

In [18]:
# Tabulate timings and mean test metrics for each parameter combination.
results_to_df(models["knn_cv_svd"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.018441,0.046274,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.83976,0.938769,0.654975,0.770691
1,0.007413,0.032406,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.840745,0.938769,0.656591,0.771789
2,0.007182,0.01162,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.848671,0.940758,0.669167,0.781142
3,0.007415,0.01196,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.849656,0.940758,0.670834,0.782252
4,0.007385,0.043651,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.829864,0.950021,0.638679,0.762602
5,0.007534,0.034371,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.829864,0.950021,0.638679,0.762602
6,0.009216,0.014572,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.833819,0.947427,0.644579,0.766159
7,0.009256,0.014769,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.834805,0.947427,0.646195,0.767258
8,0.00929,0.049978,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.808135,0.950021,0.608939,0.740665
9,0.008008,0.039946,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.811101,0.950021,0.612364,0.743391


In [19]:
# TF-IDF features -> 100-component truncated SVD -> grid search over
# (SMOTE -> standard scaling -> KNN).
# NOTE(review): the vectoriser and the SVD sit outside GridSearchCV, so both
# are fitted on all of X before the folds are split.
models["knn_tfidf_svd"] = Pipeline(steps=[
    # The step is named "cv" although it holds a TF-IDF vectoriser; the name
    # is kept unchanged so any lookup by step name keeps working.
    ("cv",TfidfVectorizer()),
    # Seeded so the reduced feature space is the same on every run.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("grid_search", GridSearchCV(
        estimator=Pipeline(steps=[
            ("oversampler", SMOTE(random_state=48)),
            ("scaler",StandardScaler()),
            ("estimator",KNeighborsClassifier())
        ]),
        param_grid=params,
        scoring=scoring,
        # Seed the shuffled split so the fold assignment, and therefore the
        # selected hyper-parameters, are reproducible across runs.
        refit="f1_score",cv=KFold(n_splits=5, shuffle=True, random_state=48))
    )
])

In [20]:
# Fit the TF-IDF vectoriser and the SVD, then run the grid search on the selected samples.
models["knn_tfidf_svd"].fit(X, y)

In [21]:
# Tabulate timings and mean test metrics for each parameter combination.
results_to_df(models["knn_tfidf_svd"]["grid_search"].cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_recall,mean_test_precision,mean_test_f1_score
0,0.014785,0.055118,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.746818,0.996364,0.532543,0.693881
1,0.008298,0.036473,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.754729,0.996364,0.540612,0.700672
2,0.007904,0.012368,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.754699,0.996364,0.5399,0.700245
3,0.007967,0.012515,"{'estimator__n_neighbors': 3, 'estimator__p': ...",0.75865,0.996364,0.54389,0.703594
4,0.008429,0.049592,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.657796,1.0,0.457766,0.627445
5,0.008293,0.035342,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.675604,1.0,0.47104,0.639787
6,0.007774,0.012625,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.670673,1.0,0.467276,0.63638
7,0.007811,0.012143,"{'estimator__n_neighbors': 5, 'estimator__p': ...",0.682539,1.0,0.476328,0.644752
8,0.007786,0.045854,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.60139,1.0,0.41946,0.590695
9,0.008417,0.035813,"{'estimator__n_neighbors': 7, 'estimator__p': ...",0.622158,1.0,0.432574,0.60356


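- Choose the best model overall: rebuild each pipeline with the best hyper-parameters found by its grid search, then compare the rebuilt pipelines with cross-validation.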

In [22]:
def get_best_params(pipeline):
    """Return the best grid-search parameters with the step prefix removed.

    Parameters
    ----------
    pipeline
        Object whose ``"grid_search"`` item exposes a ``best_params_``
        mapping (e.g. a fitted pipeline with a ``GridSearchCV`` step).

    Returns
    -------
    dict
        The best parameters keyed by the name that follows the first ``"__"``
        (``"estimator__n_neighbors"`` -> ``"n_neighbors"``). Keys without a
        ``"__"`` separator are returned unchanged.
    """
    best = pipeline["grid_search"].best_params_
    stripped = {}
    for key, value in best.items():
        # Split on the FIRST "__" only, so nested parameter names such as
        # "step__a__b" keep their full remainder ("a__b") instead of being
        # truncated, and an un-prefixed key does not raise IndexError.
        _step, separator, name = key.partition("__")
        stripped[name if separator else key] = value
    return stripped

In [24]:
# Print the winning hyper-parameters of every grid search, in the order the
# pipelines were registered.
for fitted_pipeline in models.values():
    best_knn_params = get_best_params(fitted_pipeline)
    print(best_knn_params)

{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
{'n_neighbors': 11, 'p': 2, 'weights': 'distance'}
{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [25]:
# Registry of the final pipelines (no grid search), each built with the best
# KNN parameters of the corresponding search.
_models = {}

In [26]:
# Bag-of-words pipeline rebuilt with the best KNN settings found by the
# "knn_cv" grid search.
_models["cv"] = Pipeline(
    steps=[
        ("ct", CountVectorizer()),
        ("oversampler", SMOTE(random_state=48)),
        ("scaler", MaxAbsScaler()),
        (
            "estimator",
            KNeighborsClassifier(**get_best_params(models["knn_cv"])),
        ),
    ]
)

In [27]:
# Bag-of-words + truncated-SVD pipeline rebuilt with the best KNN settings
# found by the "knn_cv_svd" grid search.
_models["cv_svd"] = Pipeline(steps=[
    ("cv",CountVectorizer()),
    # Seeded so the reduced feature space (and the resulting scores) are
    # reproducible across runs.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",StandardScaler()),
    ("estimator",KNeighborsClassifier(**get_best_params(models["knn_cv_svd"])))
])

In [28]:
# TF-IDF pipeline rebuilt with the best KNN settings found by the
# "knn_tfidf" grid search. The vectoriser step keeps its original name "cv".
_models["tfidf"] = Pipeline(
    steps=[
        ("cv", TfidfVectorizer()),
        ("oversampler", SMOTE(random_state=48)),
        ("scaler", MaxAbsScaler()),
        (
            "estimator",
            KNeighborsClassifier(**get_best_params(models["knn_tfidf"])),
        ),
    ]
)

In [29]:
# TF-IDF + truncated-SVD pipeline rebuilt with the best KNN settings found by
# the "knn_tfidf_svd" grid search.
_models["tfidf_svd"] = Pipeline(steps=[
    ("cv",TfidfVectorizer()),
    # Seeded so the reduced feature space (and the resulting scores) are
    # reproducible across runs.
    ("dr",TruncatedSVD(n_components=100, random_state=48)),
    ("oversampler", SMOTE(random_state=48)),
    ("scaler",StandardScaler()),
    ("estimator",KNeighborsClassifier(**get_best_params(models["knn_tfidf_svd"])))
])

In [30]:
# Compare the tuned pipelines on one shared, seeded shuffled split so the
# comparison table is reproducible across runs.
evaluation_df = evaluate_cv(_models,scoring,cv=KFold(shuffle=True, random_state=48),X=X,y=y)

In [31]:
# Display the mean cross-validated timings and scores of each final pipeline.
evaluation_df

Unnamed: 0,fit_time,score_time,accuracy,f1_score,precision,recall
cv,0.11867,0.658512,0.607292,0.587133,0.422077,0.969131
cv_svd,0.582305,0.080213,0.850656,0.786228,0.667042,0.959531
tfidf,0.730144,0.848247,0.611379,0.57875,0.500622,0.905714
tfidf_svd,0.570592,0.08724,0.614281,0.597965,0.427232,1.0


<div id="conclusion" >
    <h3>Conclusion</h3>
</div>

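- The KNN algorithm performs poorly on this dataset: in the runs shown above, most configurations reach high recall but low precision.
- The count vectorizer combined with truncated SVD gives the best results (highest F1 score and accuracy in the comparison table above).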

<div id="save_the_best_model" >
    <h3>Save the best model to disk</h3>
</div>

In [32]:
# Select the pipeline with the highest mean cross-validated F1 score.
best_name = evaluation_df["f1_score"].idxmax()
best_model = _models[best_name]
# cross_validate fits clones of the estimator, so the pipeline stored in
# _models has not been trained yet; fit it on the selected samples so the
# saved artefact can be used for prediction right after loading.
best_model.fit(X, y)
model_path = os.path.join("..","..","models","ssl","knn.joblib")
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(value=best_model,filename=model_path)

['../../models/ssl/knn.joblib']