# Model tuning

In this notebook we report our experiments on the impact of various classifiers (e.g., SVM, random forest, boosting, logistic regression). For each classifier, we explain the procedure that was followed to tune its parameters and to prevent overfitting.

In [10]:
import pandas as pd
import numpy as np
from utils import Config
from utils.extract_features import *
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import svm
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Fixed seed so the cross-validation folds are identical across kernel restarts.
SEED = 42

config = Config("config/")

# Build the feature extractor; regenerate the preprocessed data only when the
# configuration asks for it.
E_F = features_dataset(config.preprocess_all)
if config.preprocess_all:
    E_F.prepocess_data()  # (sic) spelling kept as in the original call
train_features, training_labels, test_features = E_F.load_features_all()

# Standardisation statistics are computed on the training set only and then
# applied to both sets.
m = train_features.mean(axis=0)
std = train_features.std(axis=0)
if config.norm:
    # A constant feature has std == 0; dividing by it would fill the column
    # with NaN/inf. Dividing by 1 instead leaves such a column at zero after
    # centring. Assumes `std` is a NumPy array -- TODO confirm.
    safe_std = np.where(std == 0, 1.0, std)
    train_features = (train_features - m) / safe_std
    test_features = (test_features - m) / safe_std

# Keep only the feature columns selected in the configuration.
a_tester = config.features_a_tester
train_features = train_features[:, a_tester]
test_features = test_features[:, a_tester]

# shuffle=True without a random_state gives different folds on every run.
kf = StratifiedKFold(n_splits=config.num_split_cross_val, shuffle=True,
                     random_state=SEED)

# The evaluation helpers below index rows with .iloc, hence the DataFrames.
train_features = pd.DataFrame(train_features)
training_labels = pd.DataFrame(training_labels)


/media/benamira/19793564030D4273/MCsBackup/3A/OMA/NGSA/Assigment/kaggle_competition/utils


# Model

In [None]:
def train_predict_save(model, tr, val, y_tr, y_val):
    """Compute the F1 score of an already-fitted model on two data splits.

    Despite its name, this helper neither fits the model nor saves anything:
    it only calls ``model.predict`` and ``f1_score``.

    Args:
        model: fitted estimator exposing ``predict``.
        tr: features of the training split.
        val: features of the validation split.
        y_tr: true labels for ``tr``.
        y_val: true labels for ``val``.

    Returns:
        Tuple ``(training_f1, validation_f1)``.
    """
    # The training split is scored first, then the validation split.
    training_f1, validation_f1 = (
        f1_score(truth, model.predict(inputs))
        for truth, inputs in ((y_tr, tr), (y_val, val))
    )
    return training_f1, validation_f1

def eval_model(model, train_features, training_labels, idx):
    """Cross-validate ``model`` and collect its per-fold F1 scores.

    For every split produced by the module-level ``kf``, a random sample of
    the held-out fold, sized at 10% of the whole dataset, is used for
    validation. The rest of that fold is merged back into the training
    indices.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        train_features: feature table, indexed positionally through ``.iloc``.
        training_labels: labels aligned with ``train_features``. They are
            flattened to a 1-D array so estimators do not emit a
            column-vector warning (assumes a single label column).
        idx: position of the model in the caller's list. ``0`` selects the
            ``fit`` call with an evaluation set and early stopping.

    Returns:
        Tuple ``(predicts_t, predicts_v)``: training and validation F1 scores,
        one entry per fold.
    """
    predicts_t = []
    predicts_v = []
    labels = np.ravel(training_labels)

    for train_index, test_index in kf.split(train_features, labels):
        n = int(0.1 * (len(train_index) + len(test_index)))
        # permutation returns a shuffled copy, so the array yielded by the
        # splitter is not mutated and no extra `random` import is needed.
        shuffled = np.random.permutation(test_index)
        test_index_new = shuffled[:n]
        train_index_new = np.union1d(shuffled[n:], train_index)

        X_train = train_features.iloc[train_index_new]
        X_val = train_features.iloc[test_index_new]
        y_train = labels[train_index_new]
        y_val = labels[test_index_new]

        if idx == 0:
            # NOTE(review): `early_stopping_rounds` and `verbose` were removed
            # from LGBMClassifier.fit in LightGBM 4.0; on newer releases use
            # callbacks=[lgb.early_stopping(50)] instead.
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      early_stopping_rounds=50, verbose=None)
        else:
            model.fit(X_train, y_train)

        fscore_t, fscore_v = train_predict_save(model, X_train, X_val, y_train, y_val)
        predicts_t.append(fscore_t)
        predicts_v.append(fscore_v)

    return (predicts_t, predicts_v)


# One baseline per classifier family. The boosting hyper-parameters are read
# from the project configuration; the others use fixed or default settings.
modelGB = lgb.LGBMClassifier(
    objective='binary',
    reg_lambda=config.reg_lambda_gb,
    n_estimators=config.n_estimator_GB,
)
modelRF = RandomForestClassifier(n_estimators=500)
modelSVM = svm.LinearSVC()
modelL = LogisticRegression()

# Per-fold F1 scores keyed by the model's position in the tuple below.
# eval_model treats position 0 specially, so modelGB has to stay first.
res_t, res_v = {}, {}
candidate_models = (modelGB, modelRF, modelSVM, modelL)
for idx, model in enumerate(candidate_models):
    print(idx)
    fscore_t, fscore_v = eval_model(model, train_features, training_labels, idx)
    res_t[idx], res_v[idx] = fscore_t, fscore_v

0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


1




# GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LinearSVC.
# LinearSVC rejects penalty='l1' together with dual=True, so the grid is split
# in two to avoid scheduling candidates that can only fail.
tol_values = [1e-4, 1e-3, 1e-5]
c_values = [0.1, 1, 10]
param_grid = [
    {'penalty': ['l2'], 'dual': [True, False], 'tol': tol_values, 'C': c_values},
    {'penalty': ['l1'], 'dual': [False], 'tol': tol_values, 'C': c_values},
]
# Base model (the name `rf` is kept from the original cell; it holds an SVM here)
rf = svm.LinearSVC()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, return_train_score=True)
# Fit the grid search to the data. Labels are flattened to 1-D so the
# estimator does not warn about a column-vector y.
grid_search.fit(train_features, np.ravel(training_labels));
# A bare `grid_search.best_params_` in the middle of a cell is never shown,
# so print it explicitly.
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# Hyper-parameter grid for logistic regression, defined in this cell so it
# does not depend on a grid left over from another cell.
# Only the liblinear solver accepts both 'l1' and 'l2' penalties, and it
# implements the dual formulation for 'l2' only.
tol_values = [1e-4, 1e-3, 1e-5]
c_values = [0.1, 1, 10]
param_grid = [
    {'penalty': ['l2'], 'dual': [True, False], 'tol': tol_values, 'C': c_values},
    {'penalty': ['l1'], 'dual': [False], 'tol': tol_values, 'C': c_values},
]
# Base model (the name `rf` is kept from the original cell)
rf = LogisticRegression(solver='liblinear')
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, return_train_score=True)
# Fit the grid search to the data. Labels are flattened to 1-D so the
# estimator does not warn about a column-vector y.
grid_search.fit(train_features, np.ravel(training_labels));
# A bare `grid_search.best_params_` in the middle of a cell is never shown,
# so print it explicitly.
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# Only the number of trees is tuned for the random forest.
param_grid = {
    'n_estimators': [100, 500, 1000]
}
# Create a base model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, return_train_score=True)
# Fit the grid search to the data. Labels are flattened to 1-D so the
# estimator does not warn about a column-vector y.
grid_search.fit(train_features, np.ravel(training_labels));
# A bare `grid_search.best_params_` in the middle of a cell is never shown,
# so print it explicitly.
print(grid_search.best_params_)
print(grid_search.best_estimator_)