# Modelling

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Base directory that every CSV table below is read from (joined by plain string
# concatenation, hence the trailing slash).
# NOTE(review): presumably a mounted Google Drive folder — confirm the mount exists before running.
pth = "./drive/My Drive/DatosInsider/DWH_tables/"

In [0]:
# Read the resampled training features (x) and labels (y) from the table directory.
train_x, train_y = (
    pd.read_csv(pth + csv_name)
    for csv_name in ("ins_train_x_resampled.csv", "ins_train_y_resampled.csv")
)

In [0]:
# "period" is removed from the training features before scaling and fitting.
train_x = train_x.drop(columns=["period"])

In [0]:
# Standardise the training features. The fitted scaler (std_model) is kept
# because later cells reuse it to transform the test and out-of-time data.
from sklearn.preprocessing import StandardScaler

std_model = StandardScaler().fit(train_x)
# Wrapped in a DataFrame so later cells can pick columns with .loc and a boolean mask.
train_x_scaled = pd.DataFrame(std_model.transform(train_x))

In [0]:
%%time
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier


estimator = SGDClassifier(max_iter=10000, tol = 1e-3, random_state = 666)
selector = RFECV(estimator, step=1, cv=5,)
selector = selector.fit(train_x_scaled, np.ravel(train_y))
selector.support_



CPU times: user 1min 37s, sys: 37.9 s, total: 2min 14s
Wall time: 1min 30s


In [0]:
# Boolean mask produced by RFECV: True marks a feature that was kept.
# Entries are positional, i.e. they line up with the columns of train_x_scaled.
selector.support_


array([ True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True, False, False,  True,
       False, False,  True, False,  True, False, False, False,  True,
        True, False, False, False,  True,  True,  True, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False, False, False,  True,  True,
        True, False, False,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True,  True])

In [0]:
# Translate the RFECV mask back into the original column names of train_x.
selected_vars = train_x.columns[selector.get_support()]
selected_vars

Index(['leak_risk_cnt', 'leak_we_cnt', 'leak_off_hr_cnt', 'thief_we_cnt',
       'thief_off_hr_cnt', 'sabotage_risk_cnt', 'sabotage_off_hr_cnt', 'adv',
       'astronomy', 'banking', 'cars', 'chat', 'dating', 'downloads', 'gamble',
       'games-misc', 'games-online', 'government', 'hacking', 'imagehosting',
       'leaks', 'lingerie', 'news', 'other', 'radiotv', 'realestate',
       'redirector', 'remotecontrol', 'restaurants', 'searchengines',
       'shipping', 'spyware', 'telecommunication', 'tracker', 'travel',
       'updatesites', 'urlshortener', 'warez', 'webmail', 'webradio', 'webtv',
       'wellness', 'device_count', 'file_count', 'min_logon', 'avg_min_logon',
       'med_min_logon', 'med_cnt_logon', 'max_logoff', 'avg_max_logoff',
       'med_max_logoff', 'med_cnt_logoff', 'avg_logged_time',
       'med_logged_time', 'cnt_unauthorized_log', 'isSupervisor',
       'mean_salary', 'gender'],
      dtype='object')

# SGDClassifier


In [0]:
# Baseline SGD classifier, trained only on the columns RFECV selected.
model = SGDClassifier(max_iter=10000, tol=1e-3, random_state=666)
model.fit(X=train_x_scaled.loc[:, selector.support_], y=np.ravel(train_y))

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=10000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=666, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
# Mean accuracy on the training data itself — the model was fitted on these same
# rows, so this is an optimistic figure rather than a generalisation estimate.
model.score(train_x_scaled.loc[:,selector.support_], np.ravel(train_y))

0.8871352333371182

# Evaluation on the test set

In [0]:
# Hold-out test set. The features lose "period" exactly like the training
# features; the label file is read with header=None, so its first row is data.
test_x = pd.read_csv(pth + "ins_test_x.csv").drop(columns=["period"])
test_y = pd.read_csv(pth + "ins_test_y.csv", header=None)

In [0]:
# Scale the test features with the scaler fitted on the training data
# (transform only — the scaler is not refitted on the test set).
test_x_scaled = pd.DataFrame(std_model.transform(test_x))

In [0]:
# Test-set predictions of the baseline SGD model, using the RFECV-selected columns only.
y_pred = model.predict(test_x_scaled.loc[:,selector.support_])

In [0]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the baseline SGD model on the test set.
print(classification_report(y_true=test_y, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.91      0.94      3775
         1.0       0.51      0.81      0.62       418

    accuracy                           0.90      4193
   macro avg       0.74      0.86      0.78      4193
weighted avg       0.93      0.90      0.91      4193



In [0]:
from sklearn.metrics import recall_score
# Macro-averaged recall of the baseline SGD model on the test set: the unweighted
# mean of the per-class recalls, so both classes count equally.
recall_score(test_y, y_pred, average="macro")

0.862058683735226

In [0]:
##hyperfitting 
%%time
from sklearn.model_selection import GridSearchCV
import time

t0 = time.time()

param_grid = {'loss': ['hinge', 'log', 'squared_hinge', 'perceptron'],
              'penalty': ['l2', 'l1', 'elasticnet'], 
              'n_jobs' : [-1],
              'random_state' : [666]
              }
clf = GridSearchCV(
    SGDClassifier(), param_grid
)

clf = clf.fit(train_x_scaled.loc[:,selector.support_], np.ravel(train_y))
print("done in %0.3fs" % (time.time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

done in 32.266s
Best estimator found by grid search:
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l1',
              power_t=0.5, random_state=666, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
CPU times: user 33.3 s, sys: 6.07 s, total: 39.4 s
Wall time: 32.3 s


In [0]:
# Test-set report for the grid-searched SGD classifier.
y_pred_clf = clf.predict(test_x_scaled.loc[:, selector.support_])
print(classification_report(y_true=test_y, y_pred=y_pred_clf))

              precision    recall  f1-score   support

         0.0       0.97      0.92      0.94      3775
         1.0       0.51      0.78      0.62       418

    accuracy                           0.90      4193
   macro avg       0.74      0.85      0.78      4193
weighted avg       0.93      0.90      0.91      4193



# Out of time test

In [0]:
# Out-of-time (OOT) evaluation data: a single CSV holding both features and label.
insider_oot = pd.read_csv(pth + "insider_OOT.csv")

# Separate the label and drop "period", mirroring how the training features were prepared.
test_x_oot = insider_oot.drop(columns=["period", "isInsider"])
test_y_oot = insider_oot["isInsider"]

# Reuse the scaler fitted on the training data (transform only, no refit).
# NOTE(review): assumes the remaining OOT columns are in the same order as train_x — confirm.
test_x_oot_scaled = pd.DataFrame(std_model.transform(test_x_oot))

In [0]:
# Out-of-time predictions from the grid-searched SGD classifier (clf), RFECV-selected columns only.
y_pred_oot = clf.predict(test_x_oot_scaled.loc[:,selector.support_])

In [0]:
# Per-class precision / recall / F1 of the grid-searched SGD classifier on the out-of-time data.
print(classification_report(test_y_oot,y_pred_oot))

              precision    recall  f1-score   support

         0.0       1.00      0.83      0.90      2557
         1.0       0.07      1.00      0.14        35

    accuracy                           0.83      2592
   macro avg       0.54      0.91      0.52      2592
weighted avg       0.99      0.83      0.89      2592



In [0]:
import pickle

# Directory the serialised artefacts are written to (joined by string concatenation).
mdlpth = "./drive/My Drive/DatosInsider/Models/"

# The scaler file stores the fitted StandardScaler (std_model), not the classifier,
# so the training-time scaling can be re-applied when the models are reloaded.
with open(mdlpth + "SGDClassifier_StandardScaler", 'wb') as fh:
    pickle.dump(std_model, fh)

# Baseline SGD classifier.
with open(mdlpth + "SGDClassifier_Default.model", 'wb') as fh:
    pickle.dump(model, fh)

# Grid-searched SGD classifier (the whole GridSearchCV object).
with open(mdlpth + "SGDClassifier_GridSearch.model", 'wb') as fh:
    pickle.dump(clf, fh)

# XGBoost

In [0]:
# Install XGBoost via a shell call so the import in the next cell succeeds.
# NOTE(review): the version is not pinned — consider pinning it for reproducible results.
!pip install xgboost



In [0]:
from xgboost import XGBClassifier

In [0]:
# XGBoost baseline (only the seed is set), trained on the same scaled,
# RFECV-selected columns as the SGD models.
model_xgb = XGBClassifier(random_state=666)
model_xgb.fit(X=train_x_scaled.loc[:, selector.support_], y=np.ravel(train_y))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=666,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
# `time` and `GridSearchCV` are imported in the SGD grid-search cell further up.
t0 = time.time()

# Search over booster type and maximum tree depth (3 x 4 = 12 candidates).
param_grid = {
    'booster': ['gbtree', 'gblinear', 'dart'],
    'max_depth': [3, 4, 5, 6],
}
clf_xgboost = GridSearchCV(XGBClassifier(), param_grid).fit(
    train_x_scaled.loc[:, selector.support_], np.ravel(train_y)
)

print("done in %0.3fs" % (time.time() - t0))
print("Best estimator found by grid search:")
print(clf_xgboost.best_estimator_)

done in 257.076s
Best estimator found by grid search:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


# Evaluation on the test set


In [0]:
# Test-set predictions of the baseline XGBoost model (model_xgb), RFECV-selected columns only.
y_pred_xg = model_xgb.predict(test_x_scaled.loc[:,selector.support_])

In [0]:
# Per-class precision / recall / F1 of the baseline XGBoost model on the test set.
print(classification_report(test_y,y_pred_xg))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      3775
         1.0       0.90      0.72      0.80       418

    accuracy                           0.96      4193
   macro avg       0.93      0.85      0.89      4193
weighted avg       0.96      0.96      0.96      4193



In [0]:
# Test-set report for the grid-searched XGBoost model.
y_pred_xg_clf = clf_xgboost.predict(test_x_scaled.loc[:, selector.support_])
print(classification_report(y_true=test_y, y_pred=y_pred_xg_clf))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3775
         1.0       0.98      0.83      0.90       418

    accuracy                           0.98      4193
   macro avg       0.98      0.91      0.94      4193
weighted avg       0.98      0.98      0.98      4193



# Out of time test

In [0]:
# Out-of-time report for the baseline XGBoost model (model_xgb, not the grid-searched one).
y_pred_oot_xgb = model_xgb.predict(test_x_oot_scaled.loc[:, selector.support_])
print(classification_report(y_true=test_y_oot, y_pred=y_pred_oot_xgb))

              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96      2557
         1.0       0.14      0.94      0.24        35

    accuracy                           0.92      2592
   macro avg       0.57      0.93      0.60      2592
weighted avg       0.99      0.92      0.95      2592



In [0]:
# Persist both XGBoost models in the model directory.
# `pickle` and `mdlpth` come from the cell that saves the SGD artefacts.
# The path is built from the directory string, and each file stores the XGBoost
# object its name refers to; `with` closes the file once the dump is complete.
with open(mdlpth + "XGBoost.model", 'wb') as fh:
    pickle.dump(model_xgb, fh)

with open(mdlpth + "xgboost_GridSearch.model", 'wb') as fh:
    pickle.dump(clf_xgboost, fh)