# Algorithm search

Now it's time to select the most appropriate algorithm for the problem. A good principle is to start with the simplest one and work your way up to more complex ones if the results are not satisfactory.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import random

In [2]:
from scipy.sparse import load_npz, hstack

# A feature column is kept only if its total over all rows is strictly
# greater than this threshold (columns summing to min_count or less are dropped).
min_count = 5

# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
emails = pd.read_pickle('./data/emails.pkl')

# One sparse matrix per feature family. The order of this list fixes the
# column order of the stacked matrix.
feature_files = [
    './data/subjects_BoW.npz',
    './data/contents_BoW.npz',
    './data/FromUsers.npz',
    './data/ToUsers.npz',
    './data/FromDomains.npz',
    './data/ToDomains.npz',
]

# Load each matrix and immediately discard its rare columns
feature_blocks = []
for feature_file in feature_files:
    block = load_npz(feature_file)
    keep = block.sum(0).A[0] > min_count
    feature_blocks.append(block[:, keep])

# Stack the data altogether (column-wise) into a single CSR matrix of floats
processed_data = hstack(feature_blocks, format='csr', dtype=float)

# Release the per-family matrices, only the stacked matrix is needed from here on
del feature_blocks, block, keep

processed_data.shape

(6362, 18993)

Data shuffling and creation of test set

In [3]:
# Take the number of samples from the feature matrix itself rather than
# hard-coding it, so this cell keeps working if the dataset changes.
n_samples = processed_data.shape[0]

# Labels are taken positionally: the first n_samples labels are assumed to be
# aligned with the rows of the feature matrix -- TODO confirm against the
# preprocessing notebook.
X, y = processed_data, emails['label'].values[:n_samples]
assert y.shape[0] == n_samples, "fewer labels than feature rows"
del processed_data
del emails

# Reproducible random permutation of the row indexes
indexes = list(range(n_samples))
random.seed(1)
random.shuffle(indexes)

# Apply the same permutation to features and labels, then split in half:
# first half for training/validation, second half for the final test.
X, y = X[indexes], y[indexes]
cutoff = int(n_samples*0.5)

X_train_valid, y_train_valid = X[:cutoff], y[:cutoff]
X_test, y_test = X[cutoff:], y[cutoff:]

del X, y

# Logistic Regression

Logistic regression was selected as the learning algorithm for its low complexity and its very good score.

In [4]:
# Build the model; class_weight='balanced' is requested so that the less
# represented class is not neglected during training.
LR = LogisticRegression(class_weight='balanced')

# Fit on the whole train/validation half (no separate validation set is used here)
LR.fit(X=X_train_valid, y=y_train_valid)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

## Results

A good start for scoring a classifier is to analyse its precision, recall and f1-score, which is the harmonic mean of the two. Indeed, the classes here are a bit imbalanced, so a classifier could reach a good accuracy by just predicting the most represented class.

The f1-score usually gives a balanced scoring by giving as much importance to the minority class as to the majority one.

In [8]:
# Predict class labels for the held-out test half
predictions = LR.predict(X_test)

# Per-class precision / recall / f1-score of the logistic regression
lr_report = classification_report(y_test, predictions)
print(lr_report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2435
          1       0.99      1.00      1.00       746

avg / total       1.00      1.00      1.00      3181



In this case, the algorithm seems to be perfectly fine for the problem. We could add some boosting to grab the last bit of precision, but it would cost more computation at prediction time, so more money overall. Plus, logistic regression is easy to parallelize if need be.

## Boosting

In [5]:
from xgboost import XGBClassifier

In [6]:
def BoostedClassifier(X_train_, y_train_, X_valid_, y_valid_):
    """Train an XGBoost binary classifier and track its error on two sets.

    Parameters
    ----------
    X_train_, y_train_
        Features and labels the model is fitted on.
    X_valid_, y_valid_
        Features and labels only used for monitoring; they are passed as the
        second entry of ``eval_set``.

    Returns
    -------
    The fitted ``XGBClassifier``. The 'error' metric is evaluated on both
    (train, validation) sets during fitting, in that order.
    """
    # Extreme Gradient Boosted classifier with a binary logistic objective
    hyperparameters = dict(
        n_estimators=200,
        max_depth=4,
        objective="binary:logistic",
        learning_rate=.1,
        subsample=.8,
        colsample_bytree=.8,
        gamma=1,
        reg_alpha=0,
        reg_lambda=1,
        n_jobs=-1,
    )
    clf = XGBClassifier(**hyperparameters)

    # Sets monitored during training: training set first, validation set second
    monitored_sets = [(X_train_, y_train_), (X_valid_, y_valid_)]

    # Train with the 'error' metric and without early stopping.
    # PS: set verbose=True to follow the training progress.
    clf.fit(
        X_train_,
        y_train_,
        eval_set=monitored_sets,
        eval_metric='error',
        early_stopping_rounds=None,
        verbose=False,
    )

    return clf

In [7]:
# Split the train/validation pool 90% / 10%: the first 90% of the rows are
# used for training, the remaining 10% for validation.
cutoff = int(X_train_valid.shape[0]*0.9)
X_train, X_valid = X_train_valid[:cutoff], X_train_valid[cutoff:]
y_train, y_valid = y_train_valid[:cutoff], y_train_valid[cutoff:]

# Train the boosted classifier, monitoring it on the validation set
xgb = BoostedClassifier(X_train, y_train, X_valid, y_valid)

In [9]:
# Validation error recorded after each boosting round ("validation_1" is the
# second evaluation set given at fit time, i.e. the validation set).
xgb_evals = xgb.evals_result_["validation_1"]['error']

# 'error' is a lower-is-better metric, so the best round is the one with the
# MINIMUM error (the earliest such round in case of ties, i.e. fewest trees).
best_round = int(np.argmin(xgb_evals))

# Predict class labels on the test set using the trees up to the best round.
# best_round is a 0-based index while ntree_limit is a number of trees, hence
# the +1 (this also avoids ntree_limit=0, which would mean "use all trees").
predictions = xgb.predict(X_test, ntree_limit=best_round + 1)

  if diff:


In [10]:
# Per-class precision / recall / f1-score of the boosted classifier on the test set
xgb_report = classification_report(y_test, predictions)
print(xgb_report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2435
          1       1.00      1.00      1.00       746

avg / total       1.00      1.00      1.00      3181

