# Modelling

## Loading the processed dataset

In [2]:
# !pip install -U -q datalearn19intro
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datalearn19intro import get_processed_intro_dataset
%matplotlib inline

In [3]:
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 200)

In [4]:
df = get_processed_intro_dataset()

In [5]:
df.shape

(1001, 22)

In [6]:
df = df.set_index('account_id')

In [7]:
df.head(8)

Unnamed: 0_level_0,is_gmail_fromaccounts,collection_21_days,billed_users_count,log_max_tsize,os_android_avg,os_chrome_os_avg,os_ios_avg,os_linux_avg,os_mac_avg,total_events_sum,notification_events_sum,new_entry_events_sum,payment_events_sum,inbox_events_sum,communicating_events_sum,non_communicating_events_sum,web_events_sum,ios_events_sum,desktop_app_events_sum,empty_events_sum,lead_score
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2793496,1.0,0.0,0.0,0.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793497,1.0,0.0,0.0,0.57,0.45,0.0,0.55,0.0,0.0,0.01,0.04,0.02,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0
2793498,1.0,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793499,1.0,0.0,0.0,0.57,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793500,1.0,0.0,0.0,0.43,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793501,0.0,0.0,0.0,0.57,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793502,1.0,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2793503,1.0,0.0,0.0,0.57,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X = df.drop('lead_score', axis=1)

In [9]:
y = df['lead_score']

## Data split

In [10]:
from sklearn.model_selection import train_test_split

Let's split the data into train and test sets, at a 80/20 ratio.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model fit

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
clf = LogisticRegression()

In [34]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
y_pred = clf.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [37]:
accuracy_score(y_test, y_pred)

0.9701492537313433

OMG! That's amazing!

In [38]:
precision_score(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


0.0

In [39]:
recall_score(y_test, y_pred)

0.0

Oh no! Maybe our model wasn't as good as we thought!
What happened?

In [40]:
np.unique(y_pred, return_counts=True)

(array([0.]), array([201]))

In [41]:
np.unique(y, return_counts=True)

(array([0., 1.]), array([980,  21]))

In [42]:
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 97.01%
Precision: 0.00%
Recall: 0.00%


  'precision', 'predicted', average, warn_for)


Notice, we have 980 negative examples and only 21 positive ones.

In [43]:
980/1001 * 100

97.9020979020979

Our model was optimizing accuracy when fitting its parameters, and the easiest to do that is to simply predict 0 all the time for roughly 98% accuracy!

We can use `class_weight='balanced'` to each **class** equally important instead of each row/entry.

In [44]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)



In [45]:
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 68.66%
Precision: 7.46%
Recall: 83.33%


## Hyperparameter tuning

We can define, for each hyperparameter, a range of possible values.

For logistic regression here we will play with just two hyperparameters:

* `penalty` - The loss function use. Both L1 and L2 are common loss functions.
* `C` - Inverse of regularization strength; smaller values specify stronger regularization. Regularization can prevent overfitting, a concept which you'll discuss in the advanced workshops.

In [46]:
hyparam_grid = [{
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100, 1000],
    'class_weight': ['balanced'],
}]

We can optimize are hyperparameters for various metrics...

In [49]:
scores = ['precision', 'recall']

In [70]:
# silence annoying future warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [54]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(LogisticRegression(), hyparam_grid, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}

Grid scores on development set:

0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
0.515 (+/-0.036) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}
0.525 (+/-0.014) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}
0.508 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
0.521 (+/-0.024) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}
0.516 (+/-0.032) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}
0.513 (+/-0.031) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

         0.0       

We can now fit a model with the tune hyperparameters over the entire training set and test its performace on the test set. This will provide an estimate of performance which is likely to be (possibly substantially) optimistically biased.

A more advanced way to get estimates for hyperoptimization results - and a less biased one - is nested cross-validation; this is a more computationaly intensive method, and out of scope here, of course. :)

In [55]:
clf = LogisticRegression(class_weight='balanced', C=10, penalty='l2')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 68.66%
Precision: 7.46%
Recall: 83.33%


(note: this produced no difference in relation to the default hyperparameters)

# Trying other models is easy

## SVM

In [67]:
from sklearn.svm import SVC

In [58]:
clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 97.01%
Precision: 0.00%
Recall: 0.00%


In [60]:
clf = SVC(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 68.66%
Precision: 7.46%
Recall: 83.33%


Don't be surprised that different linear models tend to achieve the same performance

In [69]:
clf = SVC(class_weight='balanced', kernel='poly')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: {:,.2f}%".format(100*accuracy_score(y_test, y_pred)))
print("Precision: {:,.2f}%".format(100*precision_score(y_test, y_pred)))
print("Recall: {:,.2f}%".format(100*recall_score(y_test, y_pred)))

Accuracy: 2.99%
Precision: 2.99%
Recall: 100.00%


In [73]:
hyparam_grid = [{
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'degree': [2, 3, 4, 5],
    'class_weight': ['balanced'],
}]

In [74]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), hyparam_grid, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}

Grid scores on development set:

0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}
0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}
0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}
0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}
0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}
0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}
0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}
0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}
0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kern

## Random forest

In [68]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
hyparam_grid = [{
    'n_estimators': [5, 20, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5],
    'class_weight': ['balanced'],
}]

In [64]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), hyparam_grid, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}

Grid scores on development set:

0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 5}
0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 20}
0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
0.524 (+/-0.058) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 5}
0.524 (+/-0.136) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 20}
0.511 (+/-0.083) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}
0.533 (+/-0.132) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}
0.491 (+/-0.000) for {'class_weight'

Without putting too much effort into hyperparameter tunning, a random forest classifier got an F1 score of 54% (averaged over classes) compared to 47% of logistic regression and SVM.

Even if that is not what we want to optimize for here (and that depends on a lot of factors), this still demonstrates why giving other models at least a superficial examination can be worthwhile.

## Final notes: The classification problem here is significantly unbalanced; we did not treat this at all except for balancing class weights. An imbalance of 98/2 probably justifies more sophisticated tools.

## One such option is to treat positive examples as anomalies, and to then draw on knowledge and methods from the field of anomaly detection.

## Other options can include more advnaced processing of the data.

## The advanced workshops might go into more details.

# We're done! Thank you!