In [3]:
## general
import os 
import joblib
import requests
from google_drive_downloader import GoogleDriveDownloader as gdd

## Data manipulation
import pandas as pd
import numpy as np

## Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

## visuatlization
import plotly 
import plotly.express as px
import plotly.graph_objects as go


In [None]:
## download preprocess data from the workshop
gdd.download_file_from_google_drive(file_id='1YgNwvAzgKSk2tF4EJihFWQjSqO7cPAf4',
                                    dest_path=os.path.join(os.getcwd(), 'Data','preprocessed_data.jblib'))

In [4]:
## todo relative path 
data = joblib.load(os.path.join(os.getcwd(), 'Data','preprocessed_data_500K.jblib'))
data

Unnamed: 0,collection_21_days,max_team_size,min_team_size,email,time_diff_x,time_diff_y,x0_AD,x0_AE,x0_AF,x0_AG,...,x6_51-100,x6_6-10,x6_8,x6_Apenas eu,x6_MISSING,x6_Moi uniquement,x6_Nur ich,x6_Solo yo,paying,lead_score
380359,0.0,1.0,1.0,0.000000,-5.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284353,0.0,0.0,0.0,1.000000,-1.436862,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
377170,0.0,1.0,1.0,1.000000,-6.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
719476,0.0,1.0,1.0,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
674816,0.0,5.0,2.0,0.333333,-3.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68941,0.0,5.0,2.0,1.000000,-5.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
956629,0.0,5.0,2.0,1.000000,-4.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
588016,0.0,1.0,1.0,1.000000,-4.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015802,0.0,1.0,1.0,1.000000,3.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split to train test

In [5]:
y = data['lead_score']
X = data.loc[:,(data.columns != 'lead_score') & (data.columns != 'account_id')]

# Smaller data to work with in the workshop...
max_samples = 10000
X = X[:max_samples]
y = y[:max_samples]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (8000, 329)
y_train: (8000,)
X_test: (2000, 329)
y_test: (2000,)


In [7]:
## check if classes ratio is the same in train and test
ratio_train = y_train.sum() / (1-y_train).sum()
ratio_test = y_test.sum() / (1-y_test).sum()
print(ratio_train, ",", ratio_test)

0.02445895761301063 , 0.029336078229541946


# Preprocess

In [8]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print('X_train[:10]\n', X_train[:10])
print('X_train_scaled[:10]\n', X_train_scaled[:10])

X_train[:10]
          collection_21_days  max_team_size  min_team_size  email  time_diff_x  \
1411448                 0.0      10.000000            6.0    0.0     2.000000   
1229159                 0.0       1.000000            1.0    1.0    -5.000000   
1054852                 0.0      15.000000           11.0    0.0     5.500000   
551820                  0.0      17.596731            2.0    1.0    -1.436862   
653073                  0.0      17.596731            2.0    1.0    -1.436862   
849502                  0.0       1.000000            1.0    0.0    -4.000000   
9628                    0.0      10.000000            6.0    0.0    -6.000000   
509730                  0.0      25.000000           16.0    0.2     5.500000   
1024798                 0.0       1.000000            1.0    1.0    -3.000000   
834678                  0.0       5.000000            2.0    0.0    -3.000000   

         time_diff_y  x0_AD  x0_AE  x0_AF  x0_AG  ...  x6_500   x6_51-100  \
1411448          

# Logistic Regression

In [27]:
clf = LogisticRegression(multi_class='ovr', solver='sag', max_iter=10000, class_weight='balanced').fit(X_train_scaled, y_train)
# clf = LogisticRegression(multi_class='ovr', solver='sag', max_iter=1000000).fit(X_train_scaled, y_train)
# clf = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=10000).fit(X_train_scaled, y_train)


In [34]:
# predict probabilities:
proba = clf.predict_proba(X_test_scaled)
proba[:10]

array([[7.75541760e-01, 2.24458240e-01],
       [9.99178897e-01, 8.21103204e-04],
       [1.07034136e-01, 8.92965864e-01],
       [9.99996186e-01, 3.81431931e-06],
       [7.58199333e-01, 2.41800667e-01],
       [9.99998826e-01, 1.17398773e-06],
       [9.97866706e-01, 2.13329399e-03],
       [5.29849676e-01, 4.70150324e-01],
       [9.99922302e-01, 7.76976276e-05],
       [1.02996069e-05, 9.99989700e-01]])

In [35]:
# predict decision:
pred = clf.predict(X_test_scaled)
print('pred\n======\n', pred[:10])
print('y_test\n======\n', y_test[:10])

pred
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 1.]
y_test
 1422448    0.0
1294741    0.0
475755     0.0
1426970    0.0
1367808    0.0
739182     0.0
310128     0.0
638220     0.0
1093771    0.0
151068     0.0
Name: lead_score, dtype: float64


## Metrics

In [36]:
def calc_metrics(y_true, pred):
    print('f1_score:', f1_score(y_true, pred, average='binary'))
    print('recall:', recall_score(y_true, pred, average='binary'))
    print('precision:', precision_score(y_true, pred, average='binary'))
    print('accuracy:', accuracy_score(y_true, pred))

calc_metrics(y_test, pred)

f1_score: 0.15163934426229508
recall: 0.6491228070175439
precision: 0.08584686774941995
accuracy: 0.793


In [37]:
pred_train = clf.predict(X_train_scaled[:10000])
calc_metrics(y_train[:10000], pred_train)

f1_score: 0.19382782891174874
recall: 0.93717277486911
precision: 0.10809178743961352
accuracy: 0.813875


## Feature Selection

In [41]:
# We want to enforce a sparse weight vector. 
clf_sparse = LogisticRegression(multi_class='ovr', solver='saga', penalty='l1', max_iter=10000, class_weight='balanced').fit(X_train_scaled, y_train)


In [42]:
# Observe features
nonzero_feats = np.nonzero(clf_sparse.coef_)
feature_names = list(X_train.columns)
selected = [f for i, f in enumerate(feature_names) if i in nonzero_feats[1]]
not_selected = [f for i, f in enumerate(feature_names) if i not in nonzero_feats[1]]
print('Feature Selection yielded %d selected features and %d not-selected features.' % (len(not_selected), len(selected)))
      
print('Selected features are:\n')
print(', '.join(selected))

Feature Selection yielded 160 selected features and 169 not-selected features.
Selected features are:

collection_21_days, max_team_size, min_team_size, email, time_diff_x, time_diff_y, x0_AE, x0_AL, x0_AM, x0_AR, x0_AT, x0_AU, x0_BA, x0_BE, x0_BG, x0_BH, x0_BO, x0_BR, x0_CH, x0_CL, x0_CM, x0_CN, x0_CO, x0_CR, x0_CZ, x0_DE, x0_DK, x0_DO, x0_EC, x0_EG, x0_ES, x0_ET, x0_FI, x0_FJ, x0_FR, x0_GB, x0_GH, x0_GR, x0_GT, x0_HK, x0_HN, x0_HR, x0_HU, x0_ID, x0_IE, x0_IL, x0_IN, x0_IQ, x0_IS, x0_IT, x0_JM, x0_JO, x0_JP, x0_KR, x0_KW, x0_LK, x0_LT, x0_LU, x0_MISSING, x0_MK, x0_MQ, x0_MT, x0_MU, x0_MX, x0_MY, x0_NG, x0_NL, x0_NO, x0_NP, x0_NZ, x0_OM, x0_PA, x0_PE, x0_PH, x0_PK, x0_PL, x0_PR, x0_PS, x0_PT, x0_PY, x0_QA, x0_RO, x0_RS, x0_RU, x0_SA, x0_SE, x0_SG, x0_SN, x0_SV, x0_TD, x0_TH, x0_TN, x0_TR, x0_TW, x0_TZ, x0_UA, x0_US, x0_UY, x0_VE, x0_VI, x0_VN, x0_ZA, x0_ZM, x0_ZW, x1_tablet, x2_android, x2_chrome_os, x2_ios, x2_linux, x2_mac, x3_chrome, x3_firefox, x3_generic browser, x3_internet explo

In [43]:
# Observe performance
pred_sparse = clf_sparse.predict(X_test_scaled)
calc_metrics(y_test, pred_sparse)

f1_score: 0.1510204081632653
recall: 0.6491228070175439
precision: 0.08545034642032333
accuracy: 0.792


# NN

In [44]:
pred_sparse_train = clf_sparse.predict(X_train_scaled[:1000])
calc_metrics(y_train[:1000], pred_sparse_train)

f1_score: 0.17167381974248927
recall: 0.9090909090909091
precision: 0.0947867298578199
accuracy: 0.807
