# Propensity Modeling

In [1]:
%%bash
# Install the two third-party packages that the import cell below relies on.
# NOTE(review): pip3 may point at a different interpreter than the notebook
# kernel, and no versions are pinned — confirm before relying on a fresh run.
pip3 install user_agents
pip3 install tqdm



In [2]:
import numpy as np
import pandas as pd
import pickle
import urllib
from tqdm import tqdm, trange, tqdm_pandas
import os
from user_agents import parse
import datetime
import time
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
# pd.set_option('display.max_columns', 1000)

## Load Data:

In [3]:
%%time
with open( "../Data/Feature_Engineering/df_features_dummies.p", "rb" ) as f:
    df_features_dummies = pickle.load(f)

with open( "../Data/Feature_Engineering/df_label.p", "rb" ) as f:
    df_label = pickle.load(f)

CPU times: user 124 ms, sys: 493 ms, total: 617 ms
Wall time: 756 ms


In [4]:
%%time
X = df_features_dummies
Y = df_label

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.25 µs


In [5]:
print(X.shape)
# Row position where the data is cut into train / test:
# floor(80% of the row count), then one more row.
bar_80 = np.floor(0.8 * X.shape[0]).astype(int) + 1
bar_80

(220047, 2351)


176038

In [6]:
# Sequential (unshuffled) split: the first bar_80 rows train the models,
# the remaining rows are held out for evaluation.
X_train, X_test = X[:bar_80], X[bar_80:]
Y_train, Y_test = Y[:bar_80], Y[bar_80:]

## Random Forest Classifier

In [7]:
# Random forest baseline: 10 fully grown trees, Gini splits, bootstrap samples,
# no class re-weighting.
# max_features='sqrt' is exactly what 'auto' meant for classifiers; 'auto' was
# removed in scikit-learn 1.3, so spelling it out keeps this cell runnable on
# current releases without changing the model on old ones.
# NOTE(review): random_state=None means every run grows a different forest, so
# the metrics reported in the following cells will vary slightly between runs.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=1,
    random_state=None,
    verbose=1,
    warm_start=False,
    class_weight=None,
)

In [8]:
%%time
# Train the random forest on the training split (the first bar_80 rows).
clf.fit(X_train, Y_train)

CPU times: user 32 s, sys: 725 ms, total: 32.7 s
Wall time: 32.7 s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   31.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=1, warm_start=False)

In [9]:
# Hard class predictions of the random forest for the held-out rows.
Trained_test=clf.predict(X_test)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [10]:
# Number of test rows predicted as class 1; a non-zero count shows the forest
# is not simply predicting the majority class for everything.
predicted_positives = np.sum(Trained_test)
print('modeling is indeed learning, #Trained_test is: ', predicted_positives)

modeling is indeed learning, #Trained_test is:  3703


In [11]:
# Test-set accuracy of the forest: share of rows where prediction == label.
np.mean(Trained_test == Y_test)

0.88318298529846173

In [12]:
# Per-class precision / recall / F1 of the forest on the test split.
print(classification_report(Y_test, Trained_test))

             precision    recall  f1-score   support

          0       0.90      0.97      0.93     37065
          1       0.74      0.40      0.52      6944

avg / total       0.87      0.88      0.87     44009



In [15]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes (scikit-learn convention).
confusion_matrix(Y_test, Trained_test)

array([[36115,   950],
       [ 4191,  2753]])

In [14]:
# Share of positive labels in the test split, i.e. the base rate that any
# accuracy figure has to be compared against.
np.mean(Y_test)

0.15778590742802609

**Feature importance check**

In [41]:
# Importance scores of the fitted forest, one entry per feature column.
# NOTE(review): the raw array is hard to read — presumably meant to be paired
# with X's column names; confirm.
clf.feature_importances_

array([  2.25972110e-02,   3.28152182e-02,   2.09891256e-02, ...,
         1.54767877e-03,   1.38472197e-05,   7.66978068e-05])

---

## Logistic Regression

In [10]:
# L2-regularised logistic regression (liblinear solver, one-vs-rest), seeded
# with random_state=1 and allowed up to 1000 solver iterations.
logreg = linear_model.LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=1,
    solver='liblinear',
    max_iter=1000,
    multi_class='ovr',
    verbose=0,
    warm_start=False,
    n_jobs=1,
)

In [11]:
%%time
# Train the logistic regression on the same training split as the forest.
logreg.fit(X_train, Y_train)

CPU times: user 10 s, sys: 2.21 s, total: 12.2 s
Wall time: 12.5 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Predict confidence scores for samples:

In [14]:
# Decision-function score for every row of X.
# NOTE(review): X contains both the training and the test rows, so these
# scores are not purely out-of-sample — confirm that this is intended.
Y_prob=logreg.decision_function(X)

In [15]:
# Despite the name, these are raw decision-function values (signed, not
# bounded to [0, 1]), not probabilities.
Y_prob

array([-2.76335062, -2.76335633, -2.92093715, ..., -2.98928395,
       -2.98928395, -2.86641158])

### Log of probability estimates:

In [16]:
# Log of the predicted class probabilities, computed on the training rows only.
Y_logProb=logreg.predict_log_proba(X_train)

In [17]:
# Turn the log-probabilities back into probabilities once and reuse the result
# for both the row count and the printed table.
train_probabilities = np.exp(Y_logProb)
print(len(train_probabilities))
print(train_probabilities)
print('first column is prob of choosing label 0, second column is the prob of choosing label 1')

176038
[[ 0.94066293  0.05933707]
 [ 0.94066325  0.05933675]
 [ 0.94887178  0.05112822]
 ..., 
 [ 0.94855464  0.05144536]
 [ 0.94640289  0.05359711]
 [ 0.94708679  0.05291321]]
first column is prob of choosing label 0, second column is the prob of choosing label 1


### Predicted labels:

In [18]:
# Hard class predictions of the logistic regression for the held-out rows.
Y_pred = logreg.predict(X_test)

In [19]:
# Peek at the predicted labels (the array repr is truncated for display).
Y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
# Test-set accuracy of the logistic regression.
# NOTE(review): in the recorded run this is 0.8422, which equals 1 - 0.1578
# (the positive rate of the test split shown earlier) — the score an all-zeros
# predictor would get. Check recall on class 1 before trusting this model.
np.mean(Y_pred == Y_test)

0.84221409257197388