In [1]:
import pandas as pd 
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import datapipe_utils as dt

In [2]:
bd_train=pd.read_csv("/Users/lalitsachan/Dropbox/0.0 Data/bd_train.csv")

In [3]:
bd_test=pd.read_csv("/Users/lalitsachan/Dropbox/0.0 Data/bd_test.csv")

In [4]:
def children(children):
    
    child_num=children.str.replace('Zero','0')
    child_num=child_num.str.replace('4+','4',regex=False)
    child_num=pd.to_numeric(child_num,errors='coerce')
    
    return child_num

In [5]:
def age_band(age_band):
    
    ab_num=age_band.str.replace('71+','71-71',regex=False)
    k=ab_num.str.split('-',expand=True)
    
    for i in [0,1]:
        k.iloc[:,i]=pd.to_numeric(k.iloc[:,i],errors='coerce')
        
    ab_num=0.5*(k[0]+k[1])
    
    return ab_num

In [6]:
def family_income(family_income):
    
    fi_num=family_income.replace({'>=35,000':35,
                                  '<27,500, >=25,000':26.25,
                                  '<30,000, >=27,500':28.5,
                                  '<25,000, >=22,500':23.75,
                                  '<20,000, >=17,500':18.75,
                                  '<12,500, >=10,000':11.25,
                                  '<17,500, >=15,000':16.25,
                                  '<15,000, >=12,500':13.75,
                                  '<22,500, >=20,000':21.25,
                                  '<10,000, >= 8,000':9,
                                  '< 8,000, >= 4,000':6,
                                  '< 4,000':4})
    fi_num=pd.to_numeric(fi_num,errors='coerce')
    
    return fi_num

In [7]:
simple_numeric=[ 'year_last_moved',
'Average.Credit.Card.Transaction',
'Balance.Transfer',
'Term.Deposit',
'Life.Insurance',
'Medical.Insurance',
'Average.A.C.Balance',
'Personal.Loan',
'Investment.in.Mutual.Fund',
'Investment.Tax.Saving.Bond',
'Home.Loan',
'Online.Purchase.Amount',
'Investment.in.Commudity',
'Investment.in.Equity',
'Investment.in.Derivative',
'Portfolio.Balance']

cat_to_dummies=['status',
'occupation',
'occupation_partner',
'home_status',
'self_employed',
'self_employed_partner',
'TVarea',
'gender',
'region']

custom_var_dict={'children':children,'age_band':age_band,'family_income':family_income}

In [8]:
mypipe=dt.DataPipe(simple_numeric=simple_numeric,
                   cat_to_dummies=cat_to_dummies,
                  custom_var_dict=custom_var_dict)

In [9]:
mypipe.fit(bd_train)

<datapipe_utils.DataPipe at 0x1451d1150>

In [10]:
x_train=mypipe.transform(bd_train)

In [11]:
x_train.shape

(8124, 71)

In [12]:
x_test=mypipe.transform(bd_test)

In [13]:
x_test.shape

(2031, 71)

In [14]:
bd_train

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment.Tax.Saving.Bond,Home.Loan,Online.Purchase.Amount,Revenue.Grid,gender,region,Investment.in.Commudity,Investment.in.Equity,Investment.in.Derivative,Portfolio.Balance
0,4888,Zero,55-60,Widowed,Retired,Unknown,Own Home,"<10,000, >= 8,000",No,No,...,33.47,12.96,4.99,1,Female,North West,91.85,25.71,95.52,249.82
1,8525,Zero,61-65,Partner,Retired,Retired,Own Home,">=35,000",No,No,...,45.96,28.95,3.99,2,Male,North West,127.65,56.21,89.20,222.27
2,3411,3,31-35,Partner,Professional,Housewife,Own Home,"<25,000, >=22,500",No,No,...,0.00,0.00,0.00,2,Female,North West,4.19,0.00,3.50,17.05
3,692,Zero,51-55,Partner,Secretarial/Admin,Other,Own Home,"<20,000, >=17,500",No,No,...,0.00,0.00,0.00,2,Male,North West,9.59,0.00,7.99,-72.74
4,10726,1,51-55,Partner,Retired,Retired,Own Home,">=35,000",No,No,...,0.00,0.00,0.00,2,Female,West Midlands,15.69,5.83,15.66,-13.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,2680,2,41-45,Single/Never Married,Housewife,Professional,Own Home,"<27,500, >=25,000",No,No,...,14.98,0.00,178.38,1,Female,South East,63.77,79.58,65.68,181.11
8120,7564,2,41-45,Divorced/Separated,Professional,Unknown,Own Home,"<20,000, >=17,500",No,No,...,15.48,0.00,35.47,2,Female,East Midlands,48.19,27.81,23.81,104.01
8121,9719,2,36-40,Partner,Secretarial/Admin,Manual Worker,Own Home,"<30,000, >=27,500",No,No,...,1.00,0.00,15.47,2,Female,North West,92.66,3.41,12.49,102.24
8122,9132,Zero,55-60,Partner,Housewife,Professional,Own Home,"<30,000, >=27,500",No,No,...,0.00,0.00,0.00,2,Female,South East,13.49,0.00,11.24,-63.25


In [15]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
params={'class_weight':['balanced',None],
       'penalty':['l1','l2'],
       'C':[.0001,.0005,.001,.005,.01,.05,.1,1,2,5,10]}

In [18]:
model=LogisticRegression(solver='liblinear')

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
gs=GridSearchCV(model,
               param_grid=params,
               cv=10,
               scoring='roc_auc',
               n_jobs=-1,
               verbose=20
               )

In [21]:
gs.fit(x_train,y_train)

Fitting 10 folds for each of 44 candidates, totalling 440 fits


In [22]:
gs.best_estimator_

In [23]:
params={'class_weight':['balanced',None],
       'penalty':['l1','l2'],
       'C':[.01,.02,0.3,.04,0.05,0.06,0.07,0.08,0.09,0.1]}

In [24]:
gs=GridSearchCV(model,
               param_grid=params,
               cv=10,
               scoring='roc_auc',
               n_jobs=-1,
               verbose=20
               )

In [25]:
gs.fit(x_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits




In [26]:
gs.best_estimator_

In [27]:
dt.report(gs.cv_results_,5)

Model with rank: 1
Mean validation score: 0.955286 (std: 0.006627)
Parameters: {'C': 0.06, 'class_weight': 'balanced', 'penalty': 'l1'}

Model with rank: 2
Mean validation score: 0.955272 (std: 0.006654)
Parameters: {'C': 0.05, 'class_weight': 'balanced', 'penalty': 'l1'}

Model with rank: 3
Mean validation score: 0.955175 (std: 0.006753)
Parameters: {'C': 0.04, 'class_weight': 'balanced', 'penalty': 'l1'}

Model with rank: 4
Mean validation score: 0.955166 (std: 0.006640)
Parameters: {'C': 0.07, 'class_weight': 'balanced', 'penalty': 'l1'}

Model with rank: 5
Mean validation score: 0.955060 (std: 0.006660)
Parameters: {'C': 0.08, 'class_weight': 'balanced', 'penalty': 'l1'}



In [28]:
logr=gs.best_estimator_

In [29]:
logr.fit(x_train,y_train)

In [31]:
list(zip(x_train.columns,logr.coef_[0]))

[('status_Partner', 0.056096496441068226),
 ('status_Single/Never Married', 0.0),
 ('status_Divorced/Separated', 0.0),
 ('status_Widowed', 0.0),
 ('occupation_Professional', 0.0),
 ('occupation_Retired', 0.0),
 ('occupation_Secretarial/Admin', 0.0),
 ('occupation_Housewife', 0.0),
 ('occupation_Business Manager', 0.0),
 ('occupation_Unknown', 0.0),
 ('occupation_Manual Worker', 0.0),
 ('occupation_Other', 0.0),
 ('occupation_partner_Unknown', 0.0),
 ('occupation_partner_Professional', 0.016575502248388774),
 ('occupation_partner_Retired', 0.0),
 ('occupation_partner_Manual Worker', 0.0),
 ('occupation_partner_Business Manager', 0.0),
 ('occupation_partner_Secretarial/Admin', 0.0),
 ('occupation_partner_Housewife', 0.0),
 ('occupation_partner_Other', 0.0),
 ('home_status_Own Home', 0.0),
 ('home_status_Rent from Council/HA', 0.0),
 ('home_status_Rent Privately', 0.0),
 ('home_status_Live in Parental Hom', 0.0),
 ('self_employed_No', 0.0),
 ('self_employed_partner_No', -0.254434951309541

In [32]:
(logr.coef_[0]==0).sum()

51

In [33]:
# dont use l1 penalty with linearl models as form of feature selection , because that is strictly in the context
# of linear releationship . features might hold more predictive power if we consider non-linear relationship

[CV 2/10; 1/44] START C=0.0001, class_weight=balanced, penalty=l1...............
[CV 2/10; 1/44] END C=0.0001, class_weight=balanced, penalty=l1;, score=0.946 total time=   0.0s
[CV 10/10; 1/44] START C=0.0001, class_weight=balanced, penalty=l1..............
[CV 10/10; 1/44] END C=0.0001, class_weight=balanced, penalty=l1;, score=0.939 total time=   0.0s
[CV 4/10; 2/44] START C=0.0001, class_weight=balanced, penalty=l2...............
[CV 4/10; 2/44] END C=0.0001, class_weight=balanced, penalty=l2;, score=0.961 total time=   0.1s
[CV 8/10; 2/44] START C=0.0001, class_weight=balanced, penalty=l2...............
[CV 8/10; 2/44] END C=0.0001, class_weight=balanced, penalty=l2;, score=0.954 total time=   0.1s
[CV 5/10; 3/44] START C=0.0001, class_weight=None, penalty=l1...................
[CV 5/10; 3/44] END C=0.0001, class_weight=None, penalty=l1;, score=0.893 total time=   0.0s
[CV 1/10; 4/44] START C=0.0001, class_weight=None, penalty=l2...................
[CV 1/10; 4/44] END C=0.0001, cl

[CV 7/10; 1/44] START C=0.0001, class_weight=balanced, penalty=l1...............
[CV 7/10; 1/44] END C=0.0001, class_weight=balanced, penalty=l1;, score=0.943 total time=   0.0s
[CV 9/10; 2/44] START C=0.0001, class_weight=balanced, penalty=l2...............
[CV 9/10; 2/44] END C=0.0001, class_weight=balanced, penalty=l2;, score=0.964 total time=   0.1s
[CV 3/10; 4/44] START C=0.0001, class_weight=None, penalty=l2...................
[CV 3/10; 4/44] END C=0.0001, class_weight=None, penalty=l2;, score=0.949 total time=   0.1s
[CV 4/10; 5/44] START C=0.0005, class_weight=balanced, penalty=l1...............
[CV 4/10; 5/44] END C=0.0005, class_weight=balanced, penalty=l1;, score=0.957 total time=   0.0s
[CV 3/10; 6/44] START C=0.0005, class_weight=balanced, penalty=l2...............
[CV 3/10; 6/44] END C=0.0005, class_weight=balanced, penalty=l2;, score=0.955 total time=   0.1s
[CV 4/10; 6/44] START C=0.0005, class_weight=balanced, penalty=l2...............
[CV 4/10; 6/44] END C=0.0005, cla

[CV 1/10; 1/44] START C=0.0001, class_weight=balanced, penalty=l1...............
[CV 1/10; 1/44] END C=0.0001, class_weight=balanced, penalty=l1;, score=0.911 total time=   0.0s
[CV 3/10; 2/44] START C=0.0001, class_weight=balanced, penalty=l2...............
[CV 3/10; 2/44] END C=0.0001, class_weight=balanced, penalty=l2;, score=0.956 total time=   0.1s
[CV 6/10; 2/44] START C=0.0001, class_weight=balanced, penalty=l2...............
[CV 6/10; 2/44] END C=0.0001, class_weight=balanced, penalty=l2;, score=0.936 total time=   0.1s
[CV 7/10; 3/44] START C=0.0001, class_weight=None, penalty=l1...................
[CV 7/10; 3/44] END C=0.0001, class_weight=None, penalty=l1;, score=0.905 total time=   0.0s
[CV 8/10; 4/44] START C=0.0001, class_weight=None, penalty=l2...................
[CV 8/10; 4/44] END C=0.0001, class_weight=None, penalty=l2;, score=0.935 total time=   0.0s
[CV 1/10; 5/44] START C=0.0005, class_weight=balanced, penalty=l1...............
[CV 1/10; 5/44] END C=0.0005, class_w

# Predicting Probabilities

In [34]:
logr.classes_

array([0, 1])

In [35]:
logr.predict_proba(x_test)

array([[0.79535026, 0.20464974],
       [0.72790082, 0.27209918],
       [0.99446981, 0.00553019],
       ...,
       [0.99456396, 0.00543604],
       [0.964138  , 0.035862  ],
       [0.89704183, 0.10295817]])

In [36]:
logr.predict_proba(x_test)[:,1]

array([0.20464974, 0.27209918, 0.00553019, ..., 0.00543604, 0.035862  ,
       0.10295817])

In [38]:
test_score=logr.predict_proba(x_test)[:,1]

# Find cutoff for hardclass prediction

In [39]:
train_score=logr.predict_proba(x_train)[:,1]

In [40]:
real=y_train

In [53]:
cutoffs=np.linspace(0.001,0.999,999)

In [54]:
KS_all=[]

for cutoff in cutoffs:
    predicted=(train_score>cutoff).astype(int)

    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()

    P=TP+FN
    N=TN+FP

    KS=(TP/P)-(FP/N)
    
    KS_all.append(KS)

In [57]:
mycutoff=cutoffs[KS_all==max(KS_all)]

In [58]:
mycutoff

# if you get multiple values here , then simply go with the first one

array([0.524])

In [59]:
test_classes=(test_score>mycutoff).astype(int)

In [60]:
test_classes

array([0, 0, 0, ..., 0, 0, 0])