In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm #always use this to determine p-values

In [2]:
# Location of the bank-marketing CSV used throughout this notebook.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/bank.csv"
# Read the remote CSV straight into a DataFrame.
BankData = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows.
BankData.head(n=5)

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no


In [3]:
# `unique()` lists the distinct values a column takes; use it to see
# which levels a categorical variable has.
print(BankData.loc[:, 'y'].unique())

['no' 'yes']


For the data dictionary, please refer to https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

#### Our goal is to define the model that best predicts outcome y - the success of the marketing campaign

First let's create dummy variables for default, marital, housing, and loan

In [4]:
def _indicator_columns(frame, column, drop_levels):
    """Return float 0/1 indicator columns for ``frame[column]``.

    One indicator per level is created, prefixed with the column name; the
    indicators for ``drop_levels`` are then removed so that those levels act
    as the reference category. Raises KeyError if a level in ``drop_levels``
    does not occur in the column.
    """
    indicators = pd.get_dummies(frame[column], prefix=column)
    dropped = [column + '_' + level for level in drop_levels]
    # Cast explicitly: recent pandas versions return bool indicators, while
    # the rest of the notebook expects numeric 0.0/1.0 columns.
    return indicators.drop(dropped, axis=1).astype(float)


Default_dummy = _indicator_columns(BankData, 'default', ['no'])

# Dropping both 'married' and 'divorced' leaves only marital_single, so the
# reference group for marital status is "married or divorced".
marital_dummy = _indicator_columns(BankData, 'marital', ['married', 'divorced'])

housing_dummy = _indicator_columns(BankData, 'housing', ['no'])

loan_dummy = _indicator_columns(BankData, 'loan', ['no'])

y_dummy = _indicator_columns(BankData, 'y', ['no'])

dummy_frames = [marital_dummy, Default_dummy, housing_dummy, loan_dummy, y_dummy]
dummy_names = [name for dummy in dummy_frames for name in dummy.columns]

# Remove indicators left over from an earlier run of this cell before
# appending, so re-running the cell never produces duplicate columns.
BankData = pd.concat(
    [BankData.drop(dummy_names, axis=1, errors='ignore')] + dummy_frames,
    axis=1,
)
print(BankData.head())
print(BankData.describe())



   age  marital  education default  balance housing loan   contact  duration  \
0   30  married    primary      no     1787      no   no  cellular        79   
1   33  married  secondary      no     4789     yes  yes  cellular       220   
2   35   single   tertiary      no     1350     yes   no  cellular       185   
3   30  married   tertiary      no     1476     yes  yes   unknown       199   
4   59  married  secondary      no        0     yes   no   unknown       226   

   campaign  pdays  previous poutcome   y  marital_single  default_yes  \
0         1     -1         0  unknown  no             0.0          0.0   
1         1    339         4  failure  no             0.0          0.0   
2         1    330         1  failure  no             1.0          0.0   
3         4     -1         0  unknown  no             0.0          0.0   
4         1     -1         0  unknown  no             0.0          0.0   

   housing_yes  loan_yes  y_yes  
0          0.0       0.0    0.0  
1     

In [5]:
# Constant column so the statsmodels fit includes an intercept term.
BankData['Intercept'] = 1
# Design matrix: the constant plus the numeric and indicator predictors.
X1 = BankData.loc[:, [
    'Intercept',
    'age',
    'balance',
    'duration',
    'campaign',
    'previous',
    'marital_single',
    'default_yes',
    'housing_yes',
    'loan_yes',
]]
# Response: the 0/1 indicator of a 'yes' outcome.
y1 = BankData.loc[:, 'y_yes']


#### Task 1: run a logistic regression using statsmodels.api

In [6]:
# Set up the logistic regression of y1 on X1, fit it, and display the
# coefficient table (coefficients, standard errors, p-values).
logit = sm.Logit(endog=y1, exog=X1)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.279689
         Iterations 7


0,1,2,3
Dep. Variable:,y_yes,No. Observations:,4521.0
Model:,Logit,Df Residuals:,4511.0
Method:,MLE,Df Model:,9.0
Date:,"Thu, 07 Jul 2016",Pseudo R-squ.:,0.2173
Time:,20:55:30,Log-Likelihood:,-1264.5
converged:,True,LL-Null:,-1615.5
,,LLR p-value:,2.5029999999999997e-145

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,-3.2586,0.279,-11.695,0.000,-3.805 -2.712
age,0.0101,0.005,1.905,0.057,-0.000 0.021
balance,1.182e-05,1.58e-05,0.750,0.453,-1.91e-05 4.27e-05
duration,0.0038,0.000,20.782,0.000,0.003 0.004
campaign,-0.0910,0.026,-3.488,0.000,-0.142 -0.040
previous,0.1618,0.023,7.062,0.000,0.117 0.207
marital_single,0.3140,0.129,2.429,0.015,0.061 0.567
default_yes,0.3459,0.412,0.841,0.401,-0.461 1.152
housing_yes,-0.8404,0.111,-7.549,0.000,-1.059 -0.622


#### Task 2: Interpret Campaign, marital_single, housing_yes coefficients.

Answer: 
Campaign: All other things remaining constant, each additional contact during this campaign is associated with about 9% lower odds of subscribing to the bank's term deposit (odds ratio exp(-0.0910) ≈ 0.91).
Marital_single: All other things remaining constant, a single individual has about 37% greater odds of subscribing to the bank's term deposit than a married or divorced individual (odds ratio exp(0.3140) ≈ 1.37).
Housing_yes: All other things remaining constant, an individual who has a housing loan with this bank has about 57% lower odds of subscribing to the bank's term deposit than one who does not (the coefficient is negative: odds ratio exp(-0.8404) ≈ 0.43).

#### Task 3: What is your 10-fold cross-validation accuracy using the sklearn library with solver = 'liblinear' and with solver = 'newton-cg'? (Set max_iter = 50000 if you get a warning.) What do you learn?

In [26]:
lm1 = LogisticRegression(solver = 'liblinear')
# No 'Intercept' column here: the fitted intercept is reported separately
# through lm1.intercept_ below.
X1 = BankData[['age','balance','duration','campaign','previous','marital_single','default_yes','housing_yes','loan_yes']]
# The target is the categorical 'no'/'yes' column; the statsmodels fit above
# used the numeric 0/1 column y_yes instead.
y1 = BankData['y']
lm1.fit(X1,y1)

print(lm1.coef_)
print(lm1.intercept_)

# Cross-validate the estimator defined in this cell (lm1) so that the score
# reflects the 'liblinear' solver and the cell runs on a fresh kernel.
print(cross_val_score(lm1,X1,y1,cv=10, scoring = 'accuracy').mean())

[[  5.92678773e-03   1.12262553e-05   3.75180658e-03  -9.53399063e-02
    1.59580582e-01   2.49916961e-01   1.73238439e-01  -8.57853959e-01
   -8.23424388e-01]]
[-3.01711475]
0.888299732364


In [25]:
lm2 = LogisticRegression(solver = 'newton-cg', max_iter = 1000)
# No 'Intercept' column here: the fitted intercept is reported separately
# through lm2.intercept_ below.
X2 = BankData[['age','balance','duration','campaign','previous','marital_single','default_yes','housing_yes','loan_yes']]
# The target is the categorical 'no'/'yes' column; the statsmodels fit above
# used the numeric 0/1 column y_yes instead.
y2 = BankData['y']
lm2.fit(X2,y2)

print(lm2.coef_)
print(lm2.intercept_)

# Cross-validate the estimator defined in this cell (lm2) so that the score
# reflects the 'newton-cg' solver and the cell runs on a fresh kernel.
print(cross_val_score(lm2,X2,y2,cv=10, scoring = 'accuracy').mean())

[[  1.01764775e-02   1.19324173e-05   3.78719139e-03  -9.10647460e-02
    1.61521372e-01   3.11421016e-01   2.92452115e-01  -8.30201026e-01
   -8.19486348e-01]]
[-3.26359904]
0.888299732364


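It looks like the accuracy is quite high: 88.8% with both the 'liblinear' and the 'newton-cg' solvers. Note, however, that 4000 of the 4521 customers (88.5%) did not subscribe, so always predicting 'no' would score almost as well.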

#### Construct confusion matrices for logistic regression using solver = 'liblinear' and solver = 'newton-cg'.

In [27]:
# Confusion matrix for the 'liblinear' model: predictions on X1 (y_hat1)
# compared with the observed labels y1.
y_hat1 = lm1.predict(X1)
confusion_matrix(y_true=y1, y_pred=y_hat1)

array([[3928,   72],
       [ 426,   95]])

In [28]:
# Confusion matrix for the 'newton-cg' model: predictions on X2 (y_hat2)
# compared with the observed labels y2.
y_hat2 = lm2.predict(X2)
confusion_matrix(y_true=y2, y_pred=y_hat2)

array([[3923,   77],
       [ 427,   94]])

#### What did we learn from what we observed in the confusion matrices?

Answer: 

#### What is your prediction for a person who is 30 years old, 1000 dollars balance,  with duration = 210 , has been contacted 3 times for this campaign (campaign = 3),  who has previously been contacted 4 times, who is single, never defaulted, home owner and doesn't have any loan?

#### Now standardize your data - you can use standardization method used for KNN algorithms.

In [None]:
def Standardize(X):
    """Rescale X to the [0, 1] range with min-max scaling.

    Computes ``(X - min) / (max - min)`` from ``X.min()`` and ``X.max()``.
    For a DataFrame these are per-column values, so every column is rescaled
    independently; for a Series or ndarray they are single values.

    A zero range (max == min) is treated as 1, so constant data maps to 0
    rather than to NaN from a 0/0 division.

    Parameters
    ----------
    X : DataFrame, Series or ndarray
        Numeric data exposing ``min()`` and ``max()``.

    Returns
    -------
    Object of the same kind as X holding the rescaled values.
    """
    X_Min = X.min()
    X_Range = X.max() - X_Min
    if isinstance(X_Range, pd.Series):
        # Per-column ranges: replace only the zero entries.
        X_Range = X_Range.where(X_Range != 0, 1)
    elif X_Range == 0:
        X_Range = 1
    X_Standardized = (X - X_Min) / X_Range
    return X_Standardized



#### Use 10-fold cross validation to find the best tuning parameter - C. Use 'liblinear' and l1 penalty

#### Localize your search around the value you found above

#### Now use the best C you found above and repeat your analysis and check your coefficients

#### If you would like to drop 2 variables from your analysis, which variables are you going to choose?

Answer: 

#### Compare the results you found above with the p-values from the statsmodels API. Do you see the same story there?

Answer: 