In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns
from patsy import build_design_matrices, dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

%matplotlib inline

In [2]:
# Read the credit data set (path is relative to the notebook's working
# directory) and show which columns are available.
credit_csv = "Credit.csv"
df = pd.read_csv(credit_csv)
df.columns

Index(['Unnamed: 0', 'Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
       'Gender', 'Student', 'Married', 'Ethnicity', 'Balance'],
      dtype='object')

# 1. Income greater than 50

---

In [3]:
# Binary classification target: 1 where Income is strictly above the
# threshold, 0 otherwise (missing incomes compare False and therefore map to 0,
# exactly as in an element-wise `1 if x > 50 else 0`).
# The comparison is vectorized instead of looping over the Series in Python;
# int64 is requested explicitly so the dtype is the same on every platform.
INCOME_THRESHOLD = 50
df['incomeGreaterThan50'] = (df['Income'] > INCOME_THRESHOLD).astype('int64')

In [4]:
# Random train/validation split: each row lands in the training set with
# probability 0.8, so the split is roughly (not exactly) 80/20.
# A seeded, local RandomState makes the split -- and every metric computed
# from it further down -- identical on each "Restart & Run All", without
# touching NumPy's global random state.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
split_rng = np.random.RandomState(SPLIT_SEED)
mask = split_rng.rand(len(df)) < TRAIN_FRACTION
training = df[mask]
validation = df[~mask]

In [5]:
# Design matrices for the binary "income > 50" classifier.
# patsy turns the formula into a response frame (y) and a predictor frame (x);
# by default the predictor frame includes an Intercept column, and
# non-numeric columns (presumably Ethnicity here -- confirm its dtype) are
# expanded into dummy columns based on the levels found in the data.
model_formula = 'incomeGreaterThan50 ~ Rating + Education + Ethnicity + Balance'
yTrain, xTrain = dmatrices(model_formula, data=training, return_type='dataframe')

# Build the validation matrices from the *training* design rather than from a
# second, independent dmatrices() call. An independent call re-derives the
# categorical levels from the validation rows alone, so a level missing from
# the (random) validation split would silently change the column layout and
# no longer line up with what the model was fitted on. Reusing design_info
# guarantees identical columns in identical order, and raises an error if the
# validation data contains a level that was never seen during training.
yValid, xValid = build_design_matrices(
    [yTrain.design_info, xTrain.design_info],
    validation,
    return_type='dataframe',
)

In [6]:
# C is the inverse regularization strength, so a very large value leaves the
# fit essentially unpenalized. No intercept is fitted by sklearn itself;
# presumably the design matrix already supplies one as a column -- confirm.
model = LogisticRegression(C=1e9, fit_intercept=False)

# Flatten the single-column response into the 1-D vector sklearn expects.
yTrain = np.ravel(yTrain)

# fit() returns the fitted estimator; it is kept under `result` as well.
result = model.fit(xTrain, yTrain)



In [7]:
# Compute each set of predictions once and reuse it for the metrics below.
train_pred = model.predict(xTrain)
train_proba = model.predict_proba(xTrain)
valid_pred = model.predict(xValid)

# Accuracy on the training set
print("Training accuracy: ", accuracy_score(yTrain, train_pred))

# Log-loss on the training set -> needs class probabilities (predict_proba)
print("Training log-loss", log_loss(yTrain, train_proba))

# Accuracy on the validation set
print("Validation accuracy: ", accuracy_score(yValid, valid_pred))

# Confusion matrix on the validation set (rows: true class, columns: predicted)
print("Confusion_matrix: \n", confusion_matrix(yValid, valid_pred))

Training accuracy:  0.937888198757764
Training log-loss 0.19915459169965818
Validation accuracy:  0.9487179487179487
Confusion_matrix: 
 [[52  1]
 [ 3 22]]


# 2. Number of credit cards

---

In [8]:
# Design matrices for the card-count model: the response is Cards and the
# predictors are Limit, Rating and Income. The same formula and options are
# applied to the training rows and then to the validation rows.
model_formula = 'Cards ~ Limit + Rating + Income'
matrix_options = {'return_type': 'dataframe'}
yTrain, xTrain = dmatrices(model_formula, training, **matrix_options)
yValid, xValid = dmatrices(model_formula, validation, **matrix_options)

In [9]:
# Same estimator settings as before: a very large C (inverse regularization
# strength) and no sklearn-fitted intercept.
model = LogisticRegression(C=1e9, fit_intercept=False)

# sklearn wants the response as a flat 1-D array, not a one-column frame.
yTrain = np.asarray(yTrain).reshape(-1)

# Each distinct Cards value in yTrain is treated as its own class.
result = model.fit(xTrain, yTrain)



In [10]:
# Predict once per data set; the metrics below reuse these arrays.
cards_train_pred = model.predict(xTrain)
cards_train_proba = model.predict_proba(xTrain)
cards_valid_pred = model.predict(xValid)

# Accuracy on the training set
print("Training accuracy: ", accuracy_score(yTrain, cards_train_pred))

# Log-loss on the training set -> computed from predict_proba() output
print("Training log-loss", log_loss(yTrain, cards_train_proba))

# Accuracy on the validation set
print("Validation accuracy: ", accuracy_score(yValid, cards_valid_pred))

# Confusion matrix on the validation set (rows: true class, columns: predicted)
print("Confusion_matrix: \n", confusion_matrix(yValid, cards_valid_pred))

Training accuracy:  0.36335403726708076
Training log-loss 1.4930430128680048
Validation accuracy:  0.32051282051282054
Confusion_matrix: 
 [[ 2  2  1  0  0  0  0  0]
 [ 0 11 10  0  0  0  1  0]
 [ 2  5 12  0  2  0  0  0]
 [ 0 10  9  0  1  0  0  0]
 [ 0  2  4  0  0  0  0  1]
 [ 0  0  2  0  0  0  0  0]
 [ 0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]


In [11]:
# Summary statistics of the Cards column over the full data set, to put the
# classifier's results above into context.
cards = df['Cards']
cards.describe()

count    400.000000
mean       2.957500
std        1.371275
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        9.000000
Name: Cards, dtype: float64