In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

from patsy import dmatrices

%matplotlib inline

In [2]:
df = pd.read_csv("Credit.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
       'Gender', 'Student', 'Married', 'Ethnicity', 'Balance'],
      dtype='object')

In [4]:
def compareModels(Y, X,n_neighbors=5, max_depth=None):
    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=n_neighbors)),
        ('DT', DecisionTreeClassifier(max_depth=max_depth))
             ]
    results = []
    names = []
    scoring = 'accuracy'
    seed=4
    
    results = []

    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        #results.append(cv_results)
        names.append(name)
        #print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
        results.append((n_neighbors, max_depth, cv_results.mean()))
    return results

In [5]:
df['IncomeGreaterThan50'] = [1 if x > 50 else 0 for x in df['Income']]

In [6]:
def prepDataSet(model_formula): 
    Y, X = dmatrices(model_formula, data=df, return_type='dataframe')
    Y = np.array(Y).reshape(len(Y),)
    return Y, X

In [7]:
print('Income greater than 50')
Y, X = prepDataSet('IncomeGreaterThan50 ~ Limit + Balance + Rating + Education')
#compareModels(Y, X)

Income greater than 50


In [8]:
results = []
for nn, maxDepth in zip(range(1,50), range(1,50)):
    results.append(compareModels(Y,X,nn,maxDepth))
print("KNN: ",max(results, key=lambda el: el[0][2])[0])
print("DT: ",max(results, key=lambda el: el[1][2])[0])

KNN:  (1, 1, 0.9400000000000001)
DT:  (46, 46, 0.85)


In [9]:
print('\n# of cards')
Y, X = prepDataSet('Cards ~ Gender + Ethnicity')
compareModels(Y, X,12,5)


# of cards


[(12, 5, 0.24749999999999997), (12, 5, 0.2475)]

In [11]:
results

[[(1, 1, 0.9400000000000001), (1, 1, 0.85)],
 [(2, 2, 0.9375), (2, 2, 0.85)],
 [(3, 3, 0.9375), (3, 3, 0.8899999999999999)],
 [(4, 4, 0.9349999999999999), (4, 4, 0.885)],
 [(5, 5, 0.9375), (5, 5, 0.9199999999999999)],
 [(6, 6, 0.9225), (6, 6, 0.9125)],
 [(7, 7, 0.9275), (7, 7, 0.9125)],
 [(8, 8, 0.9299999999999999), (8, 8, 0.9225)],
 [(9, 9, 0.9275), (9, 9, 0.9199999999999999)],
 [(10, 10, 0.9200000000000002), (10, 10, 0.9199999999999999)],
 [(11, 11, 0.915), (11, 11, 0.9175000000000001)],
 [(12, 12, 0.9125), (12, 12, 0.9199999999999999)],
 [(13, 13, 0.9174999999999999), (13, 13, 0.9249999999999998)],
 [(14, 14, 0.9125), (14, 14, 0.9274999999999999)],
 [(15, 15, 0.9149999999999998), (15, 15, 0.9124999999999999)],
 [(16, 16, 0.9025000000000002), (16, 16, 0.9199999999999999)],
 [(17, 17, 0.9025000000000001), (17, 17, 0.9199999999999999)],
 [(18, 18, 0.8975000000000002), (18, 18, 0.9075)],
 [(19, 19, 0.9025000000000001), (19, 19, 0.9175000000000001)],
 [(20, 20, 0.8925000000000001), (20, 