In [36]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the German credit dataset from the notebook's working directory,
# then show summary statistics as the cell output.
german_credit = pd.read_csv(filepath_or_buffer='german_credit_data.csv')
german_credit.describe()

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,personal,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,dependents,telephone,foreign_worker,customer_type
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.577,20.903,2.545,3.277,3271.258,1.105,2.384,2.973,1.682,0.145,...,1.358,35.546,1.675,0.929,1.407,1.904,1.155,0.404,0.037,1.3
std,1.257638,12.058814,1.08312,2.739302,2822.736876,1.580023,1.208306,1.118715,0.70808,0.477706,...,1.050209,11.375469,0.705601,0.531264,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,0.0,4.0,0.0,0.0,250.0,0.0,0.0,1.0,0.0,0.0,...,0.0,19.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
25%,0.0,12.0,2.0,1.0,1365.5,0.0,2.0,2.0,1.0,0.0,...,0.0,27.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0
50%,1.0,18.0,2.0,3.0,2319.5,0.0,2.0,3.0,2.0,0.0,...,1.0,33.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0
75%,3.0,24.0,4.0,4.0,3972.25,2.0,4.0,4.0,2.0,0.0,...,2.0,42.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0,2.0
max,3.0,72.0,4.0,9.0,18424.0,4.0,4.0,4.0,3.0,2.0,...,3.0,75.0,2.0,2.0,4.0,3.0,2.0,1.0,1.0,2.0


In [37]:
# Preview the first three rows to sanity-check column names and encodings.
german_credit.head(n=3)

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,personal,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,dependents,telephone,foreign_worker,customer_type
0,0.0,6.0,4.0,4.0,1169.0,4.0,4.0,4.0,2.0,0.0,...,0.0,67.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0,1.0
1,1.0,48.0,2.0,4.0,5951.0,0.0,2.0,2.0,1.0,0.0,...,0.0,22.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,2.0
2,3.0,12.0,4.0,7.0,2096.0,0.0,3.0,2.0,2.0,0.0,...,0.0,49.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0


In [38]:
# Features: every column from checking_account_status through foreign_worker
# (label-based slice, both ends inclusive). Target: customer_type.
X = german_credit.loc[:, "checking_account_status":"foreign_worker"]
y = german_credit["customer_type"]

# Hold out 30% of the rows for testing; the split is stratified on the target
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

In [39]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Three base learners wrapped as pipelines. pipe1 and pipe3 standardize the
# features before the estimator; pipe2 feeds the raw features to the tree.
pipe1 = make_pipeline(StandardScaler(), Perceptron(eta0=1.0, random_state=1))

pipe2 = make_pipeline(DecisionTreeClassifier(max_depth=10,
                                             criterion='gini',
                                             random_state=0))

# The previous version also passed p=3. scikit-learn only applies `p` when
# metric='minkowski'; with metric='euclidean' it is ignored, so the argument
# was misleading dead configuration. Dropping it leaves the model unchanged.
pipe3 = make_pipeline(StandardScaler(),
                      KNeighborsClassifier(n_neighbors=20,
                                           metric='euclidean'))

clf_labels = ['Perceptron', 'Decision tree', 'KNN']

print("Our classifiers are to predict whether a customer has a good or bad credit rating.")

# Estimate each model's accuracy with 10-fold cross-validation on the
# training split only; the test split is not touched here.
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, pipe2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print(f"Accuracy: {round(scores.mean(), 2)} "
          f"Stdev: {round(scores.std(), 3)} "
          f"[{label}]")

Our classifiers are to predict whether a customer has a good or bad credit rating.
10-fold cross validation:

Accuracy: 0.66 Stdev: 0.051 [Perceptron]
Accuracy: 0.72 Stdev: 0.051 [Decision tree]
Accuracy: 0.72 Stdev: 0.019 [KNN]


In [40]:
from sklearn.ensemble import VotingClassifier

# Combine the three pipelines by hard (majority-label) voting. 'hard' is the
# VotingClassifier default; it is spelled out here to make the choice visible.
mv_clf = VotingClassifier(estimators=[('p', pipe1), ('dt', pipe2), ('kn', pipe3)],
                          voting='hard')

# clf_labels is defined in an earlier cell. Guard the append so that
# re-running this cell does not keep growing the list.
if 'Majority voting' not in clf_labels:
    clf_labels.append('Majority voting')
all_clf = [pipe1, pipe2, pipe3, mv_clf]

# 10-fold cross-validated accuracy on the training split for each base
# learner and for the ensemble.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print(f"Accuracy: {round(scores.mean(), 2)} "
          f"Stdev: {round(scores.std(), 3)} "
          f"[{label}]")

Accuracy: 0.66 Stdev: 0.051 [Perceptron]
Accuracy: 0.72 Stdev: 0.051 [Decision tree]
Accuracy: 0.72 Stdev: 0.019 [KNN]
Accuracy: 0.74 Stdev: 0.036 [Majority voting]


In [41]:
# Fit the perceptron pipeline on the training split (fit returns the fitted
# pipeline, so the call can be chained) and predict the held-out test split.
y_pred = pipe1.fit(X_train, y_train).predict(X_test)

# Report the raw error count, the test-set size, and the accuracy.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', len(y_test))
print('Accuracy:', pipe1.score(X_test, y_test))

Misclassified test set examples: 85
Out of a total of: 300
Accuracy: 0.7166666666666667


In [42]:
# Fit the decision-tree pipeline on the training split (fit returns the fitted
# pipeline, so the call can be chained) and predict the held-out test split.
y_pred = pipe2.fit(X_train, y_train).predict(X_test)

# Report the raw error count, the test-set size, and the accuracy.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', len(y_test))
print('Accuracy:', pipe2.score(X_test, y_test))

Misclassified test set examples: 94
Out of a total of: 300
Accuracy: 0.6866666666666666


In [43]:
# Fit the KNN pipeline on the training split (fit returns the fitted
# pipeline, so the call can be chained) and predict the held-out test split.
y_pred = pipe3.fit(X_train, y_train).predict(X_test)

# Report the raw error count, the test-set size, and the accuracy.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', len(y_test))
print('Accuracy:', pipe3.score(X_test, y_test))

Misclassified test set examples: 77
Out of a total of: 300
Accuracy: 0.7433333333333333


In [44]:
# Fit the voting ensemble on the training split (fit returns the fitted
# estimator, so the call can be chained) and predict the held-out test split.
y_pred = mv_clf.fit(X_train, y_train).predict(X_test)

# Report the raw error count, the test-set size, and the accuracy.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', len(y_test))
print('Accuracy:', mv_clf.score(X_test, y_test))

Misclassified test set examples: 73
Out of a total of: 300
Accuracy: 0.7566666666666667
