In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize, StandardScaler, MinMaxScaler
from collections import defaultdict
from sklearn.model_selection import ShuffleSplit,train_test_split, cross_val_score, GridSearchCV

In [2]:
# Local copy of the Statlog German credit data set.
# url = "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
file = 'german.data'

# Column labels: the 20 attributes followed by the target column.
names = [
    'existingchecking', 'duration', 'credithistory', 'purpose',
    'creditamount', 'savings', 'employmentsince', 'installmentrate',
    'statussex', 'otherdebtors', 'residencesince', 'property',
    'age', 'otherinstallmentplans', 'housing', 'existingcredits',
    'job', 'peopleliable', 'telephone', 'foreignworker',
    'classification',
]

# The file is space-separated and carries no header row, so the labels are
# supplied explicitly.
data = pd.read_csv(file, sep=' ', header=None, names=names)
print(data.shape)
print(data.columns)
data.head(10)

(1000, 21)
Index(['existingchecking', 'duration', 'credithistory', 'purpose',
       'creditamount', 'savings', 'employmentsince', 'installmentrate',
       'statussex', 'otherdebtors', 'residencesince', 'property', 'age',
       'otherinstallmentplans', 'housing', 'existingcredits', 'job',
       'peopleliable', 'telephone', 'foreignworker', 'classification'],
      dtype='object')


Unnamed: 0,existingchecking,duration,credithistory,purpose,creditamount,savings,employmentsince,installmentrate,statussex,otherdebtors,...,property,age,otherinstallmentplans,housing,existingcredits,job,peopleliable,telephone,foreignworker,classification
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
5,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1
6,A14,24,A32,A42,2835,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,1
7,A12,36,A32,A41,6948,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,1
8,A14,12,A32,A43,3059,A64,A74,2,A91,A101,...,A121,61,A143,A152,1,A172,1,A191,A201,1
9,A12,30,A34,A40,5234,A61,A71,4,A94,A101,...,A123,28,A143,A152,2,A174,1,A191,A201,2


In [3]:
# Binarize the y output for easier use of e.g. ROC curves -> 0 = 'bad' credit; 1 = 'good' credit
# Assign the result back to the column instead of calling replace(..., inplace=True)
# on `data.classification`: the in-place call acts on an intermediate Series, which
# newer pandas versions (Copy-on-Write) treat as a copy, leaving `data` unchanged.
# Only the code 2 needs remapping (1 already means 'good'), so re-running this
# cell is a no-op.
data['classification'] = data['classification'].replace({2: 0})
# Print number of 'good' credits (should be 700) and 'bad' credits (should be 300)
data['classification'].value_counts()

1    700
0    300
Name: classification, dtype: int64

In [4]:
# Labels of the numerical columns; the target 'classification' is listed last.
numvars = [
    'creditamount', 'duration', 'installmentrate', 'residencesince',
    'age', 'existingcredits', 'peopleliable', 'classification',
]

# Standardization: scale every numerical predictor (the target is excluded).
# fit_transform returns a plain array, so the resulting frame has default
# integer column labels rather than the original column names.
numeric_predictors = data[numvars].drop(columns=['classification'])
numdata_std = pd.DataFrame(StandardScaler().fit_transform(numeric_predictors))

In [5]:
# Labels of the categorical columns
catvars = [
    'existingchecking', 'credithistory', 'purpose', 'savings',
    'employmentsince', 'statussex', 'otherdebtors', 'property',
    'otherinstallmentplans', 'housing', 'job', 'telephone',
    'foreignworker',
]

# One LabelEncoder per column, created on first lookup by column name and kept
# in `d` so the fitted encoders stay accessible afterwards.
d = defaultdict(LabelEncoder)

# Integer-encode each categorical column with its own encoder
lecatdata = data[catvars].apply(lambda col: d[col.name].fit_transform(col))

# Show, per column, the original codes followed by their integer encoding
for name in catvars:
    print(name, ": ", data[name].unique())
    print(name, ": ", lecatdata[name].unique())

# One-hot encoding: one dummy column for every category of every categorical variable
dummyvars = pd.get_dummies(data[catvars])

existingchecking :  ['A11' 'A12' 'A14' 'A13']
existingchecking :  [0 1 3 2]
credithistory :  ['A34' 'A32' 'A33' 'A30' 'A31']
credithistory :  [4 2 3 0 1]
purpose :  ['A43' 'A46' 'A42' 'A40' 'A41' 'A49' 'A44' 'A45' 'A410' 'A48']
purpose :  [4 7 3 0 1 9 5 6 2 8]
savings :  ['A65' 'A61' 'A63' 'A64' 'A62']
savings :  [4 0 2 3 1]
employmentsince :  ['A75' 'A73' 'A74' 'A71' 'A72']
employmentsince :  [4 2 3 0 1]
statussex :  ['A93' 'A92' 'A91' 'A94']
statussex :  [2 1 0 3]
otherdebtors :  ['A101' 'A103' 'A102']
otherdebtors :  [0 2 1]
property :  ['A121' 'A122' 'A124' 'A123']
property :  [0 1 3 2]
otherinstallmentplans :  ['A143' 'A141' 'A142']
otherinstallmentplans :  [2 0 1]
housing :  ['A152' 'A153' 'A151']
housing :  [1 2 0]
job :  ['A173' 'A172' 'A174' 'A171']
job :  [2 1 3 0]
telephone :  ['A192' 'A191']
telephone :  [1 0]
foreignworker :  ['A201' 'A202']
foreignworker :  [0 1]


In [6]:
# Place the raw numerical columns (target included) next to the one-hot
# dummy columns to form the modelling frame.
data_clean = pd.concat((data.loc[:, numvars], dummyvars), axis='columns')

print(data_clean.shape)

(1000, 62)


In [7]:
# Display the assembled frame: numerical columns, the target, then the dummy columns.
data_clean

Unnamed: 0,creditamount,duration,installmentrate,residencesince,age,existingcredits,peopleliable,classification,existingchecking_A11,existingchecking_A12,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreignworker_A201,foreignworker_A202
0,1169,6,4,4,67,2,1,1,1,0,...,1,0,0,0,1,0,0,1,1,0
1,5951,48,2,2,22,1,1,0,0,1,...,1,0,0,0,1,0,1,0,1,0
2,2096,12,2,3,49,1,2,1,0,0,...,1,0,0,1,0,0,1,0,1,0
3,7882,42,2,4,45,1,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
4,4870,24,3,4,53,2,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1736,12,3,4,31,1,1,1,0,0,...,1,0,0,1,0,0,1,0,1,0
996,3857,30,4,4,40,1,1,1,1,0,...,1,0,0,0,0,1,0,1,1,0
997,804,12,4,4,38,1,1,1,0,0,...,1,0,0,0,1,0,1,0,1,0
998,1845,45,4,4,23,1,1,0,1,0,...,0,1,0,0,1,0,0,1,1,0


In [8]:
# Unscaled, unnormalized data
# Separate the target from the predictors, then hold out 20% of the rows for
# testing; the fixed random_state keeps the split reproducible.
y_clean = data_clean['classification']
X_clean = data_clean.drop(columns='classification')
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=1
)

### Models

###### Linear

In [9]:
from sklearn.linear_model import LinearRegression
def linear(train, train_y, test, test_y):
    """Fit a LinearRegression model and score it on both splits.

    Parameters
    ----------
    train, test : feature matrices used for fitting and for evaluation.
    train_y, test_y : targets aligned with ``train`` and ``test``.

    Returns
    -------
    tuple
        ``(clf, train_score, test_score)``: the fitted estimator and the
        value of ``clf.score`` on the training and test data.

    Notes
    -----
    ``LinearRegression.score`` is the coefficient of determination (R^2),
    not a classification accuracy, so these values are not directly
    comparable with the ``score`` of a classifier.
    """
    clf = LinearRegression().fit(train, train_y)
    train_score = clf.score(train, train_y)
    test_score = clf.score(test, test_y)
    return clf, train_score, test_score



###### Logistic

In [10]:
from sklearn.linear_model import LogisticRegression

def logistic(train, train_y, test, test_y, max_iter=5000):
    """Fit a LogisticRegression classifier and score it on both splits.

    Parameters
    ----------
    train, test : feature matrices used for fitting and for evaluation.
    train_y, test_y : class labels aligned with ``train`` and ``test``.
    max_iter : int, default 5000
        Iteration budget handed to the solver. The estimator's own default
        budget was exhausted on the unscaled features ("TOTAL NO. of
        ITERATIONS REACHED LIMIT"), leaving a model that had not converged.
        Raise this further, or standardize the features, if the warning
        still appears.

    Returns
    -------
    tuple
        ``(clf, train_score, test_score)``: the fitted estimator and its
        mean accuracy on the training and test data.
    """
    clf = LogisticRegression(random_state=0, max_iter=max_iter).fit(train, train_y)
    train_score = clf.score(train, train_y)
    test_score = clf.score(test, test_y)
    return clf, train_score, test_score



###### Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
def gradient_boosted(train, train_y, test, test_y):
    """Fit a GradientBoostingClassifier and score it on both splits.

    The ensemble uses 100 depth-1 trees with a learning rate of 1.0 and a
    fixed ``random_state`` so repeated runs give the same model.

    Parameters
    ----------
    train, test : feature matrices used for fitting and for evaluation.
    train_y, test_y : class labels aligned with ``train`` and ``test``.

    Returns
    -------
    tuple
        ``(clf, train_score, test_score)``: the fitted estimator and its
        mean accuracy on the training and test data.
    """
    clf = GradientBoostingClassifier(
        n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    ).fit(train, train_y)
    train_score = clf.score(train, train_y)
    test_score = clf.score(test, test_y)
    return clf, train_score, test_score



###### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
def random_forrest(train, train_y, test, test_y):
    """Fit a RandomForestClassifier and score it on both splits.

    Trees are limited to depth 2 and ``random_state`` is fixed so repeated
    runs give the same model.

    Parameters
    ----------
    train, test : feature matrices used for fitting and for evaluation.
    train_y, test_y : class labels aligned with ``train`` and ``test``.

    Returns
    -------
    tuple
        ``(clf, train_score, test_score)``: the fitted estimator and its
        mean accuracy on the training and test data.
    """
    clf = RandomForestClassifier(max_depth=2, random_state=0).fit(train, train_y)
    train_score = clf.score(train, train_y)
    test_score = clf.score(test, test_y)
    return clf, train_score, test_score

###### MLP

In [13]:
from sklearn.neural_network import MLPClassifier

def MLP(train, train_y, test, test_y):
    """Fit an MLPClassifier and score it on both splits.

    The network has four hidden layers of 6, 32, 32 and 6 units and is
    trained with the lbfgs solver (``alpha=.1``, at most 300 iterations,
    fixed ``random_state``).

    Parameters
    ----------
    train, test : feature matrices used for fitting and for evaluation.
    train_y, test_y : class labels aligned with ``train`` and ``test``.

    Returns
    -------
    tuple
        ``(clf, train_score, test_score)``: the fitted estimator and its
        mean accuracy on the training and test data.
    """
    clf = MLPClassifier(
        solver='lbfgs',
        alpha=.1,
        hidden_layer_sizes=(6, 32, 32, 6),
        random_state=1,
        max_iter=300,
    )
    clf.fit(train, train_y)
    train_score = clf.score(train, train_y)
    test_score = clf.score(test, test_y)
    return clf, train_score, test_score



###### Fit all models

In [14]:
# Every model is fitted and scored on the same unscaled train/test split.
clean_split = (X_train_clean, y_train_clean, X_test_clean, y_test_clean)

linear_model, lin_train_acc, lin_test_acc = linear(*clean_split)
logistic_model, log_train_acc, log_test_acc = logistic(*clean_split)
gradient_boosted_model, grad_train_acc, grad_test_acc = gradient_boosted(*clean_split)
rf_model, rf_train_acc, rf_test_acc = random_forrest(*clean_split)
mlp_model, mlp_train_acc, mlp_test_acc = MLP(*clean_split)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
