# Importing Packages

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns 
import matplotlib.pyplot as plt

# Exploring the data and evaluating data quality

In [3]:
df = pd.read_excel("German_creditscoring.xls")



In [4]:
df

Unnamed: 0,Creditability,Balance of current account,Duration in months,Payment of previous credits,Purpose of credit,Amount of credit in DM,Value of savings or stocks,Has been employed by current employer for,Installment in % of available income,Marital Status/Sex,...,Most valuable available assets,Age in years,Further running credits,Type of apartment,Number of previous credits at this bank,Occupation,Number of persons entitled to maintainance,Telephone,Foreign worker,NewVar
0,bad,no running account,36,no problems with current credits at this bank,retraining,2145,no savings,between 4 and 7 years,between 25 and 35,male:single,...,savings contract with a building society/Life ...,24,no further running credits,rented,2 or 3,skilled worker/skilled employee/minor civil se...,3 and more,yes,yes,2.960619
1,good,no balance,48,hesistant payment of previous credits,retraining,12204,greater than 1000 DM,between 1 and 4 years,between 25 and 35,male:single,...,savings contract with a building society/Life ...,48,at other banks,rented,one,executive/self-employed/higher civil servant,3 and more,yes,yes,2.879810
2,bad,>=200 DM,36,no previous credits or paid back,used car,10974,no savings,unemployed,less than 20,female:divorced/living apart/married,...,savings contract with a building society/Life ...,26,no further running credits,rented,2 or 3,executive/self-employed/higher civil servant,3 and more,yes,yes,2.592734
3,good,no running account,24,paid back previous credits at this bank,new car,6419,no savings,greater than 7 years,between 25 and 35,female:divorced/living apart/married,...,ownership of house or land,44,no further running credits,owner,2 or 3,executive/self-employed/higher civil servant,0 to 2,yes,yes,2.576883
4,good,>=200 DM,24,no previous credits or paid back,retraining,1258,no savings,between 4 and 7 years,less than 20,male:single,...,no assets,25,no further running credits,rented,one,skilled worker/skilled employee/minor civil se...,3 and more,yes,yes,2.527170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,bad,>=200 DM,21,hesistant payment of previous credits,other,5003,greater than 1000 DM,between 1 and 4 years,greater than 35,female:divorced/living apart/married,...,car/other,29,at other banks,rented,2 or 3,skilled worker/skilled employee/minor civil se...,3 and more,yes,yes,-2.517569
996,good,no balance,30,no previous credits or paid back,used car,3832,no savings,less than 1 year,between 25 and 35,male:married/widowed,...,car/other,22,no further running credits,rented,one,skilled worker/skilled employee/minor civil se...,3 and more,no,yes,-2.573962
997,good,no balance,9,paid back previous credits at this bank,items of furniture,1154,no savings,greater than 7 years,between 25 and 35,male:single,...,no assets,37,no further running credits,rented,4 or 5,unskilled with permanant residence,3 and more,no,yes,-2.697261
998,good,>=200 DM,24,paid back previous credits at this bank,household appliances,2058,no savings,between 1 and 4 years,less than 20,male:divorced/living apart,...,no assets,33,no further running credits,rented,2 or 3,skilled worker/skilled employee/minor civil se...,3 and more,yes,yes,-2.889172


In [5]:
for column in df.columns.values:
    print()
    print(column)
    print(df[column].unique())
    


Creditability
['bad' 'good']

Balance of current account
['no running account' 'no balance' '>=200 DM' '<= 200 DM']

Duration in months
[36 48 24 12 30 15 27 18  6 42 10 33 16  9 60 11 21 13 26  4 45 39  8 54
 14 20 28  7 22 72  5 40 47]

Payment of previous credits
['no problems with current credits at this bank'
 'hesistant payment of previous credits'
 'no previous credits or paid back'
 'paid back previous credits at this bank' 'problematic running accounts']

Purpose of credit
['retraining' 'used car' 'new car' 'items of furniture' 'repair' 'other'
 'business' 'radio/television' 'household appliances' 'vacation']

Amount of credit in DM
[ 2145 12204 10974  6419  1258  1037  3108  1537  1471  2520  4057   750
  4455  1743  1893  7166  4788  1837  3878  5381  3399  2028  5302  1884
  1309  7582  2064  7253  1567  1155  7855  1289  1169  4249  7393  2625
   741  2134  2570  1382  3965  1881  2118  1478  2670   841  5324  3416
  1521  1216  3148  4113  1249  2978  2292  1264  3780  3

In [6]:
df["Creditability"] = df.apply(lambda row: 1 if row["Creditability"] == "good" else 0, axis=1)

In [7]:
def balance(row):
    if row["Balance of current account"] == '>=200 DM':
        return 2
    elif row["Balance of current account"] == '<=200 DM':
        return 1
    else: 
        return 0

df["Balance of current account"] = df.apply(lambda row: balance(row), axis=1)

def payments(row):
    if row["Payment of previous credits"] == 'no problems with current credits at this bank':
        return 0
    elif row["Payment of previous credits"] == 'hesistant payment of previous credits':
        return 1
    elif row["Payment of previous credits"] == 'no previous credits or paid back':
        return 2
    elif row["Payment of previous credits"] == 'paid back previous credits at this bank':
        return 3
    else:
        return 4

df["Payment of previous credits"] = df.apply(lambda row: payments(row), axis=1)

def purpose(row):
    if row["Purpose of credit"] == 'retraining':
        return 0
    elif row["Purpose of credit"] == 'used car':
        return 1
    elif row["Purpose of credit"] == 'new car':
        return 2
    elif row["Purpose of credit"] == 'repair':
        return 3
    elif row["Purpose of credit"] == 'other':
        return 4
    elif row["Purpose of credit"] == 'business':
        return 5
    elif row["Purpose of credit"] == 'household appliances':
        return 6
    elif row["Purpose of credit"] == 'radio/television':
        return 7
    else:
        return 8

df["Purpose of credit"] = df.apply(lambda row: purpose(row), axis=1)



def stocks(row):
    if row["Value of savings or stocks"] == 'no savings':
        return 0
    elif row["Value of savings or stocks"] =='less than 100 DM' :
        return 1
    elif row["Value of savings or stocks"] =='between 100 and 500 DM' :
        return 2
    elif row["Value of savings or stocks"] == 'between 500 and 1000 DM':
        return 3
    elif row["Value of savings or stocks"] == 'greater than 1000 DM':
        return 4

df["Value of savings or stocks"] = df.apply(lambda row: stocks(row), axis=1)


# def workyears(row, column):
#     if row[column] == 'unemployed':
#         return 0
#     elif row[column] =='less than 1 year':
#         return 1
#     elif row[column] =='between 1 and 4 years':
#         return 2
#     elif row[column] =='between 4 and 7 years' :
#         return 3
#     elif row[column] =='greater than 7 years'  :
#         return 4

# df["Has been employed by current employer for"] = df.apply(lambda row: workyears(row, "Has been employed by current employer for"), axis=1)


def workyears(row, column):
    if row[column] == 'unemployed':
        return 0
    elif row[column] =='less than 1 year':
        return 1
    elif row[column] =='between 1 and 4 years':
        return 2
    elif row[column] =='between 4 and 7 years' :
        return 3
    elif row[column] =='greater than 7 years'  :
        return 4

df["Has been employed by current employer for"] = df.apply(lambda row: workyears(row, "Has been employed by current employer for"), axis=1)


def percentIncome(row, column):
    if row[column] == 'less than 20':
        return 0
    elif row[column] =='between 20 and 25' :
        return 1
    elif row[column] =='between 25 and 35':
        return 2
    elif row[column] =='greater than 35' :
        return 3
    else:
        return 4

df["Installment in % of available income"] = df.apply(lambda row: percentIncome(row, "Installment in % of available income"), axis=1)


def sex(row, column):
    if row[column] == 'male:single':
        return 0
    elif row[column] == 'male:divorced/living apart':
        return 1
    elif row[column] ==  'male:married/widowed':
        return 2
    elif row[column] ==  'female:divorced/living apart/married':
        return 3
    else:
        return 4

df["Marital Status/Sex"] = df.apply(lambda row: sex(row, "Marital Status/Sex"), axis=1)




def debtors(row, column):
    if row[column] == 'none':
        return 0
    elif row[column] ==  'co-applicant':
        return 1
    elif row[column] ==  'guarantor':
        return 2
    else:
        return 4

df["Further debtors/Guarantors"] = df.apply(lambda row: debtors(row, "Further debtors/Guarantors"), axis=1)



def livin(row, column):
    if row[column] == 'less than 1 year':
        return 0
    elif row[column] == 'between 1 and 4 years':
        return 1
    elif row[column] ==  'between 4 and 7 years':
        return 2
    elif row[column] ==  'greater than 7 years':
        return 3
    else:
        return 4

df["Living in current household for"] = df.apply(lambda row: livin(row, "Living in current household for"), axis=1)



def valuableAssets(row, column):
    if row[column] == 'no assets':
        return 0
    elif row[column] == 'car/other':
        return 1
    elif row[column] == 'savings contract with a building society/Life insurance' :
        return 2
    elif row[column] ==  'ownership of house or land':
        return 3
    else:
        return 4

df["Most valuable available assets"] = df.apply(lambda row: valuableAssets(row, "Most valuable available assets"), axis=1)



def othercreds(row, column):
    if row[column] == 'no further running credits':
        return 0
    elif row[column] == 'at department store or mail order house':
        return 1
    elif row[column] ==  'at other banks':
        return 2
    else:
        return 4

df["Further running credits"] = df.apply(lambda row: othercreds(row, "Further running credits"), axis=1)





def apart(row, column):
    if row[column] == 'free apartment':
        return 0
    elif row[column] == 'rented':
        return 1
    elif row[column] ==  'owner':
        return 2
    else:
        return 4

df["Type of apartment"] = df.apply(lambda row: apart(row, "Type of apartment"), axis=1)





def prevCreds(row, column):
    if row[column] == 'one':
        return 1
    elif row[column] == '2 or 3':
        return 2
    elif row[column] ==  '4 or 5':
        return 3
    elif row[column] ==  '6 or more':
        return 4
    else:
        return 0

df["Number of previous credits at this bank"] = df.apply(lambda row: prevCreds(row, "Number of previous credits at this bank"), axis=1)






def Occupation(row, column):
    if row[column] == 'unemployed/unskilled with no permaant residence':
        return 0
    elif row[column] == 'unskilled with permanant residence':
        return 1
    elif row[column] ==  'skilled worker/skilled employee/minor civil servant':
        return 2
    elif row[column] == 'executive/self-employed/higher civil servant' :
        return 3
    else:
        return 4

df["Occupation"] = df.apply(lambda row: Occupation(row, "Occupation"), axis=1)






def maintainence(row, column):
    if row[column] == '3 and more':
        return 0
    elif row[column] == '0 to 2':
        return 1
    else:
        return 4

df["Number of persons entitled to maintainance"] = df.apply(lambda row: maintainence(row, "Number of persons entitled to maintainance"), axis=1)




def yesno(row, column):
    if row[column] == "no":
        return 0
    elif row[column] == "yes":
        return 1
    else:
        return 4

df["Telephone"] = df.apply(lambda row: yesno(row, "Telephone"), axis=1)
df["Foreign worker"] = df.apply(lambda row: yesno(row, "Foreign worker"), axis=1)


In [34]:
df = df[['Creditability','Balance of current account','Duration in months','Payment of previous credits','Amount of credit in DM','Value of savings or stocks','Has been employed by current employer for','Installment in % of available income','Most valuable available assets','Age in years','Further running credits',]]

In [35]:
df.corr()

Unnamed: 0,Creditability,Balance of current account,Duration in months,Payment of previous credits,Amount of credit in DM,Value of savings or stocks,Has been employed by current employer for,Installment in % of available income,Most valuable available assets,Age in years,Further running credits
Creditability,1.0,0.322436,-0.214927,0.065076,-0.15474,0.178943,0.116002,0.072404,-0.142612,0.091272,-0.109844
Balance of current account,0.322436,1.0,-0.063467,0.042277,-0.03948,0.200384,0.121506,-0.024961,-0.031296,0.061042,-0.078495
Duration in months,-0.214927,-0.063467,1.0,-0.130732,0.624988,0.047661,0.057381,-0.074749,0.303971,-0.03755,0.062884
Payment of previous credits,0.065076,0.042277,-0.130732,1.0,-0.118916,0.029956,0.065368,-0.055013,-0.016168,0.080196,0.06631
Amount of credit in DM,-0.15474,-0.03948,0.624988,-0.118916,1.0,0.064632,-0.008376,0.271322,0.311602,0.032273,0.069392
Value of savings or stocks,0.178943,0.200384,0.047661,0.029956,0.064632,1.0,0.12095,-0.021993,0.018948,0.083434,-0.001908
Has been employed by current employer for,0.116002,0.121506,0.057381,0.065368,-0.008376,0.12095,1.0,-0.126161,0.087187,0.259116,0.007279
Installment in % of available income,0.072404,-0.024961,-0.074749,-0.055013,0.271322,-0.021993,-0.126161,1.0,-0.053391,-0.057271,0.007894
Most valuable available assets,-0.142612,-0.031296,0.303971,-0.016168,0.311602,0.018948,0.087187,-0.053391,1.0,0.074551,0.107593
Age in years,0.091272,0.061042,-0.03755,0.080196,0.032273,0.083434,0.259116,-0.057271,0.074551,1.0,0.030472


In [145]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'Creditability'], df["Creditability"], test_size=0.3, random_state=1)

In [146]:
import imblearn
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X,y = oversample.fit_resample(X_train, y_train)

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [148]:
y_train.value_counts()

0    343
1    338
Name: Creditability, dtype: int64

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

In [40]:
df["Creditability"].value_counts()

1    700
0    300
Name: Creditability, dtype: int64

In [41]:
df.isnull().astype(int).sum()

Creditability                                0
Balance of current account                   0
Duration in months                           0
Payment of previous credits                  0
Amount of credit in DM                       0
Value of savings or stocks                   0
Has been employed by current employer for    0
Installment in % of available income         0
Most valuable available assets               0
Age in years                                 0
Further running credits                      0
dtype: int64

In [42]:
from sklearn import svm, datasets
clf = svm.SVC(C= 1, kernel = "linear")
clf.fit(X_train, y_train)
results = clf.predict(X_test)
print(np.mean(results == y_test))

0.7245508982035929


In [32]:
results

array([0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,

In [33]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
y_pred = cross_val_predict(clf, X_test, y_test, cv=10)
conf_mat = confusion_matrix(y_test, y_pred)

KeyboardInterrupt: 

In [None]:
print(conf_mat)

In [21]:
clf.best_params_.items()

dict_items([('C', 1), ('kernel', 'linear')])

In [105]:

from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(X_train, y_train)

results = linreg.predict(X_test)
print(np.mean(results == y_test))


0.0


In [103]:
resultsCorrect = []

knn = KNeighborsClassifier()
knn.fit(X_train, y_train) 
KNeighborsClassifier(algorithm='auto', leaf_size=15, metric='minkowski',
       metric_params=None, n_jobs=1, n_neighbors=2, p=2,
       weights='uniform')
results = knn.predict(X_test)
print(np.mean(results == y_test))



0.6507936507936508


In [150]:
y_check = list(y_test)

true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
i = 0
while i < len(results):
    if results[i] == 1 & y_check[i] == 1:
        true_positive += 1
    elif results[i] == 1 & y_check[i] == 0:
        false_positive += 1
    elif results[i] == 0 & y_check[i] == 0:
        true_negative += 1
    else: 
        false_negative += 1
    i += 1
    



print("True Positive "+str(true_positive))

print("True Negative "+str(true_negative))

print("False Positive "+str(false_positive))

print("False Negative "+str(false_negative))

print("Total Profit: "+ str(true_positive*.35+false_positive*-1))

True Positive 141
True Negative 8
False Positive 20
False Negative 124
Total Profit: 29.349999999999994


In [149]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='adam', alpha=.0001, hidden_layer_sizes=(9),max_iter=100000, random_state=6)

clf.fit(X_train, y_train)



results = clf.predict(X_test)
print(np.mean(results == y_test))


0.5494880546075085


In [143]:
results

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0], dtype=int64)

In [None]:
y_train

In [32]:
y_test

521    1
737    0
740    1
660    1
411    0
      ..
506    1
342    1
485    0
711    1
133    1
Name: Creditability, Length: 330, dtype: int64

In [2]:
import pandas as pd

In [3]:
pubg_df = pd.read_csv("./PUBG_Player_Statistic.csv")

FileNotFoundError: File b'./PUBG_Player_Statistic.csv' does not exist