In [1]:
# Similar to SparkContext, for SparkSQL you need a SparkSession
from pyspark.sql import SparkSession
# Also all the functions (select, where, groupby) needs to be imported
from pyspark.sql.functions import *

In [2]:
# instantiate spark session
spark = SparkSession.builder.getOrCreate()

### Extract data

In [4]:
df_credit = spark.read.csv("/FileStore/tables/german_credit_data.csv", header=True)

In [5]:
# You can use show(n) to take a look into the dataframe
df_credit.show(5)

In [6]:
df_credit.columns

In [7]:
df_credit.show(10)

In [8]:
# df_credit = spark.read.csv("/FileStore/tables/german_credit_updata.csv", header=True)
df_credit = spark.read.csv("/FileStore/tables/german_credit_data.csv", header=True)

In [9]:
# You can use show(n) to take a look into the dataframe
df_credit.show(5)

In [10]:
from pyspark.ml.feature import Imputer

In [11]:
# You can use show(n) to take a look into the dataframe
df_credit.show(5)

In [12]:
df_credit=df_credit.toPandas()


In [13]:
df_credit.isna().sum()

### Remove checking account

In [15]:
# del df_credit['Checking account']

### Verify null values in data

In [17]:
df_credit.isna().sum()

In [18]:
df_credit['Saving accounts'] = df_credit['Saving accounts'].map({"little":0,"moderate":1,"quite rich":2 ,"rich":3 });
# df_credit['Saving accounts'] = df_credit['Saving accounts'].fillna(df_credit['Saving accounts'].dropna().mean())

# df_credit['Checking account'] = df_credit['Checking account'].map({"little":0,"moderate":1,"rich":2 });
# df_credit['Checking account'] = df_credit['Checking account'].fillna(df_credit['Checking account'].dropna().mean())

df_credit['Sex'] = df_credit['Sex'].map({"male":0,"female":1}).astype(float);

df_credit['Housing'] = df_credit['Housing'].map({"own":0,"free":1,"rent":2}).astype(float);

df_credit['Purpose'] = df_credit['Purpose'].map({'radio/TV':0, 'education':1, 'furniture/equipment':2, 'car':3, 'business':4,
       'domestic appliances':5, 'repairs':6, 'vacation/others':7}).astype(float);

In [19]:
df_credit.head(50)

Unnamed: 0,_c0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,0.0,2,0.0,,little,1169,6,0.0,good
1,1,22,1.0,2,0.0,0.0,moderate,5951,48,0.0,bad
2,2,49,0.0,1,0.0,0.0,,2096,12,1.0,good
3,3,45,0.0,2,1.0,0.0,little,7882,42,2.0,good
4,4,53,0.0,2,1.0,0.0,little,4870,24,3.0,bad
5,5,35,0.0,1,1.0,,,9055,36,1.0,good
6,6,53,0.0,2,0.0,2.0,,2835,24,2.0,good
7,7,35,0.0,3,2.0,0.0,moderate,6948,36,3.0,good
8,8,61,0.0,1,0.0,3.0,,3059,12,0.0,good
9,9,28,0.0,3,0.0,0.0,moderate,5234,30,3.0,bad


In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Exploratory Data Analysis

In [22]:
df_credit = pd.read_csv("/dbfs/FileStore/tables/german_credit_data.csv")
df_credit.head() 

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [23]:
df_credit.isnull().sum()

### Imbalance Check in DataSet

In [25]:
# import seaborn as sns
# import matplotlib.pyplot as plt
plt.figure(figsize=(3,4))
print(df_credit['Risk'].value_counts())
display(sns.countplot(df_credit['Risk']))
# display(sns.catplot(x="Risk", kind="count", data=df_credit))

### Risk v/s Duration

In [27]:
plt.figure(figsize=(3,4))
display(sns.boxplot(x='Duration', y='Risk', data=df_credit))

#### Age v/s Credibility

In [29]:
# Ploting Age Distribution
# plt.figure(figsize=(5,8))
df_good = df_credit[df_credit["Risk"] == 'good']
df_bad = df_credit[df_credit["Risk"] == 'bad']

fig, ax = plt.subplots(nrows=2, figsize=(14,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

g1 = sns.distplot(df_good["Age"], ax=ax[0], 
             color="g")
g1 = sns.distplot(df_bad["Age"], ax=ax[0], 
             color='r')
g1.set_title("Age", fontsize=15)
g1.set_xlabel("Age")
g1.set_xlabel("Frequency")

g2 = sns.countplot(x="Age",data=df_credit, 
              palette="hls", ax=ax[1], 
              hue = "Risk")
g2.set_title("Age Counting by Creditability", fontsize=15)
g2.set_xlabel("Age")
g2.set_xlabel("Count")
display(plt.show())

#### Risk v/s Credit Amount

In [31]:
plt.figure(figsize=(3,3))
display(sns.boxplot(x='Risk', y='Credit amount', data=df_credit))

In [32]:
plt.figure(figsize=(3,3))
display(sns.swarmplot(x='Risk', y='Age', data=df_credit))

#### Duration v/s Credit amount

In [34]:
plt.figure(figsize=(8,4))
# sns.boxplot(x='Duration', y='Credit amount', data=df_credit)
display(sns.boxplot(x='Duration', y='Credit amount', data=df_credit))
# display(plt.figure())

#### Risk V/S Sex

In [36]:
plt.figure(figsize=(4,4))
display(sns.countplot(data=df_credit,x='Risk',hue='Sex'))

#### Risk v/s Credit Amount v/s Sex

In [38]:
plt.figure(figsize=(4,4))
display(sns.boxplot(x='Risk', y='Credit amount', data=df_credit,hue='Sex'))

#### Risk V/s Credit Amount

In [40]:
plt.figure(figsize=(4,4))
display(sns.boxplot(x='Risk', y='Credit amount', data=df_credit))

#### Relation between Features (Heat Map)

In [42]:
_df_credit=df_credit

In [43]:
df_credit['Saving accounts'] = df_credit['Saving accounts'].map({"little":0,"moderate":1,"quite rich":2 ,"rich":3 });
df_credit['Saving accounts'] = df_credit['Saving accounts'].fillna(df_credit['Saving accounts'].dropna().mean())

df_credit['Checking account'] = df_credit['Checking account'].map({"little":0,"moderate":1,"rich":2 });
df_credit['Checking account'] = df_credit['Checking account'].fillna(df_credit['Checking account'].dropna().mean())

df_credit['Sex'] = df_credit['Sex'].map({"male":0,"female":1}).astype(float);

df_credit['Housing'] = df_credit['Housing'].map({"own":0,"free":1,"rent":2}).astype(float);

df_credit['Purpose'] = df_credit['Purpose'].map({'radio/TV':0, 'education':1, 'furniture/equipment':2, 'car':3, 'business':4,
       'domestic appliances':5, 'repairs':6, 'vacation/others':7}).astype(float);

### Verify correlation between features

In [45]:
corr=df_credit.corr()
plt.figure(figsize=(6,6))
_sns=sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.5)], vmax=.8, linewidths=0.01,square=True,annot=True,cmap='GnBu',linecolor="white")
plt.title('Correlation between features')
display(_sns)

In [46]:
df_credit=_df_credit

In [47]:
df_credit.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,0.0,2,0.0,0.456548,0.0,1169,6,0.0,good
1,1,22,1.0,2,0.0,0.0,1.0,5951,48,0.0,bad
2,2,49,0.0,1,0.0,0.0,0.651815,2096,12,1.0,good
3,3,45,0.0,2,1.0,0.0,0.0,7882,42,2.0,good
4,4,53,0.0,2,1.0,0.0,0.0,4870,24,3.0,bad


In [48]:
data = pd.get_dummies(data=df_credit, columns=['Sex', 'Saving accounts','Checking account','Purpose', 'Age', 'Housing' ])

In [49]:
data.head()

Unnamed: 0.1,Unnamed: 0,Job,Credit amount,Duration,Risk,Sex_0.0,Sex_1.0,Saving accounts_0.0,Saving accounts_0.4565483476132191,Saving accounts_1.0,Saving accounts_2.0,Saving accounts_3.0,Checking account_0.0,Checking account_0.6518151815181518,Checking account_1.0,Checking account_2.0,Purpose_0.0,Purpose_1.0,Purpose_2.0,Purpose_3.0,Purpose_4.0,Purpose_5.0,Purpose_6.0,Purpose_7.0,Age_19,Age_20,Age_21,Age_22,Age_23,Age_24,Age_25,Age_26,Age_27,Age_28,Age_29,Age_30,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,Age_41,Age_42,Age_43,Age_44,Age_45,Age_46,Age_47,Age_48,Age_49,Age_50,Age_51,Age_52,Age_53,Age_54,Age_55,Age_56,Age_57,Age_58,Age_59,Age_60,Age_61,Age_62,Age_63,Age_64,Age_65,Age_66,Age_67,Age_68,Age_70,Age_74,Age_75,Housing_0.0,Housing_1.0,Housing_2.0
0,0,2,1169,6,good,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,1,2,5951,48,bad,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,1,2096,12,good,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3,2,7882,42,good,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,4,2,4870,24,bad,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [50]:
y = data['Risk']

In [51]:
x=data.drop(columns=['Risk'],axis=1)

In [52]:
x.head()

Unnamed: 0.1,Unnamed: 0,Job,Credit amount,Duration,Sex_0.0,Sex_1.0,Saving accounts_0.0,Saving accounts_0.4565483476132191,Saving accounts_1.0,Saving accounts_2.0,Saving accounts_3.0,Checking account_0.0,Checking account_0.6518151815181518,Checking account_1.0,Checking account_2.0,Purpose_0.0,Purpose_1.0,Purpose_2.0,Purpose_3.0,Purpose_4.0,Purpose_5.0,Purpose_6.0,Purpose_7.0,Age_19,Age_20,Age_21,Age_22,Age_23,Age_24,Age_25,Age_26,Age_27,Age_28,Age_29,Age_30,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,Age_41,Age_42,Age_43,Age_44,Age_45,Age_46,Age_47,Age_48,Age_49,Age_50,Age_51,Age_52,Age_53,Age_54,Age_55,Age_56,Age_57,Age_58,Age_59,Age_60,Age_61,Age_62,Age_63,Age_64,Age_65,Age_66,Age_67,Age_68,Age_70,Age_74,Age_75,Housing_0.0,Housing_1.0,Housing_2.0
0,0,2,1169,6,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,1,2,5951,48,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,1,2096,12,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3,2,7882,42,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,4,2,4870,24,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [53]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x)
data = scaler.transform(x)

In [54]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
y = pd.Series(le.transform(y))
y.head()

### Import Library for machine Learning model development

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score

from sklearn.model_selection import train_test_split, KFold, cross_val_score

###  Split train and test dataset - 80 and 20 percent

In [58]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state=42)

### Apply multiple machine learning algorithm together to find accuracy between them

In [60]:
models = [RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(), LinearDiscriminantAnalysis(),GaussianNB(), SVC()]
for model in models:
    model.fit(x_train,y_train)
    print(model,'Accuracy = ',accuracy_score(y_test, model.predict(x_test)))
    print('classification_report = ',classification_report(y_test, model.predict(x_test)))
    print('\n')

In [61]:
# xgb_model = XGBClassifier()
# xgb_model.fit(x_train, y_train)
# print('Accuracy = ',accuracy_score(y_test, xgb_model.predict(x_test)))
# print('classification_report = ',classification_report(y_test, xgb_model.predict(x_test))

### Random Forest Model - Evaluation via test set data

In [63]:
# Logistic Regression
Rf_model = RandomForestClassifier()
Rf_model.fit(x_train, y_train)
print('Accuracy = ',accuracy_score(y_test, Rf_model.predict(x_test)))
print('classification_report = ',classification_report(y_test, Rf_model.predict(x_test)))

In [64]:
import numpy as np

###  Hyper-Parameter Tuning Process

In [66]:
# param_grid = {"penalty":['l1', 'l2'],'multi_class':['auto','ovr','multinomial'],'C':[.001,.01,.1,1,10,100]}
# param_grid = {"penalty":['l1'],'C':[.001,.01,.1,1,10,100]}
param_grid = [
{'n_estimators': [5,7,10, 25], 'max_features': [2,3,5, 10], 
 'max_depth': [3,7,8,10, 50, None], 'bootstrap': [True, False]}
]
model = RandomForestClassifier()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='recall', verbose=1)
grid_search.fit(x_train, y_train)

###  Best Parameter for Random Forest Model

In [68]:
print(grid_search.best_params_)

### Apply Hyper-parameter tuning for Logistic Regression

In [70]:
# param_grid = {"penalty":['l1', 'l2'],'multi_class':['auto','ovr','multinomial'],'C':[.001,.01,.1,1,10,100]}
param_grid1 = {"penalty":['l1'],'C':[.001,.01,.1,1,10,100]}
# param_grid = [
# {'n_estimators': [5,7,10, 25], 'max_features': [2,3,5, 10], 
#  'max_depth': [3,7,8,10, 50, None], 'bootstrap': [True, False]}
# ]
model1 = LogisticRegression()
grid_search1 = GridSearchCV(model1, param_grid=param_grid1, cv=3, scoring='recall', verbose=1)
grid_search1.fit(x_train, y_train)

### Best Parameter for logistic regression model

In [72]:
print(grid_search1.best_params_)

In [73]:
print('Accuracy_Score',accuracy_score(y_test,grid_search1.predict(x_test)))
print("\n")
print('Classification_Report',classification_report(y_test, grid_search1.predict(x_test)))

In [74]:
# param_grid = {"penalty":['l1', 'l2'],'multi_class':['auto','ovr','multinomial'],'C':[.001,.01,.1,1,10,100]}
param_grid2 = {"penalty":['l2'],'C':[.001,.01,.1,1,10,100]}
# param_grid = [
# {'n_estimators': [5,7,10, 25], 'max_features': [2,3,5, 10], 
#  'max_depth': [3,7,8,10, 50, None], 'bootstrap': [True, False]}
# ]
model2 = LogisticRegression()
grid_search2 = GridSearchCV(model2, param_grid=param_grid2, cv=3, scoring='recall', verbose=1)
grid_search2.fit(x_train, y_train)

In [75]:
print(grid_search2.best_params_)

In [76]:
print('Accuracy_Score',accuracy_score(y_test,grid_search2.predict(x_test)))
print("\n")
print('Classification_Report',classification_report(y_test, grid_search2.predict(x_test)))