In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
!pip install plotly
import plotly.offline as py 
py.init_notebook_mode(connected=True) # this code, allow us to work with offline plotly version
import plotly.graph_objs as go # it's like "plt" of matplot
import plotly.express as px
import plotly.tools as tls # It's useful to we get some tools of plotly
import warnings # This library will be used to ignore some warnings
from collections import Counter # To do counter of some features
############################################################################################
#Importing necessary packages in Python 
%matplotlib inline 
import matplotlib.pyplot as plt 
from sklearn.datasets import make_classification 
# learning_curve is provided by sklearn.model_selection (already used below);
# the old sklearn.learning_curve module no longer exists in current scikit-learn.
from sklearn.model_selection import learning_curve 
#from sklearn.cross_validation import train_test_split 
#from sklearn.grid_search import GridSearchCV
#from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, roc_auc_score, auc, accuracy_score
from sklearn.model_selection import ShuffleSplit,train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize, StandardScaler, MinMaxScaler



In [19]:
# Read the raw credit records and report how many rows were loaded.
credit = pd.read_csv('german_credit_data.csv')
n_records = len(credit)
print(f"The dataset is {n_records} credit record")

The dataset is 1000 credit record


# Check the data structure and contents:

In [3]:
# Preview the first two rows to see which columns the file contains.
credit.head(2)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad


In [4]:
# Drop the leftover index column that read_csv labels "Unnamed: ...".
# Selecting by name instead of by position keeps the cell idempotent: the
# positional slice `iloc[:, 1:]` removes one more real column every time the
# cell is re-run.
credit = credit.loc[:, ~credit.columns.str.startswith('Unnamed')]

In [5]:
# Show each column's dtype and non-null count (reveals which columns have missing values).
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.2+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
credit.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


# descriptive analysis

In [None]:
# Number of records for each value of the Sex column.
credit['Sex'].value_counts()

Let's now cross-tabulate the variables and break them down by group.

In [7]:
# Age distribution per sex: box plot with every individual record overlaid as a point.
SA = credit[['Sex', 'Age']]
fig = px.box(SA, x="Sex", y="Age", color="Sex", points="all")

# Centered title anchored near the top of the figure.
centered_title = dict(
    text="Sex Vs Age Cross tabulation",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(title=centered_title, xaxis_title="Sex", yaxis_title="Age")
fig.show()

In [8]:
# Credit amount distribution per sex: box plot with every record overlaid as a point.
SC = credit.loc[:, ['Sex', 'Credit amount']]
fig = px.box(SC, x="Sex", y="Credit amount", points="all", color="Sex")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.update_layout(
    title={
        'text': "Sex Vs Credit Amount Cross tabulation",
        'y': .95,
        'x': .5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title="Sex",
    # The y axis shows the credit amount; it was previously mislabelled "Age".
    yaxis_title="Credit amount",
)
fig.show()

In [11]:
# Count of records per loan purpose, one coloured bar per category.
Purpose = credit.loc[:, 'Purpose']
fig = px.histogram(data_frame=credit, x="Purpose", color="Purpose")
fig.show()

In [10]:
# Credit amount distribution for each loan purpose.
SC = credit[['Purpose', 'Credit amount']]
fig = px.box(SC, x="Purpose", y="Credit amount", color="Purpose")
fig.update_traces(quartilemethod="exclusive")  # alternatives: "inclusive", or the default "linear"

# Centered title anchored near the top of the figure.
centered_title = dict(
    text="Purpose Vs Credit Amount Cross tabulation",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    title=centered_title,
    xaxis_title="Purpose",
    yaxis_title="Credit amount",
)
fig.show()

In [12]:
import ipywidgets as widgets

# One parallel-categories axis per categorical column of interest
# ("Saving accounts" is intentionally left out of the diagram).
gender_dim = go.parcats.Dimension(label="Sex", values=credit['Sex'])
Housing_dim = go.parcats.Dimension(label="Housing", values=credit['Housing'])
Checking_account_dim = go.parcats.Dimension(label="Checking account", values=credit['Checking account'])
Purpose_dim = go.parcats.Dimension(label="Purpose", values=credit['Purpose'])
Risk_dim = go.parcats.Dimension(label="Risk", values=credit['Risk'])

# Colour settings for an optional `line=` argument; they are defined here but
# not passed to the trace below.
color = np.zeros(len(credit), dtype='uint8')
colorscale = [
    [0, 'gray'], [0.33, 'firebrick'],
    [0.33, 'firebrick'], [0.66, 'blue'],
    [0.66, 'blue'], [1.0, 'green'],
]

# Build the trace first, then wrap it in a figure.
label_font = {'size': 18, 'family': 'Times'}
tick_font = {'size': 16, 'family': 'Times'}
parcats_trace = go.Parcats(
    dimensions=[gender_dim, Housing_dim, Checking_account_dim, Purpose_dim, Risk_dim],
    hoveron='color',
    hoverinfo='count+probability',
    labelfont=label_font,
    tickfont=tick_font,
    arrangement='freeform',
)
fig = go.Figure(data=[parcats_trace])

fig.show()

In [None]:
# Split violins of credit amount per purpose: "good" risk on the negative side,
# "bad" risk on the positive side of each category.
PC = credit[['Purpose', 'Credit amount', 'Risk']]
fig = go.Figure()

violin_specs = (
    ('good', 'negative', 'blue'),
    ('bad', 'positive', 'orange'),
)
for risk_label, violin_side, outline_colour in violin_specs:
    in_class = PC['Risk'] == risk_label
    fig.add_trace(
        go.Violin(
            x=PC['Purpose'][in_class],
            y=PC['Credit amount'][in_class],
            legendgroup=risk_label,
            scalegroup=risk_label,
            name=risk_label,
            side=violin_side,
            line_color=outline_colour,
        )
    )

fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')

# Centered title anchored near the top of the figure.
centered_title = dict(
    text="Purpose Vs Credit Amount Cross tabulation",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    title=centered_title,
    xaxis_title="Purpose",
    yaxis_title="Credit amount",
)
fig.show()

# Feature engineering and transformation

In [20]:
# Encode the target as 0 (bad) / 1 (good).
# The guard makes the cell safe to re-run: mapping an already-encoded column
# again would match none of the keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(credit['Risk']):
    credit['Risk'] = credit['Risk'].map({'bad': 0, 'good': 1})

In [37]:
# Independent copy of the frame in which missing values are left untouched.
credit_with_null=credit.copy()

# Machine learning

In [46]:
from sklearn.model_selection import train_test_split

cat_features = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
num_features = ['Age', 'Job', 'Credit amount', 'Duration', 'Risk']

# One-hot encode every categorical column in a single call. The previous
# `for variable in cat_features` loop never used `variable` and simply rebuilt
# the same frame once per feature.
dummies = pd.get_dummies(credit_with_null[cat_features])
df1 = pd.concat([credit_with_null[num_features], dummies], axis=1)

# Separate the target from the predictors, then hold out 20% of the rows for
# testing with a fixed seed so the split is reproducible.
Risk = df1['Risk']
df2 = df1.drop(['Risk'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(df2, Risk, test_size=0.20, random_state=30)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

# Fit a logistic regression with default settings on the first split and
# predict the held-out rows.
lr = LogisticRegression()
lr.fit(X_train, Y_train)
y_test_pred = lr.predict(X_test)

# Report accuracy, the confusion matrix and the F-beta score (beta=2),
# with blank lines between the three results.
test_accuracy = accuracy_score(Y_test, y_test_pred)
print(test_accuracy)
print("\n")
test_confusion = confusion_matrix(Y_test, y_test_pred)
print(test_confusion)
print("\n")
test_f2 = fbeta_score(Y_test, y_test_pred, beta=2)
print(test_f2)


0.71


[[ 24  40]
 [ 18 118]]


0.8404558404558405


In [55]:
# Copy of the frame in which a missing account level becomes its own
# explicit category, 'no_inf'.
credit_without_null = credit.copy()
for account_col in ('Saving accounts', 'Checking account'):
    filled = credit_without_null[account_col].fillna('no_inf')
    credit_without_null[account_col] = filled

In [56]:
from sklearn.model_selection import train_test_split

cat_features = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
num_features = ['Age', 'Job', 'Credit amount', 'Duration', 'Risk']

# One-hot encode every categorical column in a single call; the previous loop
# rebuilt the same frame once per feature without using the loop variable.
dummies = pd.get_dummies(credit_without_null[cat_features])
df3 = pd.concat([credit_without_null[num_features], dummies], axis=1)

# Take the target from df3 itself. The original read it from df1, a frame
# built in a different cell, so this cell only worked if that one had run.
Risk = df3['Risk']
df4 = df3.drop(['Risk'], axis=1)
X_train_2, X_test_2, Y_train_2, Y_test_2 = train_test_split(df4, Risk, test_size=0.20, random_state=30)

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

# Refit a default logistic regression on the split built from the
# null-filled frame and predict its held-out rows.
lr = LogisticRegression()
lr.fit(X_train_2, Y_train_2)
y_test_pred_2 = lr.predict(X_test_2)

# Report accuracy, the confusion matrix and the F-beta score (beta=2),
# with blank lines between the three results.
test_accuracy_2 = accuracy_score(Y_test_2, y_test_pred_2)
print(test_accuracy_2)
print("\n")
test_confusion_2 = confusion_matrix(Y_test_2, y_test_pred_2)
print(test_confusion_2)
print("\n")
test_f2_2 = fbeta_score(Y_test_2, y_test_pred_2, beta=2)
print(test_f2_2)

0.705


[[ 24  40]
 [ 19 117]]


0.8345221112696148


Let's start dealing with null values and transform all string features into numeric ones (one-hot encoding).