Objectives:
In this hands-on lab, you will learn how to apply decision-tree machine learning techniques to analyze past results and accurately predict future outcomes.

In [1]:
import pandas as pd

# Load the credit data set and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved - verify).
df = pd.read_csv('data/german_credit_data.csv').drop(columns='Unnamed: 0')
df.head(4)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good


In [2]:
# List the column labels that remain after the load step.
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [3]:
# Size of the frame as a (rows, columns) tuple.
df.shape

(1000, 10)

In [4]:
# Summarise dtype and non-null count per column; columns whose non-null count
# is below the row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.2+ KB


In [5]:
# Distinct labels present in the target column.
df['Risk'].unique()

array(['good', 'bad'], dtype=object)

In [6]:
# Count rows per target label to check how balanced the classes are.
df['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

In [7]:
# Work on a copy so the raw frame `df` keeps its original string labels
# for later inspection.
df_tidy=df.copy()

In [8]:
# Columns holding string labels (the target 'Risk' included) that must be
# turned into integers before they can be fed to the model.
categorical_cols = [
    'Sex',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Purpose',
    'Risk',
]

# Replace each label by its integer category code. Categories are numbered in
# sorted order starting at 0; a missing value (NaN) gets the code -1.
for col_name in categorical_cols:
    df_tidy[col_name] = pd.Categorical(df_tidy[col_name]).codes

df_tidy.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,-1,0,1169,6,5,1
1,22,0,2,1,0,1,5951,48,5,0
2,49,1,1,1,0,-1,2096,12,3,1
3,45,1,2,0,0,0,7882,42,4,1
4,53,1,2,0,0,0,4870,24,1,0


In [9]:
# First attempt: use every available feature as-is, with no preparation
# beyond the integer encoding the model requires.
features1 = [
    'Age',
    'Sex',
    'Job',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Credit amount',
    'Duration',
    'Purpose',
]

In [10]:
# Feature matrix: the encoded columns named in features1.
x = df_tidy.loc[:, features1]
# Target vector: the encoded 'Risk' column.
y = df_tidy.loc[:, 'Risk']
x.head(3)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,1,2,1,-1,0,1169,6,5
1,22,0,2,1,0,1,5951,48,5
2,49,1,1,1,0,-1,2096,12,3


In [11]:
# Peek at the first three values of the encoded target.
y.head(3)

0    1
1    0
2    1
Name: Risk, dtype: int8

In [12]:
# Hold out part of the data for testing. test_size=0.25 gives a 75% / 25%
# train/test split; change test_size (e.g. 0.2 or 0.1) for a different ratio.
# random_state fixes the shuffle so the same rows land in each part every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)
for features_part in (X_train, X_test):
    print(features_part.shape)

(750, 9)
(250, 9)


In [13]:
### Build model: decision tree trained on all features in X_train
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report

# The tree permutes features randomly at each split, so an unseeded estimator
# can produce a different tree on every run. Fixing random_state keeps the
# fitted model, and every metric derived from it, reproducible.
DT_model_inst = tree.DecisionTreeClassifier(random_state=1)
# fit() returns the estimator itself, so both names refer to the same object.
DT_model_trained = DT_model_inst.fit(X_train, y_train)

# Evaluate the fitted model on the held-out test set.
y_predict = DT_model_trained.predict(X_test)
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_predict)

array([[ 36,  38],
       [ 44, 132]], dtype=int64)

In [14]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, y_predict)

0.672

In [15]:
# First five predicted labels, in the same row order as X_test.
y_predict[0:5]

array([1, 0, 1, 1, 1], dtype=int8)

In [16]:
# True labels of the first five test rows, for comparison with the predictions;
# the index carries each row's original label from the source frame.
y_test.head()

507    0
818    1
452    1
368    0
242    0
Name: Risk, dtype: int8

In [17]:
# Predicted label for the test row at position 2 (the third test row).
y_predict.item(2)

1

In [18]:
# Show the raw (un-encoded) record for the test row whose index label is 452.
# 452 is a label, not a position, so use the label-based accessor .loc;
# .iloc would return the wrong row if df ever had a non-default index.
df.loc[452]

Age                                  34
Sex                                male
Job                                   2
Housing                             own
Saving accounts                  little
Checking account                    NaN
Credit amount                      2759
Duration                             12
Purpose             furniture/equipment
Risk                               good
Name: 452, dtype: object

In [19]:
# Predicted label for the test row at position 1 (the second test row).
y_predict.item(1)

0

In [20]:
# Show the raw (un-encoded) record for the test row whose index label is 818.
# 818 is a label, not a position, so use the label-based accessor .loc;
# .iloc would return the wrong row if df ever had a non-default index.
df.loc[818]

Age                              43
Sex                            male
Job                               3
Housing                         own
Saving accounts              little
Checking account             little
Credit amount                 15857
Duration                         36
Purpose             vacation/others
Risk                           good
Name: 818, dtype: object

In [21]:
# Importance score of each feature in the fitted tree, in the same order as
# the training columns.
DT_model_trained.feature_importances_

array([0.19069875, 0.01302295, 0.04830613, 0.03773997, 0.05403384,
       0.1389658 , 0.29581834, 0.15162987, 0.06978433])

In [22]:
# Pair every feature name with its importance score so the values are readable.
[
    (feature_name, importance)
    for feature_name, importance in zip(features1, DT_model_trained.feature_importances_)
]

[('Age', 0.19069875395502928),
 ('Sex', 0.013022949146538962),
 ('Job', 0.048306130697327),
 ('Housing', 0.037739969784994436),
 ('Saving accounts', 0.054033844955291094),
 ('Checking account', 0.13896580484950424),
 ('Credit amount', 0.29581834251947653),
 ('Duration', 0.1516298700509497),
 ('Purpose', 0.0697843340408887)]

In [23]:
# Pick the test row at position 50 as a one-row DataFrame (list indexing keeps
# the 2-D shape that predict() expects).
client1 = X_test.iloc[[50]]
client1

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
713,25,1,1,1,0,0,1138,9,5


In [24]:
# Show the raw (un-encoded) record for the client whose index label is 713.
# 713 is a label, not a position, so use the label-based accessor .loc;
# .iloc would return the wrong row if df ever had a non-default index.
df.loc[713]

Age                       25
Sex                     male
Job                        1
Housing                  own
Saving accounts       little
Checking account      little
Credit amount           1138
Duration                   9
Purpose             radio/TV
Risk                    good
Name: 713, dtype: object

In [25]:
# Predict the encoded Risk label for the single-row frame client1.
print(DT_model_trained.predict(client1))

[1]
