In [79]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
import graphviz
import matplotlib.pyplot as plt

raw_data = {
 'Name': ['Tim', 'Joe', 'Sue', 'John', 'Mary', 'Fred', 'Pete',
'Jacob', 'Sofia'],
 'Debt': ['low', 'high', 'low', 'medium', 'high', 'low', 'low',
'high', 'medium'],
 'Income': ['low', 'high', 'high', 'low', 'low', 'low', 'medium',
'medium', 'low'],
 'Married?': ['no', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes',
'no'],
 'Owns_Property': ['no', 'yes', 'no', 'no', 'no', 'no', 'yes',
'yes', 'no'],
 'Gender': ['male', 'male', 'female', 'male', 'female', 'male',
'male', 'male', 'female'],
 'Risk': ['low', 'low', 'low', 'high', 'high', 'high', 'low',
'low', 'high']}
data = pd.DataFrame(raw_data)
print(data.info())
duplicates = data.duplicated().sum()
print(f'Duplicates = {duplicates}')
print(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           9 non-null      object
 1   Debt           9 non-null      object
 2   Income         9 non-null      object
 3   Married?       9 non-null      object
 4   Owns_Property  9 non-null      object
 5   Gender         9 non-null      object
 6   Risk           9 non-null      object
dtypes: object(7)
memory usage: 632.0+ bytes
None
Duplicates = 0
    Name    Debt  Income Married? Owns_Property  Gender  Risk
0    Tim     low     low       no            no    male   low
1    Joe    high    high      yes           yes    male   low
2    Sue     low    high      yes            no  female   low
3   John  medium     low       no            no    male  high
4   Mary    high     low      yes            no  female  high
5   Fred     low     low      yes            no    male  high
6   Pete     l

In [80]:
data['Married?'] = data['Married?'].map({'no': 0, 'yes': 1})
data['Owns_Property'] = data['Owns_Property'].map({'no': 0, 'yes': 1})
data['Gender'] = data['Gender'].map({'female': 0, 'male': 1})
data['Risk'] = data['Risk'].map({'low': 0, 'high': 1})

mapping = {'low': 0, 'medium': 1, 'high': 2}

data['Debt'] = data['Debt'].map(mapping)
data['Income'] = data['Income'].map(mapping)
print(data)

    Name  Debt  Income  Married?  Owns_Property  Gender  Risk
0    Tim     0       0         0              0       1     0
1    Joe     2       2         1              1       1     0
2    Sue     0       2         1              0       0     0
3   John     1       0         0              0       1     1
4   Mary     2       0         1              0       0     1
5   Fred     0       0         1              0       1     1
6   Pete     0       1         0              1       1     0
7  Jacob     2       1         1              1       1     0
8  Sofia     1       0         0              0       0     1


In [81]:
X = data[['Debt', 'Income', 'Married?', 'Owns_Property', 'Gender']]
Y = data['Risk']

#Since we have separate data to train, we won't split this data into train and test. 

classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X, Y)

dot_data = export_graphviz(classifier, out_file=None, feature_names=['Debt', 'Income', 'Married?', 'Owns_Property', 'Gender'], class_names=['low', 'high'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("Credit Risk Tree", format='png')
graph.view()

'Credit Risk Tree.pdf'

In [78]:
test_data = pd.DataFrame({
 'Debt': [0,0],
 'Income_low': [0,1],
 'Married?_no': [0, 1],
 'Owns_Property': [1, 1],
 'Gender': [1, 0],
})

test_data = test_data.reindex(columns=X.columns, fill_value=0)
predict = classifier.predict(test_data)
predict_risk = ['low' if risk == 0 else 'high' for risk in predictions]
print(f"Tom credit risk: {predict_risk[0]}")
print(f"Ana credit risk: {predict_risk[1]}")


Tom credit risk: low
Ana credit risk: low


In [69]:
def print_tree(tree, feature_names, class_names, node=0, depth=0):
   
    if tree.children_left[node] == tree.children_right[node]:  # Leaf node
        print(f"{'|  ' * depth}Predict: {class_names[tree.value[node].argmax()]}")
    else:
        feature_index = tree.feature[node]
        threshold = tree.threshold[node]
        
        # Print the decision rule
        print(f"{'|  ' * depth}{feature_names[feature_index]} <= {threshold:.2f}")
        # Left child
        print_tree(tree, feature_names, class_names, tree.children_left[node], depth + 1)
        # Right child
        print(f"{'|  ' * depth}{feature_names[feature_index]} > {threshold:.2f}")
        print_tree(tree, feature_names, class_names, tree.children_right[node], depth + 1)

from sklearn.tree import _tree

feature_names = X.columns.tolist()
class_names = ['Low Risk', 'High Risk']

print_tree(classifier.tree_, feature_names, class_names)

Income <= 0.50
|  Married? <= 0.50
|  |  Debt <= 0.50
|  |  |  Predict: Low Risk
|  |  Debt > 0.50
|  |  |  Gender <= 0.50
|  |  |  |  Predict: Low Risk
|  |  |  Gender > 0.50
|  |  |  |  Predict: High Risk
|  Married? > 0.50
|  |  Predict: High Risk
Income > 0.50
|  Predict: Low Risk


In [72]:
used_features = set(X.columns) - set(classifier.feature_importances_ == 0)
print("Features not playing a role in the new decision tree:", set(X.columns) - used_features)

Features not playing a role in the new decision tree: set()


In [73]:
print(X)

   Debt  Income  Married?  Owns_Property  Gender
0     0       0         0              0       1
1     2       2         1              1       1
2     0       2         1              0       0
3     1       0         0              0       1
4     2       0         1              0       0
5     0       0         1              0       1
6     0       1         0              1       1
7     2       1         1              1       1
8     1       0         0              0       0
