# Import Statements

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

# Load and Prepare Data
## Format of the data:
 - last column of the data frame must contain the label and it must also be called "label"
 - there should be no missing values in the data frame

In [177]:
def format_data(df):
    df_formatted = df.copy()
    column_name_list = df.columns
    for name in column_name_list:
        new_name = name.replace(" ", "_")
        df_formatted.rename(columns={name: new_name}, inplace=True)
    return df_formatted

# Train-Test-Split

In [5]:
def train_test_split(df, test_size):
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

# Helper Functions

In [6]:
#data = train_df.values #We convert df into an array for improving efficiency
#data[:5]

## Data pure?

In [7]:
def check_purity(data):
    label_column = data[:,-1] # select all rows and the label column of each
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

## Classify

In [8]:
def classify_data(data): #Chehk the majority in case the data is not totally pure
    label_column = data[:,-1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
    
    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification

## Potential splits?

In [9]:
def get_potential_splits(data):
   
    potential_splits = {}
    _, n_columns = data.shape
    
    for column_index in range(n_columns - 1):
        values = data[:, column_index] #values of each column
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
            
    return potential_splits


## Split data

In [10]:
def split_data(data, split_column, split_value): # Use to split data in a certain column/row and given a value
    
    split_column_value = data[:, split_column]
    
    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        data_below = data[split_column_value <= split_value]
        data_above = data[split_column_value > split_value]
    else:
        data_below = data[split_column_value == split_value]
        data_above = data[split_column_value != split_value]
       
    return data_below, data_above

## Lowest Overall Entropy?

In [11]:
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts = True)

    probabilities = counts / counts.sum() #numpy arrays operates element by element
    entropy = sum(probabilities * -np.log2(probabilities))  #just the formula for computing entropy      
    
    return entropy

In [12]:
def calculate_overall_entropy(data_below, data_above):
    
    n_data_points = len(data_below) + len(data_above)
    
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points

    overall_entropy = (p_data_below * calculate_entropy(data_below)
                   + p_data_above * calculate_entropy(data_above))
    return overall_entropy

In [13]:
def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999 #arbitrary high value
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
        
            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_value = value
                best_split_column = column_index
                
    return best_split_column, best_split_value

# Decision Tree Algorithm

REPRESENTATION OF DECISION TREE: ----------->  
subtree = {question: [yes_answer, no_answer]}

## Determine Type of Feature

In [208]:
def determine_type_of_feature(df):
    feature_types = []
    n_unique_values_threshold = 8
    
    for column in df.columns:
        unique_values = df[column].unique()
        example_value = unique_values[0]
        
        if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_threshold):
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    
    return feature_types

## Algorithm

In [15]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    
    # data preparations
    if counter == 0: #Transfort from dataFrame to array only in the first call (recursive alg)
        global COLUMNS_HEADERS, FEATURE_TYPES
        COLUMNS_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df
        
    # base case
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        return classification
    
    # recursive part
    else:
        counter += 1
        
        #helper functions
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        #check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            classification = classify_data(data)
            return classification
        
        #instanciate sub-tree
        feature_name = COLUMNS_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name, split_value)
        else:
            question = "{} = {}".format(feature_name, split_value)
        sub_tree = {question: []}
        
        # find answer (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)
        
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

# Classification
subtree = {question: [yes_answer, no_answer]}

In [16]:
#example = test_df.iloc[2]
#example

In [140]:
def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    #ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
            
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    #base case
    if not isinstance(answer, dict):
        return answer

    #recursive call
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [18]:
#classify_example(example, tree)

# Accuracy

In [19]:
def calculate_accuracy(df, tree):
    
    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df.classification == df.label
    
    accuracy = df.classification_correct.mean()
    
    return accuracy

# Data Set Tree Examples

## Titanic

In [53]:
df = pd.read_csv("../data/Titanic.csv") #For read the csv file using Pandas library
df["label"] = df.Survived
df = df.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1) 


#handeling missing values
median_age = df.Age.median()
mode_embarked = df.Embarked.mode()[0]

df = df.fillna({"Age": median_age, "Embarked": mode_embarked})

In [21]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,label
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
 7   label     891 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [29]:
train_df, test_df = train_test_split(df, test_size=0.2)
tree = decision_tree_algorithm(train_df)
accuracy = calculate_accuracy(test_df, tree)

pprint(tree, width = 50)
print(accuracy)

{'Sex = male': [{'Fare <= 15.1': [{'Age <= 12.0': [1,
                                                   0]},
                                  {'Age <= 4.0': [{'Pclass = 3': [{'Fare <= 20.575': [1,
                                                                                      0]},
                                                                  1]},
                                                  0]}]},
                {'Pclass = 3': [{'Fare <= 22.3583': [{'Embarked = S': [{'Fare <= 10.5167': [0,
                                                                                            1]},
                                                                       {'Age <= 28.0': [1,
                                                                                        0]}]},
                                                     {'Parch = 0': [1,
                                                                    0]}]},
                                {'Fare <= 28.7125': [{'Age

## Red Wine Quality

In [209]:
df = pd.read_csv("../data/winequality-red.csv")
df["label"] = df.quality
df = df.drop(["quality"], axis=1) #axis = 0 means row while axis = 1 means column

In [210]:
#Only in case the columns has space charater in their names, we use this function in order to format column names 
df = format_data(df)

In [211]:
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [212]:
#Check if our data is complete
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  label                 1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [213]:
#Execution of the algorithim
train_df, test_df = train_test_split(df, test_size=0.3)
tree = decision_tree_algorithm(train_df)
print("-------------- Decision tree learned ---------------------")
pprint(tree, width=60)
accuracy = calculate_accuracy(test_df, tree)
print("Accuracy of our tree", accuracy )

-------------- Decision tree learned ---------------------
{'alcohol <= 10.5': [{'sulphates <= 0.55': [{'chlorides <= 0.094': [{'sulphates <= 0.52': [{'residual_sugar <= 3.9': [5.0,
                                                                                                                      6.0]},
                                                                                           5.0]},
                                                                    {'density <= 0.99471': [{'sulphates <= 0.48': [4.0,
                                                                                                                   3.0]},
                                                                                            5.0]}]},
                                            {'total_sulfur_dioxide <= 70.0': [{'volatile_acidity <= 0.53': [6.0,
                                                                                                            {'total_sulfur_dioxide <= 18.0'

In [214]:
print("-------------- Subset using for learning the tree --------------")
train_df.head()

-------------- Subset using for learning the tree --------------


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7


In [215]:
print("-------------- Subset tested by the tree --------------")
test_df

-------------- Subset tested by the tree --------------


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label,classification,classification_correct
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,5.0,True
1168,6.5,0.340,0.27,2.8,0.067,8.0,44.0,0.99384,3.21,0.56,12.0,6,6.0,True
435,12.3,0.390,0.63,2.3,0.091,6.0,18.0,1.00040,3.16,0.49,9.5,5,5.0,True
1138,7.5,0.410,0.15,3.7,0.104,29.0,94.0,0.99786,3.14,0.58,9.1,5,5.0,True
272,10.9,0.370,0.58,4.0,0.071,17.0,65.0,0.99935,3.22,0.78,10.1,5,6.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,8.2,0.320,0.42,2.3,0.098,3.0,9.0,0.99506,3.27,0.55,12.3,6,6.0,True
1016,8.9,0.380,0.40,2.2,0.068,12.0,28.0,0.99486,3.27,0.75,12.6,7,7.0,True
1343,7.5,0.510,0.02,1.7,0.084,13.0,31.0,0.99538,3.36,0.54,10.5,6,5.0,False
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,5.0,False


In [216]:
print("--------- We choose a random example -----------")
example = df.loc[random.randint(1,len(df))]
example

--------- We choose a random example -----------


fixed_acidity           13.8000
volatile_acidity         0.4900
citric_acid              0.6700
residual_sugar           3.0000
chlorides                0.0930
free_sulfur_dioxide      6.0000
total_sulfur_dioxide    15.0000
density                  0.9986
pH                       3.0200
sulphates                0.9300
alcohol                 12.0000
label                    6.0000
Name: 347, dtype: float64

In [217]:
label = classify_example(example, tree)
print("Label:", label)

Label: 6.0


## Diabetes Indicator (from 0 to 10)

In [218]:
df = pd.read_csv("../data/diabetes.csv")
df["label"] = df.Income
df = df.drop(["Income"], axis=1) #axis = 0 means row while axis = 1 means column

In [219]:
#Only in case the columns has space charater in their names, we use this function in order to format column names 
df = format_data(df)

In [220]:
df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,label
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [221]:
#Check if our data is complete
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [222]:
#Execution of the algorithim
train_df, test_df = train_test_split(df, test_size=0.3)
tree = decision_tree_algorithm(train_df, max_depth= 3)
print("-------------- Decision tree learned ---------------------")
pprint(tree, width=60)
accuracy = calculate_accuracy(test_df, tree)
print("Accuracy of our tree", accuracy )

-------------- Decision tree learned ---------------------
{'Education = 6.0': [{'DiffWalk = 1.0': [{'NoDocbcCost = 1.0': [6.0,
                                                                8.0]},
                                         8.0]},
                     {'DiffWalk = 1.0': [{'Education = 5.0': [6.0,
                                                              3.0]},
                                         {'Age <= 9.0': [8.0,
                                                         6.0]}]}]}
Accuracy of our tree 0.3772732050877746


In [202]:
print("-------------- Subset using for learning the tree --------------")
train_df.head()

-------------- Subset using for learning the tree --------------


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,label
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0
10,2.0,0.0,0.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,13.0,6.0,8.0


In [203]:
print("-------------- Subset tested by the tree --------------")
test_df

-------------- Subset tested by the tree --------------


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,label,classification,classification_correct
230704,0.0,1.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,3.0,0.0,0.0,0.0,1.0,7.0,6.0,8.0,8.0,True
151193,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0,8.0,True
166787,0.0,0.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,3.0,20.0,2.0,0.0,0.0,7.0,5.0,6.0,8.0,False
223194,0.0,0.0,0.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,9.0,4.0,8.0,8.0,True
246523,0.0,0.0,0.0,1.0,32.0,1.0,1.0,1.0,0.0,0.0,...,4.0,5.0,15.0,0.0,1.0,9.0,6.0,4.0,8.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65410,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,6.0,6.0,8.0,8.0,True
229452,0.0,1.0,0.0,1.0,33.0,0.0,0.0,0.0,1.0,1.0,...,3.0,10.0,0.0,0.0,1.0,6.0,4.0,4.0,8.0,False
96133,2.0,1.0,1.0,1.0,32.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,10.0,5.0,6.0,8.0,False
129385,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,9.0,5.0,2.0,8.0,False


In [206]:
print("--------- We choose a random example -----------")
example = df.loc[random.randint(1,len(df))]
example

--------- We choose a random example -----------


Diabetes_012             0.0
HighBP                   1.0
HighChol                 1.0
CholCheck                1.0
BMI                     31.0
Smoker                   1.0
Stroke                   0.0
HeartDiseaseorAttack     0.0
PhysActivity             1.0
Fruits                   1.0
Veggies                  1.0
HvyAlcoholConsump        0.0
AnyHealthcare            1.0
NoDocbcCost              0.0
GenHlth                  1.0
MentHlth                 0.0
PhysHlth                 0.0
DiffWalk                 0.0
Sex                      1.0
Age                      7.0
Education                6.0
label                    8.0
Name: 211889, dtype: float64

In [207]:
label = classify_example(example, tree)
print("Label:", label)

Label: 8.0


## Car - Purchase Decision

In [233]:
df = pd.read_csv("../data/car_data.csv")
df["label"] = df.Purchased
df = df.drop(["Purchased", "User ID"], axis=1) #axis = 0 means row while axis = 1 means column

In [234]:
#Only in case the columns has space charater in their names, we use this function in order to format column names 
df = format_data(df)

In [235]:
df.head()

Unnamed: 0,Gender,Age,AnnualSalary,label
0,Male,35,20000,0
1,Male,40,43500,0
2,Male,49,74000,0
3,Male,40,107500,1
4,Male,25,79000,0


In [236]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Gender        1000 non-null   object
 1   Age           1000 non-null   int64 
 2   AnnualSalary  1000 non-null   int64 
 3   label         1000 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [238]:
#Execution of the algorithim
train_df, test_df = train_test_split(df, test_size=0.3)
tree = decision_tree_algorithm(train_df, max_depth= 3)
print("-------------- Decision tree learned ---------------------")
pprint(tree, width=60)
accuracy = calculate_accuracy(test_df, tree)
print("Accuracy of our tree", accuracy )

-------------- Decision tree learned ---------------------
{'Age <= 44': [{'AnnualSalary <= 90500': [0, 1]}, 1]}
Accuracy of our tree 0.8966666666666666


In [239]:
print("-------------- Subset using for learning the tree --------------")
train_df.head()

-------------- Subset using for learning the tree --------------


Unnamed: 0,Gender,Age,AnnualSalary,label
0,Male,35,20000,0
1,Male,40,43500,0
2,Male,49,74000,0
3,Male,40,107500,1
4,Male,25,79000,0


In [240]:
print("-------------- Subset tested by the tree --------------")
test_df

-------------- Subset tested by the tree --------------


Unnamed: 0,Gender,Age,AnnualSalary,label,classification,classification_correct
516,Female,59,29000,1,1,True
83,Female,27,17000,0,0,True
573,Female,42,61500,0,0,True
558,Female,19,21000,0,0,True
533,Male,39,96000,1,1,True
...,...,...,...,...,...,...
970,Female,58,101000,1,1,True
591,Male,52,76500,0,1,False
601,Male,53,72000,1,1,True
30,Male,45,26000,1,1,True


In [287]:
print("--------- We choose a random example -----------")
example = df.loc[random.randint(1,len(df))]
example

--------- We choose a random example -----------


Gender           Male
Age                58
AnnualSalary    75500
label               1
Name: 283, dtype: object

In [270]:
label = classify_example(example, tree)
print("Label:", label)

Label: 1


## Therapy Drug Used

In [293]:
df = pd.read_csv("../data/drug.csv")
df["label"] = df.Drug
df = df.drop(["Drug"], axis=1) #axis = 0 means row while axis = 1 means column

In [294]:
#Only in case the columns has space charater in their names, we use this function in order to format column names 
df = format_data(df)

In [295]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,label
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [296]:
#Check if our data is complete
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   label        200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [318]:
#Execution of the algorithim
train_df, test_df = train_test_split(df, test_size=0.2)
tree = decision_tree_algorithm(train_df)
print("-------------- Decision tree learned ---------------------")
pprint(tree, width=60)
accuracy = calculate_accuracy(test_df, tree)
print("Accuracy of our tree", accuracy )

-------------- Decision tree learned ---------------------
{'Na_to_K <= 14.642': [{'BP = HIGH': [{'Age <= 49': ['drugA',
                                                     'drugB']},
                                      {'BP = NORMAL': ['drugX',
                                                       {'Cholesterol = NORMAL': ['drugX',
                                                                                 'drugC']}]}]},
                       'drugY']}
Accuracy of our tree 0.975


In [319]:
print("-------------- Subset using for learning the tree --------------")
train_df

-------------- Subset using for learning the tree --------------


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,label
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
4,61,F,LOW,HIGH,18.043,drugY
5,22,F,NORMAL,HIGH,8.607,drugX
...,...,...,...,...,...,...
194,46,F,HIGH,HIGH,34.686,drugY
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX


In [320]:
print("-------------- Subset tested by the tree --------------")
test_df

-------------- Subset tested by the tree --------------


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,label,classification,classification_correct
61,24,M,HIGH,NORMAL,9.475,drugA,drugA,True
112,35,M,LOW,NORMAL,9.17,drugX,drugX,True
35,46,M,NORMAL,NORMAL,7.285,drugX,drugX,True
3,28,F,NORMAL,HIGH,7.798,drugX,drugX,True
107,42,M,LOW,HIGH,20.013,drugY,drugY,True
75,26,M,LOW,NORMAL,20.909,drugY,drugY,True
130,70,F,NORMAL,HIGH,20.489,drugY,drugY,True
53,24,F,HIGH,NORMAL,18.457,drugY,drugY,True
147,26,F,HIGH,NORMAL,12.307,drugA,drugA,True
145,61,M,NORMAL,HIGH,9.443,drugX,drugX,True


In [335]:
print("--------- We choose a random example -----------")
example = df.loc[random.randint(1,len(df))]
example

--------- We choose a random example -----------


Age                34
Sex                 F
BP                LOW
Cholesterol    NORMAL
Na_to_K        12.923
label           drugX
Name: 159, dtype: object

In [336]:
label = classify_example(example, tree)
print("Label:", label)

Label: drugX
