# Decision tree with pandas and numpy libraries

In [35]:
import pandas as pd
import numpy as np
from pprint import pprint
data = pd.read_csv(r"income.csv")
data


Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31973,34,Local-gov,HS-grad,Never-married,Farming-fishing,Not-in-family,Black,Male,594,0,60,United-States,"less than or equal to 50,000"
31974,34,Local-gov,Some-college,Never-married,Protective-serv,Not-in-family,White,Female,0,0,40,United-States,"less than or equal to 50,000"
31975,23,Private,Some-college,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,"less than or equal to 50,000"
31976,42,Local-gov,Some-college,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,"less than or equal to 50,000"


In [16]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data["SalStat"] = le.fit_transform(data["SalStat"])

In [17]:
dataset = data[["JobType","capitalgain","capitalloss","hoursperweek","SalStat"]]

In [18]:
def entropy(target_col):

    elements,counts = np.unique(target_col,return_counts = True)
    
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    
    return entropy

In [19]:
def InfoGain(data,split_attribute_name,target_name="SalStat"):

    total_entropy = entropy(data[target_name])
    

    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    

    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

In [20]:
def ID3(data,originaldata,features,target_attribute_name="SalStat",parent_node_class = None):


    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]

    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]
    

    elif len(features) ==0:
        return parent_node_class
    
    
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]
        
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features] # features x1, x2,x3
        
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]

        tree = {best_feature:{}}
        
        

        features = [i for i in features if i != best_feature]
        
        for value in np.unique(data[best_feature]):
            value = value
            sub_data = data.where(data[best_feature] == value).dropna()
            
            subtree = ID3(sub_data,dataset,features,target_attribute_name,parent_node_class)

            tree[best_feature][value] = subtree
            
        return(tree)    
                

    
def predict(query,tree,default = 1):
    
    #1.
    for key in list(query.keys()):
        if key in list(tree.keys()):
            #2.
            try:
                result = tree[key][query[key]] 
            except:
                return default
  
            #3.
            result = tree[key][query[key]]
            #4.
            if isinstance(result,dict):
                return predict(query,result)

            else:
                return result

In [21]:
def train_test_split(dataset):
    training_data = dataset.iloc[:21425].reset_index(drop=True)

    testing_data = dataset.iloc[21425:].reset_index(drop=True)
    return training_data,testing_data

training_data = train_test_split(dataset)[0]
testing_data = train_test_split(dataset)[1] 



def test(data,tree):

    queries = data.iloc[:,:-1].to_dict(orient = "records")
    
    predicted = pd.DataFrame(columns=["predicted"]) 
    
    for i in range(len(data)):
        predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0) 
        
        
    return predicted["predicted"]


    
tree = ID3(training_data,training_data,training_data.columns[:-1])
# pprint(tree)
test(testing_data,tree)
y_pred = test(testing_data,tree)
y_pred = y_pred.astype(int).values

from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix(testing_data["SalStat"].values,y_pred)
cm1

array([[ 804, 1795],
       [  26, 7928]], dtype=int64)

In [22]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average = 'macro')))
    print("Recall Score: {}".format(recall_score(labels, preds, average = 'macro')))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average = 'macro')))
    
print_metrics(testing_data["SalStat"].values, y_pred)

Precision Score: 0.8920304482353976
Recall Score: 0.6530404771646277
Accuracy Score: 0.8274424334312518
F1 Score: 0.6829630824064625


# Decision Tree with Scikit-learn

In [27]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv(r"income.csv")
data.head()

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"


In [28]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data["SalStat"] = le.fit_transform(data["SalStat"])
data["gender"] = le.fit_transform(data["gender"])
data["JobType"] = data["JobType"].map({" ?":np.nan, " Never-worked":0, " Without-pay":1," Self-emp-not-inc":2," Self-emp-inc":3," Private":4," Local-gov":5," State-gov":6," Federal-gov":7," Without-pay":8})


In [29]:

Y = data.iloc[:,-1]

X = data[["JobType","capitalgain","capitalloss","hoursperweek"]]

In [30]:
from sklearn.impute import SimpleImputer
imputer  = SimpleImputer(missing_values = np.nan, strategy = "mean")
missing_data = X.values
missing_data = missing_data.reshape(len(missing_data),X.shape[1])
imputer = imputer.fit(missing_data)
X = imputer.transform(missing_data)
X1 = pd.DataFrame(X, columns = ["JobType","capitalgain","capitalloss","hoursperweek"] )


In [31]:
from sklearn.model_selection import train_test_split


x_train, x_test,y_train,y_test = train_test_split(X1,Y,test_size=0.33, random_state=0)

x_train = x_train.sort_index().values
y_train = y_train.sort_index().values
x_test = x_test.sort_index().values
y_test = y_test.sort_index().values

x_train.shape


(21425, 4)

In [32]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(x_train)

X_test = sc.transform(x_test)
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion = "entropy")

dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)

In [33]:
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix(y_test,y_pred)

cm1

array([[ 871, 1636],
       [ 186, 7860]], dtype=int64)

In [34]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average = 'macro')))
    print("Recall Score: {}".format(recall_score(labels, preds, average = 'macro')))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average = 'macro')))
    
print_metrics(y_test, y_pred)


Precision Score: 0.8258736039035308
Recall Score: 0.662155063510463
Accuracy Score: 0.8273476736473041
F1 Score: 0.6924558228761475
