In [1]:
import pandas as pd
import numpy as np
from math import log
import operator
import pickle
import re

### Pandas Reading

In [2]:
data = pd.read_csv('adult.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# Needs adult_test.csv in the working directory; the recorded run of this cell
# stopped with FileNotFoundError because the file was missing.
# NOTE(review): `test` is rebound to a plain list by readTestData further down --
# confirm whether this DataFrame preview is still wanted under the same name.
test = pd.read_csv('adult_test.csv')

FileNotFoundError: [Errno 2] File adult_test.csv does not exist: 'adult_test.csv'

In [None]:
test.info()

In [None]:
data.head()

### Python Reading & Cleaning

#### Drop rows containing '?', convert the label to 1/-1, and drop the trailing blank row

In [None]:
def readData(filename, skip_header=False):
    """Load a ', '-separated census file into a list of cleaned rows.

    Every non-blank line is split on ', '. Fields made up only of digits are
    converted to ``int``; the labels '>50K' and '<=50K' become 1 and -1.
    Rows that contain a '?' field are discarded.

    Args:
        filename: Path of the text file to read.
        skip_header: When True, the first line of the file is discarded
            before parsing. Defaults to False.

    Returns:
        A list of rows, each a list of ``int``/``str`` values. An empty file
        yields an empty list.
    """
    label_map = {'>50K': 1, '<=50K': -1}
    data = []
    with open(filename, 'r') as f:
        if skip_header:
            f.readline()
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are skipped wherever they occur. Blindly deleting
                # the last row would lose a real record whenever the file does
                # not end with an empty line.
                continue
            row = [int(i) if i.isdigit() else label_map.get(i, i)
                   for i in line.split(', ')]
            if '?' in row:
                continue  # row has a missing value
            data.append(row)
    return data

In [None]:
def readTestData(filename, skip_header=False):
    """Load a ', '-separated census test file into a list of cleaned rows.

    Same parsing as the training reader, except that the labels carry a
    trailing period: '>50K.' becomes 1 and '<=50K.' becomes -1. Fields made
    up only of digits are converted to ``int`` and rows containing a '?'
    field are discarded.

    Args:
        filename: Path of the text file to read.
        skip_header: When True, the first line of the file is discarded
            before parsing. Defaults to False.

    Returns:
        A list of rows, each a list of ``int``/``str`` values. An empty file
        yields an empty list.
    """
    label_map = {'>50K.': 1, '<=50K.': -1}
    data = []
    with open(filename, 'r') as f:
        if skip_header:
            f.readline()
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are skipped wherever they occur. Blindly deleting
                # the last row would lose a real record whenever the file does
                # not end with an empty line.
                continue
            row = [int(i) if i.isdigit() else label_map.get(i, i)
                   for i in line.split(', ')]
            if '?' in row:
                continue  # row has a missing value
            data.append(row)
    return data

In [None]:
X = readData('adult.csv')

In [None]:
test = readTestData('adult_test.csv')

In [None]:
column_names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','income']

#### Remove Column native-country

In [None]:
# Drop native-country (index 13 of the 15 raw fields). The length guard keeps
# this cell idempotent: without it, running the cell a second time would strip
# the income label, which sits at index 13 after the first run.
X[:] = [row[:13] + row[14:] if len(row) > 14 else row for row in X]

In [None]:
# Drop native-country (index 13 of the 15 raw fields) from the test rows. The
# length guard keeps this cell idempotent: without it, a second run would strip
# the income label, which sits at index 13 after the first run.
test[:] = [row[:13] + row[14:] if len(row) > 14 else row for row in test]

In [None]:
# Coarsen column 2 (fnlwgt) of every training row to a multiple of 10000.
for row in X:
    row[2] = round(row[2] / 10000) * 10000

In [None]:
# Coarsen column 10 (capital-gain) of every training row to a multiple of 1000.
for row in X:
    row[10] = round(row[10] / 1000) * 1000

In [None]:
# Coarsen column 2 (fnlwgt) of every test row to a multiple of 10000.
for row in test:
    row[2] = round(row[2] / 10000) * 10000

In [None]:
# Coarsen column 10 (capital-gain) of every test row to a multiple of 1000.
for row in test:
    row[10] = round(row[10] / 1000) * 1000

### Find the Unique Values of a Column

In [None]:
def unique(rows, col):
    """Return the set of distinct values found at index ``col`` across ``rows``."""
    return {record[col] for record in rows}

### Count Labels of the Set

In [None]:
def label_classes(data):
    """Count how many rows carry each label.

    The label is the last element of a row and must be 1 or -1; any other
    value raises ``KeyError``.

    Returns:
        A dict ``{1: n_positive, -1: n_negative}``; both keys are always present.
    """
    tally = dict.fromkeys((1, -1), 0)
    for record in data:
        tally[record[-1]] += 1
    return tally

In [None]:
label_classes(X[0:10])

### Checking Attribute

In [None]:
class check:
    """A yes/no question about one column of a row.

    For a row whose value in column ``col`` is an ``int`` the question is
    "value >= val"; for any other value it is "value == val".
    """

    def __init__(self, col, val):
        self.col, self.val = col, val

    def check(self, pending):
        """Return True when the row ``pending`` satisfies this question."""
        candidate = pending[self.col]
        if not isinstance(candidate, int):
            return bool(candidate == self.val)
        return bool(candidate >= self.val)

    def __repr__(self):
        # The operator shown is chosen from the threshold's type, mirroring check().
        symbol = ">=" if isinstance(self.val, int) else "=="
        return "Is %s %s %s?" % (column_names[self.col], symbol, str(self.val))

In [None]:
c = check(9, 'Male') # check if it is male

In [None]:
pending = X[0]
c.check(pending)

In [None]:
c2 = check(0, 28) # check whether the age is greater than or equal to 28

In [None]:
pending1 = X[0]
c2.check(pending1)

### Partitioning Rows

In [None]:
def partition(check, data):
    """Split ``data`` by a question.

    Args:
        check: Object exposing ``check(row) -> bool``.
        data: Iterable of rows.

    Returns:
        ``(true_rows, false_rows)`` -- the rows for which the question holds
        and the rows for which it does not, each in their original order.
    """
    matched, unmatched = [], []
    for record in data:
        bucket = matched if check.check(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [None]:
t, f = partition(check(9, 'Female'), X[0:10])

In [None]:
t

In [None]:
f

In [None]:
def gini(data):
    """Gini impurity of a set of rows: 1 minus the sum of squared label shares.

    Label counts come from ``label_classes``.

    Args:
        data: Sequence of rows whose last element is the label.

    Returns:
        The impurity as a number; 0.0 for an empty set, which would otherwise
        divide by zero.
    """
    if len(data) == 0:
        return 0.0
    cnt = label_classes(data)
    total = float(len(data))
    impurity = 1
    for label in cnt:
        impurity -= (cnt[label] / total) ** 2
    return impurity

In [None]:
def gini_split(left, right):
    """Size-weighted average of the Gini impurities of the two sides of a split."""
    combined = len(left) + len(right)
    left_weight = float(len(left)) / combined
    left_term = left_weight * gini(left)
    right_term = (1 - left_weight) * gini(right)
    return left_term + right_term

In [None]:
t, f = partition(check(2, 80000), X)

In [None]:
gini_split(t, f)

In [None]:
def find_best_gini(data, n_features=13):
    """Search every (column, value) question for the lowest weighted Gini.

    Args:
        data: Rows to split; the label is the last element of each row.
        n_features: Number of leading columns to consider as split
            candidates. Defaults to 13.

    Returns:
        ``(best_split, best_check)``. When no question separates the rows
        into two non-empty parts the result is ``(1, None)``.
    """
    best_split = 1
    best_check = None

    for col in range(n_features):
        for value in unique(data, col):
            c = check(col, value)
            t, f = partition(c, data)
            if not t or not f:
                # A question that leaves one side empty does not split anything.
                continue

            gs = gini_split(t, f)

            # "<=" means that, among equally good questions, the one seen last wins.
            if gs <= best_split:
                best_split, best_check = gs, c

    return best_split, best_check

In [None]:
best_gs, best_check = find_best_gini(X)
best_check

In [None]:
# The previous cell already ran the (expensive) search and displayed the winning
# question; show the weighted Gini it achieved instead of repeating the search.
best_gs

In [None]:
class leaf_node:
    """Terminal node of the tree; stores the label counts of the rows that reached it."""

    def __init__(self, data):
        counts = label_classes(data)
        # Label counts as produced by label_classes.
        self.attribute = counts

In [None]:
class internal_node:
    """Decision node: a question plus the subtrees for its True and False outcomes."""

    def __init__(self, check, left_branch, right_branch):
        # left_branch is followed when the question holds, right_branch otherwise.
        self.check, self.left_branch, self.right_branch = check, left_branch, right_branch

In [None]:
def build_tree(data):
    """Recursively grow a decision tree from ``data``.

    Args:
        data: Rows whose last element is the label.

    Returns:
        A ``leaf_node`` when the rows share a single label (or there are no
        rows), or when no question can separate them; otherwise an
        ``internal_node`` whose left branch holds the rows answering True and
        whose right branch holds the rows answering False.
    """
    # Splitting a pure node only produces more leaves with the same label, so stop here.
    if len({row[-1] for row in data}) <= 1:
        return leaf_node(data)

    _, c = find_best_gini(data)

    # find_best_gini returns no check when every candidate leaves one side empty.
    if c is None:
        return leaf_node(data)

    t_set, f_set = partition(c, data)

    left_branch = build_tree(t_set)
    right_branch = build_tree(f_set)

    return internal_node(c, left_branch, right_branch)

In [None]:
def print_tree(node, spacing=""):
    """Print the tree rooted at ``node``, indenting each level by two spaces.

    Leaves are shown as "Predict <label counts>"; decision nodes show their
    question followed by the True and False subtrees.
    """
    if isinstance(node, leaf_node):
        print (spacing + "Predict", node.attribute)
        return

    print (spacing + str(node.check))

    deeper = spacing + "  "
    for heading, child in (('--> True:', node.left_branch),
                           ('--> False:', node.right_branch)):
        print (spacing + heading)
        print_tree(child, deeper)

In [None]:
dtree = build_tree(X)

In [None]:
print_tree(dtree)

In [None]:
def classifier(data, node):
    """Walk the tree from ``node`` for the row ``data`` and return the reached leaf's label counts."""
    current = node
    # Descend until a leaf is hit: left when the question holds, right otherwise.
    while not isinstance(current, leaf_node):
        if current.check.check(data):
            current = current.left_branch
        else:
            current = current.right_branch
    return current.attribute

In [None]:
def print_leaf(counts):
    """Turn a leaf's label counts into percentage strings and a predicted label.

    Args:
        counts: Dict mapping the labels 1 and/or -1 to row counts. A missing
            label is treated as a count of zero.

    Returns:
        ``(probs, pred)`` where ``probs`` maps each key of ``counts`` to a
        truncated percentage string such as ``"75%"`` and ``pred`` is 1 when
        label 1 has at least as many rows as label -1, otherwise -1. With no
        rows at all every percentage is ``"0%"``.
    """
    positives = counts.get(1, 0)
    negatives = counts.get(-1, 0)
    pred = 1 if positives >= negatives else -1  # ties go to the positive label

    total = float(positives + negatives)
    probs = {}
    for key in counts:
        share = int(counts[key] / total * 100) if total else 0
        probs[key] = str(share) + "%"
    return probs, pred

In [None]:
# Classify one test row with the tree built above (`dtree`); no variable named
# `tree` exists at this point in the notebook.
print_leaf(classifier(test[741], dtree))

In [None]:
def print_result(test, node=None):
    """Classify every row of ``test``, print each prediction and the overall accuracy.

    Args:
        test: Sequence of rows whose last element is the true label.
        node: Root of the decision tree to evaluate. When omitted, the
            notebook-level ``dtree`` is used.
    """
    if node is None:
        node = dtree

    correct = 0
    for row in test:
        # print_leaf turns the leaf's label counts into percentages and a predicted label.
        probs, pred = print_leaf(classifier(row, node))
        if row[-1] == pred:
            correct += 1
        print ("Actual: %s. Probs: %s. Predictions: %d" %
               (row[-1], probs, pred))

    # An empty test set reports zero accuracy instead of dividing by zero.
    accuracy = correct / len(test) if len(test) else 0.0
    print("Correct: %d, Wrong: %d, Accuracy: %f" % (correct, len(test)-correct, accuracy))

In [None]:
print_result(test)

### For Validation

In [None]:
train_set = pd.DataFrame(X)

In [None]:
train_set

In [None]:
# Reuse the shared column list so the frame's labels stay in step with the
# names used by the hand-written tree.
train_set.columns = column_names

In [None]:
train_set[0:10]

In [None]:
train_set.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cor_data = train_set.copy()

In [None]:
# Integer-encode each categorical column of the correlation copy with the shared encoder.
for col in ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex']:
    cor_data[col] = le.fit_transform(cor_data[col])

In [None]:
cor_data.corr()

In [None]:
cor_data.corr()['income'].sort_values()

In [None]:
# Integer-encode each categorical column of the training frame with the shared encoder.
for col in ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex']:
    train_set[col] = le.fit_transform(train_set[col])

In [None]:
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state makes the fitted tree, and every metric derived from it,
# reproducible across kernel restarts.
alltree = DecisionTreeClassifier(random_state=0)
X_train = train_set.drop(['income'],axis=1)
y_train = train_set['income']
alltree.fit(X_train,y_train)

In [None]:
X_test =pd.DataFrame(test)

In [None]:
# Reuse the shared column list so the test frame is labelled exactly like the
# training frame.
X_test.columns = column_names

In [None]:
# Encode the test categoricals with the vocabulary of the TRAINING rows. Fitting a
# fresh encoder on the test split can give the same category a different integer
# whenever the two splits do not contain exactly the same set of values, which
# would silently corrupt the features fed to the fitted tree.
# `X` still holds the raw string categories; a category seen only in the test
# split makes `transform` raise ValueError instead of being mis-encoded.
raw_train = pd.DataFrame(X, columns=list(X_test.columns))
for col in ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex']:
    # Columns that are already numeric were encoded by an earlier run of this cell.
    if not pd.api.types.is_numeric_dtype(X_test[col]):
        X_test[col] = LabelEncoder().fit(raw_train[col]).transform(X_test[col])

In [None]:
x_test = X_test.drop(['income'],axis=1)
y_test = X_test['income']

In [None]:
predictions = alltree.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt
# Draw the fitted scikit-learn tree on an explicitly created, large axes.
fig, ax = plt.subplots(figsize=(30, 20))
tree.plot_tree(alltree, ax=ax, fontsize=10)
plt.show()