In [1]:
import numpy as np
import pandas as pd 
from itertools import combinations

In [2]:
def file2csv(file_name):
    data = []
    keys = ['age', 'type_employer', 'fnlwgt', 'education', 'education_num', 'marital',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
            'hr_per_week', 'country', 'income']
    with open(file_name) as f:
        for line in f:
            attributes = line.strip().split(',')
            data_point = {}
            for idx, attribute in enumerate(attributes):
                data_point[keys[idx]] = np.nan if attribute.strip() == '?' else attribute.strip()
            data.append(data_point)

    df = pd.DataFrame(data)
    # write to csv
#     integer_atts = ['age', 'hr_per_week', 'capital_gain', 'capital_loss'] 
#     df[integer_atts] = df[integer_atts].astype(int)
    return df

In [3]:
df = file2csv('adult.train')

In [4]:
df[:20]

Unnamed: 0,age,type_employer,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,24,Private,176189,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
1,30,Private,161690,Assoc-voc,11,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,60,Private,145493,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
3,42,Private,144995,Preschool,1,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,25,United-States,<=50K
4,23,Private,50341,Masters,14,Never-married,Sales,Not-in-family,White,Female,0,0,20,United-States,<=50K
5,50,Self-emp-not-inc,68898,Masters,14,Married-civ-spouse,Sales,Husband,White,Male,7688,0,55,United-States,>50K
6,17,Private,148522,11th,7,Never-married,Other-service,Own-child,White,Male,0,1721,15,United-States,<=50K
7,47,Private,363418,Some-college,10,Never-married,Sales,Not-in-family,White,Male,0,0,70,United-States,>50K
8,49,Local-gov,119904,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,7688,0,30,United-States,>50K
9,45,Private,178341,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K


In [5]:
integer_atts = ['age', 'hr_per_week', 'capital_gain', 'capital_loss'] 
df[integer_atts] = df[integer_atts].astype(int)

In [6]:
df.dtypes

age               int64
type_employer    object
fnlwgt           object
education        object
education_num    object
marital          object
occupation       object
relationship     object
race             object
sex              object
capital_gain      int64
capital_loss      int64
hr_per_week       int64
country          object
income           object
dtype: object

In [7]:
df.shape

(4000, 15)

In [8]:
df = df.dropna()

In [9]:
df.shape

(3700, 15)

In [10]:
filtered_df = df.drop(['fnlwgt', 'education_num', 'relationship'], axis=1)

In [11]:
filtered_df[:10]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,24,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States,<=50K
1,30,Private,Assoc-voc,Never-married,Prof-specialty,White,Female,0,0,40,United-States,<=50K
2,60,Private,Some-college,Divorced,Adm-clerical,White,Female,0,0,40,United-States,<=50K
3,42,Private,Preschool,Never-married,Handlers-cleaners,White,Male,0,0,25,United-States,<=50K
4,23,Private,Masters,Never-married,Sales,White,Female,0,0,20,United-States,<=50K
5,50,Self-emp-not-inc,Masters,Married-civ-spouse,Sales,White,Male,7688,0,55,United-States,>50K
6,17,Private,11th,Never-married,Other-service,White,Male,0,1721,15,United-States,<=50K
7,47,Private,Some-college,Never-married,Sales,White,Male,0,0,70,United-States,>50K
8,49,Local-gov,Bachelors,Married-civ-spouse,Prof-specialty,White,Female,7688,0,30,United-States,>50K
9,45,Private,Masters,Married-civ-spouse,Prof-specialty,White,Male,0,0,50,United-States,<=50K


In [12]:
filtered_df.shape

(3700, 12)

In [13]:
filtered_df.loc[filtered_df['capital_gain'] > 0,'capital_gain'] = 'yes'  # yes > 0, no = 0
filtered_df.loc[filtered_df['capital_gain'] == 0, 'capital_gain'] = 'no'
filtered_df.loc[filtered_df['capital_loss'] > 0, 'capital_loss'] = 'yes'  # yes > 0, no = 0
filtered_df.loc[filtered_df['capital_loss'] == 0, 'capital_loss'] = 'no'
filtered_df.loc[filtered_df['country'] != 'United-States', 'country'] = 'other'  # United-States, other

In [14]:
filtered_df[:20]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,24,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,no,no,40,United-States,<=50K
1,30,Private,Assoc-voc,Never-married,Prof-specialty,White,Female,no,no,40,United-States,<=50K
2,60,Private,Some-college,Divorced,Adm-clerical,White,Female,no,no,40,United-States,<=50K
3,42,Private,Preschool,Never-married,Handlers-cleaners,White,Male,no,no,25,United-States,<=50K
4,23,Private,Masters,Never-married,Sales,White,Female,no,no,20,United-States,<=50K
5,50,Self-emp-not-inc,Masters,Married-civ-spouse,Sales,White,Male,yes,no,55,United-States,>50K
6,17,Private,11th,Never-married,Other-service,White,Male,no,yes,15,United-States,<=50K
7,47,Private,Some-college,Never-married,Sales,White,Male,no,no,70,United-States,>50K
8,49,Local-gov,Bachelors,Married-civ-spouse,Prof-specialty,White,Female,yes,no,30,United-States,>50K
9,45,Private,Masters,Married-civ-spouse,Prof-specialty,White,Male,no,no,50,United-States,<=50K


In [15]:
category_age = ['young', 'adult', 'senior', 'old']
filtered_df['age'] = pd.cut(x=filtered_df['age'],
                            bins=[0, 25, 45, 65, 90],
                            labels=category_age)
category_hour = ['part-time', 'full-time', 'over-time']
filtered_df['hr_per_week'] = pd.cut(x=filtered_df['hr_per_week'],
                                    bins=[0, 39, 40, 168],
                                    labels=category_hour)

In [16]:
filtered_df[:20]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,young,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,no,no,full-time,United-States,<=50K
1,adult,Private,Assoc-voc,Never-married,Prof-specialty,White,Female,no,no,full-time,United-States,<=50K
2,senior,Private,Some-college,Divorced,Adm-clerical,White,Female,no,no,full-time,United-States,<=50K
3,adult,Private,Preschool,Never-married,Handlers-cleaners,White,Male,no,no,part-time,United-States,<=50K
4,young,Private,Masters,Never-married,Sales,White,Female,no,no,part-time,United-States,<=50K
5,senior,Self-emp-not-inc,Masters,Married-civ-spouse,Sales,White,Male,yes,no,over-time,United-States,>50K
6,young,Private,11th,Never-married,Other-service,White,Male,no,yes,part-time,United-States,<=50K
7,senior,Private,Some-college,Never-married,Sales,White,Male,no,no,over-time,United-States,>50K
8,senior,Local-gov,Bachelors,Married-civ-spouse,Prof-specialty,White,Female,yes,no,part-time,United-States,>50K
9,adult,Private,Masters,Married-civ-spouse,Prof-specialty,White,Male,no,no,over-time,United-States,<=50K


In [17]:
filtered_df['type_employer'] = filtered_df['type_employer'].replace(['Federal-gov',
                                                                     'Local-gov',
                                                                     'State-gov'], 
                                                                    'gov')
filtered_df['type_employer'] = filtered_df['type_employer'].replace(['Without-pay',
                                                                     'Never-worked'],
                                                                    'Not-working')
filtered_df['type_employer'] = filtered_df['type_employer'].replace(['Self-emp-inc',
                                                                     'Self-emp-not-inc'],
                                                                    'Self-employed')

In [18]:
filtered_df[:20]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,young,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,no,no,full-time,United-States,<=50K
1,adult,Private,Assoc-voc,Never-married,Prof-specialty,White,Female,no,no,full-time,United-States,<=50K
2,senior,Private,Some-college,Divorced,Adm-clerical,White,Female,no,no,full-time,United-States,<=50K
3,adult,Private,Preschool,Never-married,Handlers-cleaners,White,Male,no,no,part-time,United-States,<=50K
4,young,Private,Masters,Never-married,Sales,White,Female,no,no,part-time,United-States,<=50K
5,senior,Self-employed,Masters,Married-civ-spouse,Sales,White,Male,yes,no,over-time,United-States,>50K
6,young,Private,11th,Never-married,Other-service,White,Male,no,yes,part-time,United-States,<=50K
7,senior,Private,Some-college,Never-married,Sales,White,Male,no,no,over-time,United-States,>50K
8,senior,gov,Bachelors,Married-civ-spouse,Prof-specialty,White,Female,yes,no,part-time,United-States,>50K
9,adult,Private,Masters,Married-civ-spouse,Prof-specialty,White,Male,no,no,over-time,United-States,<=50K


In [19]:
filtered_df['education'] = filtered_df['education'].replace(['Preschool',
                                                             '1st-4th',
                                                             '5th-6th',
                                                             '7th-8th',
                                                             '9th',
                                                             '10th',
                                                             '11th',
                                                             '12th'],
                                                            'BeforeHS')
filtered_df['education'] = filtered_df['education'].replace(['Prof-school',
                                                             'Assoc-acdm',
                                                             'Assoc-voc',
                                                             'Some-college'],
                                                           'AfterHS')
filtered_df['education'] = filtered_df['education'].replace(['Masters',
                                                             'Doctorate'],
                                                            'Grd')

In [20]:
filtered_df[:30]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,young,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,no,no,full-time,United-States,<=50K
1,adult,Private,AfterHS,Never-married,Prof-specialty,White,Female,no,no,full-time,United-States,<=50K
2,senior,Private,AfterHS,Divorced,Adm-clerical,White,Female,no,no,full-time,United-States,<=50K
3,adult,Private,BeforeHS,Never-married,Handlers-cleaners,White,Male,no,no,part-time,United-States,<=50K
4,young,Private,Grd,Never-married,Sales,White,Female,no,no,part-time,United-States,<=50K
5,senior,Self-employed,Grd,Married-civ-spouse,Sales,White,Male,yes,no,over-time,United-States,>50K
6,young,Private,BeforeHS,Never-married,Other-service,White,Male,no,yes,part-time,United-States,<=50K
7,senior,Private,AfterHS,Never-married,Sales,White,Male,no,no,over-time,United-States,>50K
8,senior,gov,Bachelors,Married-civ-spouse,Prof-specialty,White,Female,yes,no,part-time,United-States,>50K
9,adult,Private,Grd,Married-civ-spouse,Prof-specialty,White,Male,no,no,over-time,United-States,<=50K


In [21]:
filtered_df['marital'] = filtered_df['marital'].replace(['Married-AF-spouse',
                                                             'Married-civ-spouse'],
                                                            'Married')

filtered_df['marital'] = filtered_df['marital'].replace(['Married-spouse-absent',
                                                         'Separated',
                                                         'Divorced',
                                                         'Widowed'],
                                                        'Not-married')

In [22]:
filtered_df[:30]

Unnamed: 0,age,type_employer,education,marital,occupation,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,young,Private,HS-grad,Married,Machine-op-inspct,White,Male,no,no,full-time,United-States,<=50K
1,adult,Private,AfterHS,Never-married,Prof-specialty,White,Female,no,no,full-time,United-States,<=50K
2,senior,Private,AfterHS,Not-married,Adm-clerical,White,Female,no,no,full-time,United-States,<=50K
3,adult,Private,BeforeHS,Never-married,Handlers-cleaners,White,Male,no,no,part-time,United-States,<=50K
4,young,Private,Grd,Never-married,Sales,White,Female,no,no,part-time,United-States,<=50K
5,senior,Self-employed,Grd,Married,Sales,White,Male,yes,no,over-time,United-States,>50K
6,young,Private,BeforeHS,Never-married,Other-service,White,Male,no,yes,part-time,United-States,<=50K
7,senior,Private,AfterHS,Never-married,Sales,White,Male,no,no,over-time,United-States,>50K
8,senior,gov,Bachelors,Married,Prof-specialty,White,Female,yes,no,part-time,United-States,>50K
9,adult,Private,Grd,Married,Prof-specialty,White,Male,no,no,over-time,United-States,<=50K


In [23]:
filtered_df['occupation'] = filtered_df['occupation'].replace(['Tech-support',
                                                                   'Adm-clerical',
                                                                   'Priv-house-serv',
                                                                   'Protective-serv',
                                                                   'Armed-Forces',
                                                                   'Other-service'],
                                                                  'Other')

filtered_df['occupation'] = filtered_df['occupation'].replace(['Craft-repair', 'Farming-fishing',
                                                               'Handlers-cleaners',
                                                               'Machine-op-inspct',
                                                               'Transport-moving'],
                                                              'ManualWork')

In [None]:
filtered_df['Income'] = filtered_df['Income'].replace(['<=50K', '>50K'],
                                                     ['0', '1'])

In [24]:
filtered_df.type_employer.unique()

array(['Private', 'Self-employed', 'gov', 'Not-working'], dtype=object)

In [25]:
filtered_df.education.unique()

array(['HS-grad', 'AfterHS', 'BeforeHS', 'Grd', 'Bachelors'], dtype=object)

In [26]:
filtered_df.marital.unique()

array(['Married', 'Never-married', 'Not-married'], dtype=object)

In [27]:
filtered_df.occupation.unique()

array(['ManualWork', 'Prof-specialty', 'Other', 'Sales',
       'Exec-managerial'], dtype=object)

In [28]:
filtered_df.nunique()

age              4
type_employer    4
education        5
marital          3
occupation       5
race             5
sex              2
capital_gain     2
capital_loss     2
hr_per_week      3
country          2
income           2
dtype: int64

In [47]:
a = filtered_df.iloc[:, -1:].mode()['income'][0]

In [49]:
a.values[0]

'<=50K'

In [37]:
a.mode()

0    1418
1    2282
dtype: int64

11 atts + income (labels) \
Binary: country, capital loss, cappital gain, sex (4) \
Ordered: age, education, hr_per_week (3) \
Nominal: type_employer, martial, race, martial (4)

In [None]:
# class tree node for create a new node in tree
class Node(object):
    def __init__(self, attribute):
        self.left = None
        self.right = None
        self.parent = None
        self.attribute = attribute
#         self.gini_index = 0
        self.label = 0
        self.positive_labels = 0
        self.negative_label = 0

In [29]:
class BinaryDT(object):
    self.root = None
    self.min_sup = 3
    def gini_index(self, df):
        # get counting number for df
        count_0, count_1 = self.counting_label(df)
        # calculate calculate the gini_index for each node
        if count_0 == 0 or count_1 == 0:
            return 0
        total = count_0 + count_1
        gini_index = 1 - (count_0 / total)**2 - (count_1 / total)**2
        return gini_index
    
    def counting_label(self, df):
        counting = df['income'].value_counts().to_dict()
        
        return counting['0'], counting['1']
    
    def split(self, attributes_values, attribute_type):
        # attribute generate all posible left, right of a node
        # 0 binary, 1: orinal, 2: nominal
        # results = list of all potential outcomes: (left, right) tuple
        results = []
        if attribute_type == 0:
            g_left = attribute_values[0]
            g_right = attribute_values[1]
            results.append((g_left, g_right))
        elif attribute_type = 1
            # co the sort cai list xong roi increse dan dan
            # get all values of this attributes in order
            values = []
            for i in range(1, len(values) - 1):
                g_left = values[:i]
                g_right = values[i:]
                results.append((g_left, g_right))
        else:
            # gen combinations for left and set all - set(left)
            values = []
            for L in range(1, len(values) - 1):
                for com in combinations(values, L):
                    g_left = list(com)
                    g_right = list(set(values) - set(com))
                    results.append((g_left, g_right))        
        return results
    
    def stop(self, df, atts):
        # case 1: no more attribute
        count_0, count_1 = self.counting_label(df)
        if len(atts) == 0:
            return True
        # case 2: node is pure
        elif count_0 == 0 or count_1 == 0:
            return True
        # case 3: support below min sup
        elif count_0 < self.min_sup or count_1 < min_sup:
            return True
        return False

    def find_best_split(self, df, attributes):
        parent_gini = self.gini_index(df)
        best_gini_list = []
        best_split_attrs = []
        for attribute in attributes:
            # for each attribute find all potential outcomes
            df_attribute = df[attribute]
            N = len(df_attribute.index)
            attribute_values = []
            p_split = self.split(attribute_values, attribute_type=0)
            max_gain = -100
            best_split_att = None
            for left, right in p_split:
                # calculate gini for this split
                # get sub df for left group 
                df_left = df.loc[df[attribute].isin(left)]
                gini_left = self.gini_index(df_left)
                # get sub df for right group
                df_right = df.loc[df[attribute].isin(right)]
                gini_right = self.gini_index(df_right)
                # calculate weighted gini for each split and gain
                weighted_gini = (len(df_left.index) * gini_left + len(df_right.index) * gini_right) / N
                gain_gini = parent_gini - weighted_gini
                if gain_gini > max_gain:
                    # find best split for this attribute
                    max_gain = gain
                    best_split_att = (left, right)
                    
            best_gini_list.append(max_gain)
            best_split_attrs.append(best_split_att)
            
        best_gini = max(best_gini_list)
        best_attribute = attributes[attributes.index(best_gini)]
        best_split = best_split_attrs[attributes.index(best_gini)]
        return best_split, best_gini, best_attribute
        
    def classify(self, df):
        # classify by major voting
        label = df.iloc[:, -1:].mode()['income'][0]
        return label 
        
    def tree_grow(self, df, atts):
#         if self.stop() == True:
#             node = Node()
#             node.label = self.classify(df)
#             return Node
#         else:
        if True:
            # calculate parent attribute
            
            split, gini, cond_att = self.find_best_split(df, atts)
            root = Node(cond_att)
            # find al
            # find all data for left, 
            remain_atts = list(set(atts).remove(cond_att))
            g_left, g_right = split
            df_left = df.loc[df[cond_att].isin(g_left)]
            root.left = self.tree_grow(df_left, remain_atts)
            df_right = df.loc[df[cond_att].isin(g_right)]
            root.right = self.tree_grow(df_right, remain_atts)
        return root
        
    
    
    

NameError: name 'self' is not defined

In [None]:
3