In [12]:
import numpy as np
import pandas as pd
import math
import csv

In [11]:
class TreeNode:
    """A single node of the decision tree.

    A node is a leaf when ``value`` is not None; otherwise it is a decision
    node whose ``childrens`` each correspond to one value of the column the
    node was split on.
    """

    def __init__(self, feature_index, feature_value, table, value, children):
        """
        feature_index -- column the PARENT split on to create this node (None for the root)
        feature_value -- value of that column on the branch leading to this node
        table         -- the rows that reached this node
        value         -- predicted label for a leaf, None for a decision node
        children      -- list of child nodes; None is accepted and treated as "no children"
        """
        self.feature_index = feature_index
        self.feature_value = feature_value
        # Always keep a list here so callers can iterate ``childrens`` on leaves
        # without special-casing None.
        self.childrens     = children if children is not None else []
        self.table         = table
        self.entropy       = None  # filled in by the tree builder for decision nodes
        self.parent        = None  # filled in by the tree builder once the parent exists
        self.value         = value

In [10]:
class TreeConstructor:
    """Entropy-based decision-tree classifier with one branch per feature value.

    Every DataFrame handed to this class is expected to carry the class label
    in its LAST column; all other columns are treated as categorical features.
    Nodes are instances of ``TreeNode``, which must be defined in the same
    namespace.
    """

    def __init__(self,maxdepth,minsamples):
        """
        maxdepth   -- deepest level at which a node may still be split (the root is level 1)
        minsamples -- minimum number of rows a node must hold to be split
        """
        self.root=None
        self.maxdepth=maxdepth
        self.minsamples=minsamples

    #############################################
    def classify(self,df):
        """Return the most frequent label in the last column of ``df``.

        ``np.unique`` returns the labels sorted and ``argmax`` picks the first
        maximum, so ties go to the smallest label.
        """
        unique_classes,counts = np.unique(df.values[:,-1],return_counts=True)
        return unique_classes[counts.argmax()]

    #############################################
    def check_pure(self,df):
        """Return True when every row of ``df`` carries the same label."""
        return len(np.unique(df.values[:,-1])) == 1

    #############################################
    def entropy(self,df):
        """Return the Shannon entropy (base 2) of the label column of ``df``."""
        _,counts  =  np.unique(df.values[:,-1],return_counts=True)
        probs     =  counts/counts.sum()
        return sum(probs*-np.log2(probs))

    #############################################
    def information_Gain(self,df,col):
        """Return the weighted label entropy that remains after splitting on ``col``.

        Despite the name this is the conditional entropy, not the gain, so
        LOWER is better; ``best_split`` minimises it.
        """
        data        = df[[col,df.columns[-1]]]
        allcount    = len(data)
        overall     = 0
        for u in np.unique(data[col].values):
            subset   = data[data[col]==u]  # filter once, reuse for weight and entropy
            overall += (len(subset)/allcount)*self.entropy(subset)

        return overall

    #############################################
    def split(self,df,col):
        """Partition ``df`` by the distinct values of ``col``.

        Returns a list of ``[col, value, sub_frame]`` entries, one per distinct
        value; ``col`` itself is dropped from every ``sub_frame``.
        """
        return [
            [col,value,df[df[col]==value].drop(col,axis=1)]
            for value in np.unique(df[col].values)
        ]

    #############################################
    def best_split(self,df):
        """Split ``df`` on the feature column with the lowest remaining entropy.

        Returns ``(splits, entropy)`` where ``splits`` is the result of
        ``split`` for the chosen column. The first column wins a tie.

        Raises ValueError when ``df`` has no feature column besides the label.
        """
        entropy  = math.inf
        best_col = None
        for col in df.columns[:-1]:
            col_entropy = self.information_Gain(df,col)
            if col_entropy < entropy:
                entropy  = col_entropy
                best_col = col

        if best_col is None:
            raise ValueError("best_split needs at least one feature column besides the label")

        return self.split(df,best_col),entropy

    #############################################
    def buildTree(self,df,currdepth,feature_index,feature_value):
        """Recursively build the subtree for the rows in ``df`` and return its root node.

        df            -- rows that reach this node (label in the last column)
        currdepth     -- level of this node; ``fit`` starts the root at 1
        feature_index -- column the parent split on (None for the root)
        feature_value -- value of that column on the branch leading here
        """
        sampleNo , featureNo = np.shape(df)
        featureNo -= 1  # the last column is the label, not a feature

        if self.check_pure(df):
            return TreeNode(feature_index=feature_index,feature_value=feature_value,table=df,value=self.classify(df),children=[])

        # Splitting consumes one column per level, so a node can run out of
        # features while still being impure; it must become a leaf then.
        can_split = featureNo > 0 and sampleNo >= self.minsamples and currdepth <= self.maxdepth
        if can_split:
            bestSplit,entropy = self.best_split(df)
            Nodes = [
                self.buildTree(sub_df,currdepth+1,col,value)
                for col,value,sub_df in bestSplit
            ]
            tree_node = TreeNode(feature_index=feature_index,feature_value=feature_value,table=df,value=None,children=Nodes)
            for Node in Nodes:
                Node.parent=tree_node
            tree_node.entropy=entropy
            return tree_node

        # Majority-vote leaf; an empty child list keeps it consistent with pure leaves.
        return TreeNode(feature_index=feature_index,feature_value=feature_value,table=df,value=self.classify(df),children=[])

    #############################################
    def print_tree(self,tree=None, indent="---------",depth=1):
        """Print the subtree rooted at ``tree`` (default: the fitted root).

        Each branch is printed as ``<indent> column == value`` and each leaf as
        ``<indent> label``; ``indent`` grows by one dash block per level.
        """
        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(indent,tree.value)
        else:
            for branches in tree.childrens or []:
                print(indent,branches.feature_index,"==",branches.feature_value)
                self.print_tree(branches,indent+"------------",depth+1)

    #############################################
    def make_prediction(self,inputs,tree):
        """Predict the label of a single-row DataFrame ``inputs`` from node ``tree``.

        When the row holds a feature value that no branch of the current node
        covers, the majority label of the rows stored at that node is returned
        instead of silently yielding None.
        """
        if tree.value is not None:
            return tree.value

        if tree.childrens:
            # All children of a node were created by splitting on the same column.
            feature_val = inputs[tree.childrens[0].feature_index].values[0]
            for branch in tree.childrens:
                if feature_val==branch.feature_value:
                    return self.make_prediction(inputs,branch)

        return self.classify(tree.table)

    #############################################
    def fit(self,df):
        """Build the tree from ``df`` (label in the last column) and store it in ``self.root``."""
        self.root  = self.buildTree(df,currdepth=1,feature_index=None,feature_value=None)

    #############################################
    def predict(self,inputs):
        """Return a list with one predicted label per row of the DataFrame ``inputs``."""
        return [
            self.make_prediction(inputs.iloc[[i]],self.root)
            for i in range(inputs.shape[0])
        ]
    #############################################

In [14]:
# Load the train/test splits; header=None makes pandas number the columns 0..N.
# Forward slashes resolve on Windows and POSIX alike and avoid the invalid
# "\p" escape sequence that the backslash-separated paths contained.
df = pd.read_csv("dataset/poker-hand-training-true.data", header=None)
df_test  = pd.read_csv("dataset/poker-hand-testing.data", header=None)




# cols=[]
# df=df[df[1]==10]
# df=df[df[3]==10]
# df=df[df[2]==2]
# df=df[df[4]==1]
# df

# df       = pd.read_csv("dataset\play_tennis.csv")
# df=df.drop(["day"],axis=1)


# # df[df["outlook"]=="Sunny"]
# # df[df.columns[1]]


In [8]:
# Grow the tree on the training frame: nodes may be split down to level 10
# and a single row is enough for a node to be considered for splitting.
classifier = TreeConstructor(maxdepth=10, minsamples=1)
classifier.fit(df)




# classifier.print_tree(indent="")


# arrasy = classifier.predict(df)
# pd.DataFrame(classifier.predict(df), columns=['play'])
#Error
# k = 0
# for i in classifier.predict(df):
#     if i == df.iloc[[k]].values[:,-1][0]:
#         print(i)
#         print(df.iloc[[k]].values[:,-1][0])
#     k+=1

# df.iloc[:,-1:]

In [9]:
from sklearn import datasets 
from sklearn.metrics import accuracy_score

# Predict only the first 100 test rows (prediction walks the tree row by row).
# The single output column is named 'play' whatever dataset is loaded.
array = classifier.predict(df_test.iloc[:100])
dff = pd.DataFrame(array, columns=['play'])


In [7]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
# Column 10 of the test frame holds the labels for the same 100 rows that were predicted.
score = accuracy_score(df_test.iloc[0:100, 10], dff)


In [858]:
def filter_df(df,feature_index,feature_value):
    """Return the rows of ``df`` whose ``feature_index`` column equals ``feature_value``."""
    return df[df[feature_index]==feature_value]


def postpruning(tree,df,df_test):
    """Reduced-error post-pruning of the subtree rooted at ``tree`` (in place).

    The tree is processed bottom-up. A decision node whose children are all
    leaves is collapsed into a leaf carrying the majority label of ``df`` when
    that leaf makes no more mistakes on ``df_test`` than the node does.

    tree    -- node with ``value``, ``childrens``, ``feature_index`` and
               ``feature_value`` attributes
    df      -- training rows that reach ``tree`` (label in the last column)
    df_test -- validation rows that reach ``tree`` (label in the last column)

    Returns ``tree`` itself, so the result can always be used as a tree node.
    Nodes without validation rows are left untouched because there is nothing
    to measure them against.
    """
    children = tree.childrens or []
    if tree.value is not None or not children:
        return tree  # already a leaf

    # Prune the children first. Every child gets its own slice of BOTH frames,
    # taken from this node's frames so siblings do not filter each other's rows.
    for child in children:
        postpruning(
            child,
            filter_df(df,child.feature_index,child.feature_value),
            filter_df(df_test,child.feature_index,child.feature_value),
        )

    still_has_subtree = any(child.value is None for child in children)
    if still_has_subtree or df.empty or df_test.empty:
        return tree

    # Majority label of the training rows; ties go to the smallest label.
    labels,counts = np.unique(df.values[:,-1],return_counts=True)
    leaf = labels[counts.argmax()]

    test_labels = df_test.values[:,-1]
    errors_leaf = int(np.sum(test_labels != leaf))

    # All children are leaves created by splitting on one column, so the node's
    # prediction is the label of the matching branch. A value that no branch
    # covers is scored as if the majority label had been predicted.
    split_col    = children[0].feature_index
    branch_label = {child.feature_value: child.value for child in children}
    errors_decision_node = sum(
        1
        for value,label in zip(df_test[split_col].values,test_labels)
        if branch_label.get(value,leaf) != label
    )

    if errors_leaf<=errors_decision_node:
        tree.value     = leaf
        tree.childrens = []
    return tree

In [908]:
# Show the tree as trained, prune it, then show the result for comparison.
classifier.print_tree(indent="")

# NOTE(review): the same frame is passed as training and as validation data — confirm this is intended.
tree = postpruning(classifier.root, df, df)

classifier.print_tree(tree, indent="")

 outlook == Overcast
------------ Yes
 outlook == Rain
------------ wind == Strong
------------------------ No
------------ wind == Weak
------------------------ Yes
 outlook == Sunny
------------ humidity == High
------------------------ No
------------ humidity == Normal
------------------------ Yes
