In [387]:
import pandas as pd
import numpy as np
from copy import deepcopy
# Read both CSV splits the same way: the first column of each file becomes the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Train.csv", 'Test.csv')
)

In [388]:
# Module-level accumulators; three independent empty lists.
# `rulelist` is appended to by split() each time it creates a leaf.
dlist, err_list, rulelist = [], [], []

In [389]:
class Node:
    """A node of the binary tree built by this notebook.

    All fields are optional keyword arguments; any field that is not supplied
    is stored as ``None``. Positional arguments are accepted but ignored.

    Keyword Args:
        parent: node this one hangs under.
        left: left child node.
        right: right child node.
        decision: value stored on the node (presumably the prediction at a
            leaf -- nothing in the constructor enforces a meaning).
        attr: attribute name stored on the node.
        bound: threshold value stored on the node.
    """
    def __repr__(self):
        return "<Node Object with attribute "+str(self.attr)+" and bound "+str(self.bound)+" and decision "+str(self.decision)+">"
    def __init__(self, *args, **kwargs):
        # dict.get defaults to None for a missing key, which is what the
        # previous try / bare-except blocks emulated, without hiding
        # unrelated errors.
        self.parent = kwargs.get('parent')
        self.left = kwargs.get('left')
        self.right = kwargs.get('right')
        self.decision = kwargs.get('decision')
        self.attr = kwargs.get('attr')
        self.bound = kwargs.get('bound')

In [390]:
class Rule:
    """A single threshold test on one attribute.

    A truthy ``dec`` is rendered as ``attr > val``; a falsy ``dec`` is
    rendered as ``attr <= val``.
    """
    def __init__(self,attr,dec,val):
        self.attr=attr
        self.val=val
        self.dec=dec

    def __repr__(self):
        # Pick the comparison symbol from the direction flag.
        comparator = '>' if self.dec else '<='
        return "<{} {}{}>".format(str(self.attr), comparator, str(self.val))

In [391]:
def regressiontree():
    """Grow the tree on the module-level ``train`` frame.

    Every column of ``train`` except the last one is offered to ``split`` as
    a candidate attribute. The tree is hung on the left side of a fresh,
    otherwise empty ``Node``.

    Returns:
        tuple: ``(root, train, test, depth)`` where ``root`` is that fresh
        node, ``train`` / ``test`` are the module-level frames unchanged, and
        ``depth`` is the value returned by ``split``.
    """
    feature_names = train.columns[:-1].tolist()
    tree_root = Node()
    # Start with an empty rule path and depth -1 so the first real node is at depth 0.
    max_depth = split(train, tree_root, 'left', feature_names, [], -1)
    return (tree_root, train, test, max_depth)

In [392]:
def LS_ERROR(train, list_attr):
    """Sum of squared deviations of ``csMPa`` from its mean within ``train``.

    Args:
        train: DataFrame containing the columns named in ``list_attr`` and a
            ``csMPa`` column.
        list_attr: names of the input columns; used only to check whether the
            rows still differ in their inputs.

    Returns:
        ``0`` when ``train`` holds at most one distinct combination of input
        values (this includes an empty frame); otherwise the sum over all
        rows of ``(csMPa - mean(csMPa)) ** 2``.
    """
    if len(train[list_attr].drop_duplicates()) <= 1:
        return 0
    target = train['csMPa']
    # Work on the raw values instead of looking rows up by index label: this
    # stays a scalar even when index labels repeat, and NaN still propagates
    # into the result as it did with the explicit running sum.
    deviations = target.to_numpy() - target.mean()
    return (deviations ** 2).sum()

In [393]:
def WEIGHTED_LS(left, right, list_attr):
    """Size-weighted average of the ``LS_ERROR`` of two partitions.

    Each side's ``LS_ERROR`` is multiplied by that side's row count, the two
    products are added, and the total is divided by the combined row count.
    """
    n_left = len(left)
    n_right = len(right)
    weighted_total = LS_ERROR(left, list_attr)*n_left + LS_ERROR(right, list_attr)*n_right
    return weighted_total/(n_left+n_right)

In [394]:
def split(root_train, parent, side, list_attr,rule,depth):
    """Recursively grow the subtree for ``root_train`` and attach it to ``parent``.

    A new ``Node`` is created and stored as ``parent.left`` when ``side`` is
    ``'left'``, otherwise as ``parent.right``. If ``LS_ERROR`` of the data is
    0 the node is a leaf: a copy of ``rule`` (the conditions leading here) is
    appended to the module-level ``rulelist``. Otherwise every attribute in
    ``list_attr`` and every distinct value except the largest is tried as a
    ``<=`` / ``>`` cut, the cut with the smallest ``WEIGHTED_LS`` is kept, and
    both halves are split recursively.

    Args:
        root_train: DataFrame with the rows that reach this node.
        parent: node the new node is attached to.
        side: ``'left'`` attaches on the left; any other value on the right.
        list_attr: candidate attribute (column) names.
        rule: list of ``Rule`` objects describing the path to this node; it is
            deep-copied, never modified.
        depth: depth of ``parent``; the new node is at ``depth + 1``.

    Returns:
        The largest depth reached by any leaf below (and including) this node.
    """
    newrule=deepcopy(rule)
    a = Node(parent=parent)
    d=depth+1
    # Attach the node up front: both the leaf path and the internal-node path need it.
    if(side == 'left'):
        parent.left = a
    else:
        parent.right = a
    if(LS_ERROR(root_train, list_attr) == 0):
        # Leaf: record the path of conditions that leads here.
        rulelist.append(newrule)
        return d
    MIN_VAR = np.inf
    attr=None
    bound=0
    # Empty placeholders that keep root_train's own columns, so the function
    # is not tied to one dataset's column names. They are only used if no
    # candidate cut is found below.
    left = right = root_train.iloc[0:0]
    for attribute in list_attr:
        valid_set = sorted(root_train[attribute].unique().tolist())
        # The largest value is skipped: cutting there would leave the right side empty.
        for i in range(len(valid_set)-1):
            left_temp = root_train[root_train[attribute]<=valid_set[i]]
            right_temp = root_train[root_train[attribute]>valid_set[i]]
            v =WEIGHTED_LS(left_temp, right_temp, list_attr)
            if(v<MIN_VAR):
                left = left_temp
                right = right_temp
                attr = attribute
                # Store the midpoint between neighbouring values as the threshold.
                bound = (valid_set[i]+valid_set[i+1])/2
                MIN_VAR = v
    a.attr = attr
    a.bound = bound
    # Left branch: attr <= bound (dec = 0).
    newrule.append(Rule(attr,0,bound))
    d1=split(left, a, 'left', list_attr,newrule,d)
    # Right branch: start again from the incoming path, attr > bound (dec = 1).
    newrule=deepcopy(rule)
    newrule.append(Rule(attr,1,bound))
    d2=split(right, a, 'right', list_attr,newrule,d)
    return max(d1,d2)

In [395]:
# Build the tree. regressiontree() hands back the same train / test frames it
# read, so this rebinds them to themselves and adds `root` and `depth`.
root, train, test, depth = regressiontree()
print("Maximum depth of the tree=",depth)

Maximum depth of the tree= 13


In [396]:
# Print every root-to-leaf rule path, one per line, separated by blank lines.
for path_rules in rulelist:
    print(path_rules)
    print()

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement <=230.85>, <cement <=203.64999999999998>]

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement <=230.85>, <cement >203.64999999999998>]

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement >230.85>, <cement <=249.55>]

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement >230.85>, <cement >249.55>]

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.2

In [419]:
def prune():
    """Post-prune trailing conditions from a deep copy of ``rulelist``.

    For each rule, working from its last condition backwards (and never
    removing the first condition):

    * ``err1`` is the mean squared error on the ``test`` rows that satisfy
      the rule *without* its last condition, predicting the mean ``csMPa``
      of the ``train`` rows that satisfy the same conditions;
    * ``err2`` is the same quantity with the last condition applied.

    If ``err1 <= err2`` the last condition is dropped, ``'Pruned'`` is
    printed, and the next-to-last condition is examined. Pruning of a rule
    stops at the first condition that helps, or as soon as no ``test`` row
    matches. Reads the module-level ``rulelist``, ``train`` and ``test``;
    none of them is modified.

    Returns:
        list: the pruned copy of ``rulelist`` (same length, same order).
    """
    def _restrict(frame, cond):
        # Keep only the rows of `frame` that satisfy one Rule.
        if cond.dec:
            return frame[frame[cond.attr] > cond.val]
        return frame[frame[cond.attr] <= cond.val]

    def _mse(prediction, frame):
        # Mean squared error of a constant prediction on frame's csMPa column.
        # Raw values are used so no index reset is needed and NaN propagates.
        return np.mean((prediction - frame['csMPa'].to_numpy()) ** 2)

    ammendlist=deepcopy(rulelist)
    for rule in ammendlist:
        j=len(rule)  # number of conditions still under consideration
        while(j>1):
            df=test
            traindf=train
            # Subsets matching every condition except the last one.
            for cond in rule[:j-1]:
                traindf=_restrict(traindf, cond)
                df=_restrict(df, cond)
            if(df.shape[0]<=0):
                break
            err1=_mse(traindf['csMPa'].mean(), df)  # error with the last condition pruned
            last=rule[j-1]
            traindf=_restrict(traindf, last)
            df=_restrict(df, last)
            if(df.shape[0]<=0):
                break
            err2=_mse(traindf['csMPa'].mean(), df)  # error with the last condition kept
            if(err1<=err2):
                print('Pruned')
                rule.pop(j-1)
            else:
                break
            j=j-1
    return ammendlist

In [420]:
# Run the pruning pass; prune() works on a deep copy of rulelist and prints
# 'Pruned' once for every condition it removes.
ammendlist=prune()

Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned
Pruned

In [421]:
# Show only the rules that ended up shorter than their unpruned counterpart.
for position, pruned_rule in enumerate(ammendlist):
    if(len(rulelist[position])>len(pruned_rule)):
        print(pruned_rule)

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement <=230.85>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag >6.8>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag >6.8>, <cement >169.9>, <flyash <=172.95>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate >1006.3499999999999>, <flyash <=109.4>, <cement >203.6>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate >1054.25>, <cement <=207.75>, <cement <=190.5>, <slag <=21.05>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate >1054

In [422]:
# Drop duplicate rules from ammendlist, keeping the first occurrence of each
# and preserving order. Two rules are duplicates when they have the same
# length and agree on (attr, dec, val) at every position. Rule defines no
# __eq__/__hash__, so each rule is summarised as a tuple of those triples and
# checked against a set -- one pass instead of comparing every pair of rules.
newlist=[]
seen_rule_keys=set()
for candidate in ammendlist:
    rule_key=tuple((cond.attr, cond.dec, cond.val) for cond in candidate)
    if rule_key not in seen_rule_keys:
        seen_rule_keys.add(rule_key)
        newlist.append(candidate)

In [423]:
# Print the de-duplicated rules, one per line.
for kept_rule in newlist:
    print(kept_rule)

[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement <=230.85>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement <=230.85>, <cement >203.64999999999998>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement >230.85>, <cement <=249.55>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999999999>, <slag <=6.8>, <cement <=250.9>, <cement >230.85>, <cement >249.55>]
[<cement <=263.25>, <age <=42.0>, <age <=21.0>, <age <=10.5>, <water <=188.9>, <coarseaggregate <=1054.25>, <coarseaggregate <=1006.3499999

In [426]:
def accuracy():
    """Mean squared test error of every rule in ``newlist``.

    For each rule, ``train`` and ``test`` are narrowed to the rows that
    satisfy all of the rule's conditions. The prediction is the mean
    ``csMPa`` of the matching ``train`` rows; the score is the mean squared
    difference between that prediction and ``csMPa`` of the matching ``test``
    rows. Reads the module-level ``newlist``, ``train`` and ``test``; none of
    them is modified.

    Returns:
        list: one error per rule, in ``newlist`` order; ``np.inf`` where no
        ``test`` row matches the rule.
    """
    acclist=[]
    for rule in newlist:
        df=test
        traindf=train
        for cond in rule:  # apply every condition of the rule
            if(cond.dec):
                traindf=traindf[traindf[cond.attr]>cond.val]
                df=df[df[cond.attr]>cond.val]
            else:
                traindf=traindf[traindf[cond.attr]<=cond.val]
                df=df[df[cond.attr]<=cond.val]
        if(df.shape[0]<=0):
            acclist.append(np.inf)
            continue
        m=traindf['csMPa'].mean()
        # Use the raw values rather than reset_index(inplace=True): with an
        # empty rule `df` is still the global `test` frame, and an in-place
        # reset would silently rewrite its index.
        acclist.append(np.mean((m-df['csMPa'].to_numpy())**2))
    return acclist

In [427]:
# One test-set mean squared error per rule in newlist (np.inf where no test row matches).
acclist=accuracy()

In [428]:
# Display the per-rule errors; inf marks rules that matched no test row.
acclist

[2.7060250000000043,
 inf,
 20.884900000000002,
 inf,
 0.5776000000000023,
 12.060100000000006,
 inf,
 17.026544444444454,
 12.25,
 inf,
 inf,
 2.689599999999996,
 inf,
 inf,
 0.09000000000000043,
 inf,
 inf,
 inf,
 inf,
 inf,
 3.0976000000000057,
 inf,
 inf,
 inf,
 9.166624999999996,
 inf,
 inf,
 0.12697777777777763,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 0.014400000000000239,
 39.56409999999997,
 inf,
 46.306425000000004,
 65.2864,
 inf,
 2.1940812500000004,
 inf,
 inf,
 inf,
 1.3924000000000014,
 inf,
 0.40544444444444394,
 inf,
 inf,
 inf,
 0.16000000000000028,
 inf,
 inf,
 inf,
 0.08409999999999951,
 inf,
 inf,
 11.0889,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 19.873800000000006,
 inf,
 29.052100000000024,
 inf,
 inf,
 6.400900000000005,
 inf,
 1.3206229166666672,
 inf,
 inf,
 0.0529000000000002,
 inf,
 126.08765000000004,
 inf,
 inf,
 inf,
 inf,
 inf,
 6.916899999999995,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 47.94852499999996,
 inf,
 21.2

In [429]:
# Number of scored rules (accuracy() appends one entry per rule in newlist).
len(acclist)

662

In [432]:
# Pair each rule with its error, position by position.
tuples=[(newlist[idx],acclist[idx]) for idx in range(len(acclist))]

In [435]:
# Sort the (rule, error) pairs in place by error, smallest first; inf entries end up last.
tuples.sort(key=lambda x:x[1])

In [436]:
# Display the (rule, error) pairs in their sorted order.
tuples

[([<cement <=263.25>,
   <age <=42.0>,
   <age >21.0>,
   <slag <=43.650000000000006>,
   <water <=166.3>,
   <cement <=203.35>,
   <cement <=156.5>,
   <cement <=145.8>],
  0.0),
 ([<cement <=263.25>,
   <age <=42.0>,
   <age >21.0>,
   <slag <=43.650000000000006>,
   <water >166.3>,
   <cement <=165.55>,
   <water <=183.2>,
   <water <=181.3>,
   <flyash <=175.0>,
   <cement <=139.9>],
  0.0),
 ([<cement <=263.25>,
   <age <=42.0>,
   <age >21.0>,
   <slag <=43.650000000000006>,
   <water >166.3>,
   <cement <=165.55>,
   <water <=183.2>,
   <water <=181.3>,
   <flyash >175.0>,
   <superplasticizer <=14.5>,
   <cement <=157.5>,
   <cement >147.5>],
  0.0),
 ([<cement <=263.25>,
   <age <=42.0>,
   <age >21.0>,
   <slag <=43.650000000000006>,
   <water >166.3>,
   <cement <=165.55>,
   <water >183.2>,
   <coarseaggregate >945.0>,
   <cement <=153.2>],
  0.0),
 ([<cement <=263.25>,
   <age <=42.0>,
   <age >21.0>,
   <slag >43.650000000000006>,
   <water <=183.6>,
   <cement <=163.5>,
