# EXPERIMENT NO 14

## BAGGING

In [1]:
from random import seed
from random import randrange
from csv import reader


In [2]:
#Load a CSV file
def load_csv(filename):
    dataset=list()
    with open(filename,'r') as file:
        csv_reader=reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

#Converting string column to float
def str_column_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column].strip())
        


In [3]:
#Convert string column to integer
def str_column_to_int(dataset,column):
    class_values=[row[column] for row in dataset]
    unique=set(class_values)
    lookup=dict()
    for i,value in enumerate(unique):
        lookup[value]=i
    for row in dataset:
        row[column]=lookup[row[column]]
    return lookup

#Split a dataset into k folds
def cross_validation_split(dataset,n_folds):
    dataset_split=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/n_folds)
    for i in range(n_folds):
        fold=list()
        while len(fold)<fold_size:
            index=randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


In [4]:
#Calculate accuracy percentage
def accuracy_metric(actual,predicted):
    correct=0
    for i in range(len(actual)):
        if(actual[i]==predicted[i]):
            correct+=1
    return correct/float(len(actual))*100.0

#Evaluate an algorith using a cross validation split
def evaluate_algorithm(dataset,algorithm,n_folds,*args):
    folds=cross_validation_split(dataset,n_folds)
    scores=list()
    for fold in folds:
        train_set=list(folds)
        train_set.remove(fold)
        train_set=sum(train_set,[])
        test_set=list()
        for row in fold:
            row_copy=list(row)
            test_set.append(row_copy)
            row_copy[-1]=None
        predicted=algorithm(train_set,test_set,*args)
        actual=[row[-1] for row in fold]
        accuracy=accuracy_metric(actual,predicted)
        scores.append(accuracy)
    return scores


In [5]:
#Split a dataset on an attribute and an attribute value
def test_split(index,value,dataset):
    left,right=list(),list()
    for row in dataset:
        if row[index]<value:
            left.append(row)
        else:
            right.append(row)
            
    return left,right

#Calculate the Gini index for a split dataset
def gini_index(groups,classes):
    #Count all samples at split point
    n_instances=float(sum([len(group) for group in groups]))
    #Sum weighted Gini index for each group
    gini=0.0
    for group in groups:
        size=float(len(group))
        #avoid divide by zero
        if size==0:
            continue
        score=0.0
        #Score the group based on he score for each class
        for class_val in classes:
            p=[row[-1] for row in group].count(class_val)/size
            score+=p*p
        #Weigth the group score by its relative size
        gini+=(1-score)*(size/n_instances)
    return gini


In [6]:
#Select the best split point for a dataset
def get_split(dataset):
    class_values=list(set(row[-1] for row in dataset))
    b_index,b_value,b_score,b_groups=999,999,999,None
    
    for index in range(len(dataset[0])-1):
        for row in dataset:
            for i in range(len(dataset)):
                row=dataset[randrange(len(dataset))]
                groups=test_split(index,row[index],dataset)
                gini=gini_index(groups,class_values)
            if gini<b_score:
                b_index,b_value,b_score,b_groups=index,row[index],gini,groups
    return {'index':b_index,'value':b_value,'groups':b_groups}

#Create a terminal node value
def to_terminal(group):
    outcomes=[row[-1] for row in group]
    return max(set(outcomes),key=outcomes.count)


In [7]:
#Create child splits for a node or make terminal
def split(node,max_depth,min_size,depth):
    left,right=node['groups']
    del(node['groups'])
    #Check for a no split
    if not left or not right:
        node['left']=node['right']=to_terminal(left+right)
        return
    #Check for max depth
    if depth>=max_depth:
        node['left'],node['right']=to_terminal(left),to_terminal(right)
        return
    #process left child
    if len(right)<=min_size:
        node['left']=to_terminal(left)
    else:
        node['left']=get_split(left)
        split(node['left'],max_depth,min_size,depth+1)
    #process right child
    if len(right)<=min_size:
        node['right']=to_terminal(right)
    else:
        node['right']=get_split(right)
        split(node['right'],max_depth,min_size,depth+1)
        
#Build a Decision Tree
def build_tree(train,max_depth,min_size):
    root=get_split(train)
    split(root,max_depth,min_size,1)
    return root


In [8]:
#Make a prediction with a decision tree
def predict(node,row):
    if row[node['index']]<node['value']:
        if isinstance(node['left'],dict):
            return predict(node['left'],row)
        else:
            return node['left']
    else:
        if isinstance(node['right'],dict):
            return predict(node['right'],row)
        else:
            return node['right']
        
#Create a random subsample from the dataset with replacement
def subsample(dataset,ratio):
    sample=list()
    n_sample=round(len(dataset)*ratio)
    while len(sample)<n_sample:
        index=randrange(len(dataset))
        sample.append(dataset[index])
    return sample


In [9]:
#Make a prediction with a list of bagged trees
def bagging_predict(trees,row):
    predictions=[predict(tree,row) for tree in trees]
    return max(set(predictions),key=predictions.count)

#Bootstrap Aggregation Algorithm
def bagging(train,test,max_depth,min_size,sample_size,n_trees):
    trees=list()
    for i in range(n_trees):
        sample=subsample(train,sample_size)
        tree=build_tree(sample,max_depth,min_size)
        trees.append(tree)
    predictions=[bagging_predict(trees,row) for row in test]
    return (predictions)


In [10]:
#Test bagging on the sonar dataset
seed(1)
#load and prepare data
filename='/content/EXPERIMENT 14.csv'
dataset=load_csv(filename)
#convert string attribiutes to integers
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset,i)


In [11]:
#convert class columns to integers
str_column_to_int(dataset,len(dataset[0])-1)
#evaluate algorithm
n_folds=5
max_depth=6
min_size=2
sample_size=.50


In [12]:
for n_trees in [1,5,10,50]:
    scores=evaluate_algorithm(dataset,bagging,n_folds,max_depth,min_size,sample_size,n_trees)
    print('Trees:',n_trees)
    print('Scores:',scores)
    print("Mean Accuracy:",(sum(scores)/float(len(scores))))

Trees: 1
Scores: [80.48780487804879, 87.8048780487805, 63.41463414634146, 68.29268292682927, 75.60975609756098]
Mean Accuracy: 75.1219512195122
Trees: 5
Scores: [78.04878048780488, 85.36585365853658, 63.41463414634146, 70.73170731707317, 63.41463414634146]
Mean Accuracy: 72.19512195121952
Trees: 10
Scores: [82.92682926829268, 70.73170731707317, 73.17073170731707, 70.73170731707317, 78.04878048780488]
Mean Accuracy: 75.1219512195122
Trees: 50
Scores: [75.60975609756098, 75.60975609756098, 75.60975609756098, 82.92682926829268, 80.48780487804879]
Mean Accuracy: 78.04878048780488


# 