## Bagging trees function

## Importing libraries and dataframe

In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
)
from sklearn.tree import DecisionTreeClassifier



## Pre-Pruning

In [3]:
# Directory holding the Boston housing CSV files (absolute, machine-specific path).
path ="/mnt/c/Users/b_tib/coding/Msc/oLINGI2262/ml-classification-evaluation/decision-trees-bagging-randomforest/datasets"

# The first CSV column is used as the index; the last column is used as the target below.
train_df = pd.read_csv(f"{path}/BostonHouseTrain.csv", index_col=0)
test_df = pd.read_csv(f"{path}/BostonHouseTest.csv", index_col=0)

# One result row per fitted tree.
frame = pd.DataFrame(columns = ["min_samples_split", "NodeCount", "TrainAcc", "TestAcc"])

fracs = [1]  # alternative grid: [0.05, 0.1, 0.2, 0.5, 0.99]
# Float values of min_samples_split are read by scikit-learn as a fraction of the
# training samples, not as an absolute count.
prunes = [0.01, 0.05, 0.5]
n_runs = 1  # repetitions per (fraction, min_samples_split) pair

# The test split does not depend on any loop variable, so it is built once.
X_test = test_df.iloc[:, :-1]
Y_test = test_df.iloc[:, -1:]

# `i` is both the row index in `frame` and the seed used to sample the training set,
# so every fitted tree sees a differently-seeded sample.
i = 0
for f in fracs:
    for prune in prunes:
        for run in range(n_runs):
            train_df_frac = train_df.sample(frac=f, random_state=i)

            X_train = train_df_frac.iloc[:, :-1]
            Y_train = train_df_frac.iloc[:, -1:]

            clf = DecisionTreeClassifier(min_samples_split=prune, random_state=0)
            clf = clf.fit(X_train, Y_train)

            node = clf.tree_
            score_train = clf.score(X_train, Y_train)
            score_test = clf.score(X_test, Y_test)

            frame.loc[i] = [prune, node.node_count, score_train, score_test]
            i += 1

print(frame)



FileNotFoundError: [Errno 2] File /mnt/c/Users/b_tib/coding/Msc/oLINGI2262/ml-classification-evaluation/decision-trees-bagging-randomforest/datasets/BostonHouseTrain.csv does not exist: '/mnt/c/Users/b_tib/coding/Msc/oLINGI2262/ml-classification-evaluation/decision-trees-bagging-randomforest/datasets/BostonHouseTrain.csv'

## Post-Pruning

In [None]:
def plot_impurity_leafs(ccp_alphas, impurities):
    """Draw total leaf impurity against effective alpha as a step plot.

    The last entry of both sequences is left out of the plot.
    Creates a new matplotlib figure; nothing is returned.
    """
    alphas_shown = ccp_alphas[:-1]
    impurities_shown = impurities[:-1]

    _, axes = plt.subplots()
    axes.set_title("Total Impurity vs effective alpha for training set")
    axes.set_xlabel("effective alpha")
    axes.set_ylabel("total impurity of leaves")
    axes.plot(alphas_shown, impurities_shown, marker='o', drawstyle="steps-post")


def prunning_impurity(ccp_alphas, X_train, Y_train):
    """Fit one decision tree per cost-complexity pruning strength.

    Every tree is built with ``random_state=0`` and trained on the same
    ``X_train`` / ``Y_train``; only ``ccp_alpha`` varies.

    Returns the list of fitted classifiers, in the same order as ``ccp_alphas``.
    """
    return [
        DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, Y_train)
        for alpha in ccp_alphas
    ]

def plot_tree_vs_alpha(clfs):
    """Plot node count and depth of each fitted tree against its ``ccp_alpha``.

    The alpha of each tree is read from the classifier itself (``clf.ccp_alpha``)
    rather than from a module-level ``ccp_alphas`` variable, so the x-axis always
    matches the classifiers that were actually passed in.

    Input:
        clfs: fitted DecisionTreeClassifier objects, one per alpha
    Output:
        None; a new figure with two stacked step plots is created.
    """
    alphas = [clf.ccp_alpha for clf in clfs]
    node_counts = [clf.tree_.node_count for clf in clfs]
    depths = [clf.tree_.max_depth for clf in clfs]

    fig, (ax_nodes, ax_depth) = plt.subplots(2, 1)

    ax_nodes.plot(alphas, node_counts, marker='o', drawstyle="steps-post")
    ax_nodes.set_xlabel("alpha")
    ax_nodes.set_ylabel("number of nodes")
    ax_nodes.set_title("Number of nodes vs alpha")

    ax_depth.plot(alphas, depths, marker='o', drawstyle="steps-post")
    ax_depth.set_xlabel("alpha")
    ax_depth.set_ylabel("depth of tree")
    ax_depth.set_title("Depth vs alpha")

    # Keep the two stacked subplots from overlapping each other's labels.
    fig.tight_layout()

def plot_accuracies(clfs, max_gap=0.02):
    """Plot train/test accuracy per alpha and print the well-balanced trees.

    NOTE: the data is not passed in. ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` are read from module scope and must be defined before calling.
    The alpha of each tree is read from the classifier itself (``clf.ccp_alpha``).

    Input:
        clfs:    fitted DecisionTreeClassifier objects, one per alpha
        max_gap: a tree is listed in the printed table when the absolute
                 difference between its train and test accuracy is below
                 this value (default 0.02)
    Output:
        None; shows the accuracy plot and prints a table with columns
        ccp_alphas, NodeCount, TrainAcc and TestAcc.
    """
    alphas = [clf.ccp_alpha for clf in clfs]
    train_scores = [clf.score(X_train, Y_train) for clf in clfs]
    test_scores = [clf.score(X_test, Y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(alphas, train_scores, marker='o', label="train",
            drawstyle="steps-post")
    ax.plot(alphas, test_scores, marker='o', label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()

    print('Balanced training and test accuracy \n')
    # The last classifier is never listed in the table.
    # NOTE(review): presumably because the largest alpha prunes the tree down to
    # a single node - confirm this is intended.
    candidates = list(zip(alphas, clfs, train_scores, test_scores))[:-1]
    rows = [
        [alpha, clf.tree_.node_count, train_acc, test_acc]
        for alpha, clf, train_acc, test_acc in candidates
        if abs(train_acc - test_acc) < max_gap
    ]
    frame = pd.DataFrame(rows, columns=["ccp_alphas", "NodeCount", "TrainAcc", "TestAcc"])
    print(frame)


In [None]:
# Directory holding the Boston housing CSV files (absolute, machine-specific path).
path ="/mnt/c/Users/b_tib/coding/Msc/oLINGI2262/ml-classification-evaluation/decision-trees-bagging-randomforest/datasets"

train_df = pd.read_csv(f"{path}/BostonHouseTrain.csv", index_col=0)
test_df = pd.read_csv(f"{path}/BostonHouseTest.csv", index_col=0)

# The last column is used as the target, every other column as a feature.
X_train = train_df.iloc[:,:-1]
Y_train = train_df.iloc[:,-1:]

# plot_accuracies reads X_test / Y_test from module scope; they are defined here so
# this cell also works when the pre-pruning cell has not been run first.
X_test = test_df.iloc[:,:-1]
Y_test = test_df.iloc[:,-1:]

clf = DecisionTreeClassifier(random_state=0)

# NOTE: from here on `path` no longer holds the dataset directory but the result of
# the cost-complexity pruning path computation.
path = clf.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

plot_impurity_leafs(ccp_alphas, impurities) # total impurity by alpha
clfs = prunning_impurity(ccp_alphas, X_train, Y_train) # one fitted tree per ccp_alpha value
plot_tree_vs_alpha(clfs) # how tree nodes and depth vary with alpha
plot_accuracies(clfs) # plot train and test accuracies according to alpha

## Bagging trees 

In [None]:
def _majority_vote(votes):
    """Return the most frequent label in *votes* and the fraction of votes it received."""
    label, count = Counter(votes).most_common(1)[0]
    return label, count / len(votes)


def bagging_trees(X_t, Y_t, X_v, p):
    """
    Train a bag of decision trees on bootstrap samples and predict by majority vote.

    Input:
        X_t: features train (array-like, one row per sample)
        Y_t: labels train (array-like, one label per row of X_t)
        X_v: features validation (array-like, same columns as X_t)
        p:   trees parameters (sequence)
                0: epochs (>= 1)
                1: n_trees (>= 1)
                2: criterion
                3: min_samples_leaf
                4: max_depth
                5: min_samples_split
                6: max_leaf_nodes
    Output (tuple, in this order):
        v_final_predictions: majority-vote prediction for each row of X_v (list)
        unanimity_rates:     share of trees that voted for that prediction (list)
        acc:                 accuracy of the majority vote on the training set
        bcr:                 balanced accuracy on the training set
        f1:                  macro-averaged f1 score on the training set
        auc:                 macro-averaged ROC AUC on the training set
    Raises:
        ValueError: if epochs or n_trees is below 1, or if X_t and Y_t do not
                    contain the same number of samples.

    NOTE(review): every epoch trains a completely new bag and only the bag of the
    last epoch is used for the returned values - confirm whether epochs were meant
    to be aggregated instead.
    NOTE(review): the AUC is computed from hard majority-vote labels, not from
    scores; this call form presumably only works for binary labels - confirm.
    """
    n_epochs, n_trees = p[0], p[1]
    if n_epochs < 1 or n_trees < 1:
        raise ValueError(
            f"p[0] (epochs) and p[1] (n_trees) must both be >= 1, "
            f"got {n_epochs} and {n_trees}"
        )

    X_t_df = pd.DataFrame(X_t)
    X_v_df = pd.DataFrame(X_v)

    # Labels are kept as a flat positional array: pairing them with the features by
    # position avoids pandas index alignment silently turning labels into NaN when
    # X_t and Y_t carry different indexes.
    y_t = np.asarray(Y_t).ravel()
    n_samples = len(X_t_df)
    if len(y_t) != n_samples:
        raise ValueError(
            f"X_t has {n_samples} samples but Y_t has {len(y_t)} labels"
        )

    for _ in range(n_epochs):
        bag = []
        for _ in range(n_trees):
            # Bootstrap: draw n_samples row positions with replacement.
            rows = np.random.randint(n_samples, size=n_samples)
            tree = DecisionTreeClassifier(
                criterion=p[2],
                min_samples_leaf=p[3],
                max_depth=p[4],
                min_samples_split=p[5],
                max_leaf_nodes=p[6],
            )
            bag.append(tree.fit(X_t_df.iloc[rows], y_t[rows]))

    # Only the last bag is evaluated, so predicting once after the loop yields the
    # same values as predicting inside every epoch.
    # Transposed so that each row holds the n_trees votes for one sample.
    train_votes = np.array([tree.predict(X_t_df) for tree in bag]).T
    valid_votes = np.array([tree.predict(X_v_df) for tree in bag]).T

    ## Score on Training set
    t_final_predictions = [_majority_vote(votes)[0] for votes in train_votes]
    acc = accuracy_score(y_t, t_final_predictions)
    f1 = f1_score(y_t, t_final_predictions, average='macro')
    bcr = balanced_accuracy_score(y_t, t_final_predictions)
    auc = roc_auc_score(y_t, t_final_predictions, average='macro')

    ## Prediction on Validation set
    v_final_predictions = []
    unanimity_rates = []
    for votes in valid_votes:
        label, rate = _majority_vote(votes)
        v_final_predictions.append(label)
        unanimity_rates.append(rate)

    return v_final_predictions, unanimity_rates, acc, bcr, f1, auc