In [2]:
import numpy as np
import pandas as pd
import warnings
from sklearn import tree
import random
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [3]:
train_df = pd.read_csv('HW3_ML//p2_data//data_train.csv', header = None)
test_df = pd.read_csv('HW3_ML//p2_data//data_test.csv', header = None)

In [4]:
def random_forest_training(X, Y):
    criteration = ['gini', 'entropy']
    max_features = [12,13,14,15,16]
    min_samples_split = [1, 2, 3, 4, 5]
    trees = []
    for i in range(15):
        clf = tree.DecisionTreeClassifier(max_depth=3, criterion=random.sample(criteration, 1)[0], 
                                max_features=random.sample(max_features,1)[0], min_samples_leaf=random.sample(min_samples_split,1)[0])
        
        clf.fit(X, Y)
        trees.append(clf)
    return trees
    
def voting(trees, test_sample):
    predictions = []
    for loop_tree in trees:
        temp_prediction = loop_tree.predict(test_sample)
        predictions.append(temp_prediction[0])
    return max(set(predictions), key=predictions.count)

def testing(trees, test_samples, actual_values):
    count = 0.0
    total = 0.0
    predicted_values = []
    for i in range(len(test_samples)):
        pred = voting(trees, [test_samples[i]])
        if pred == actual_values[i]:
            count += 1
        total +=1
        predicted_values.append(pred)
    print(count/total)
    print(confusion_matrix(actual_values, predicted_values))


In [5]:
forest = random_forest_training(train_df.iloc[:, :-1].to_numpy(), train_df.iloc[:, -1].to_numpy())
testing(forest, test_df.iloc[:, :-1].to_numpy(), test_df.iloc[:, -1].to_numpy())

0.6463693539165237
[[312   0   1   0   0   0  21   6  23   0]
 [  0   0 200 137  14   0  13   0   0   0]
 [  0   0 333   6   5   0  15   5   0   0]
 [  0   0   2 325   1   0   0   8   0   0]
 [  0   0   2   6 356   0   0   0   0   0]
 [  0   0   0  98  68 159   0   1   9   0]
 [  5   0  14   6   7   0 297   3   4   0]
 [  0   0  30  21  29   4   0 278   2   0]
 [ 36   0   0   0   0  32  46  21 201   0]
 [  1   0   0 168 163   0   0   4   0   0]]


## adaBoost

In [7]:
def adaBoost_train(X, Y, time, the_class):
    m = len(X)
    classifiers = []
    trees_weight = np.ones((time))/m
    e = np.zeros((time))
    w = np.ones((time, m))/m
    Y = ((Y==the_class) - np.array([0.5]))*2
#     print(Y)
    for t in range(time):
        clf = tree.DecisionTreeClassifier(max_depth=1)
        clf.fit(X=X,y=Y, sample_weight=w[t])
        predicted = clf.predict(X)
        classifiers.append(clf)
        prediction_performance = (predicted == Y) + np.zeros((len(predicted)))
        e[t] = np.sum(np.squeeze(w[t])*np.squeeze(1-prediction_performance))
        trees_weight[t] = 1/2 * np.log((1-e[t])/e[t])
        if t != time-1:
            w[t+1] = 1/2*w[t]*1/(1-e[t])*prediction_performance + 1/2*w[t]*1/e[t]*(1-prediction_performance)
    return classifiers, trees_weight

def predict_adaBoost(X, Y, time, test_samples):
    results = np.zeros((len(test_samples)))
    for the_class in range(10):
        classifiers, trees_weight = adaBoost_train(X, Y, time, the_class)
        preds = []
        for j in range(len(test_samples)):
            test_sample = test_samples[j]
            result = 0
            for i in range(len(classifiers)):
                pred = classifiers[i].predict([test_sample])
                result += pred*trees_weight[i]
                preds.append(pred[0])
            result = (result[0]>0)*1 + (result[0]<=0)*-1
            if result == 1:
                results[j] = the_class

    return results

def normalize_weights(weights):
    mini = np.min(weights)
    maxi = np.max(weights)
    result = ((weights - mini)/maxi)
    result /= (np.sum(result))
    return result
    

In [224]:
for i in (5, 10, 20, 50):
    results = predict_adaBoost(train_df.iloc[:, :-1].to_numpy(), train_df.iloc[:, -1].to_numpy(), i, test_df.iloc[:, :-1].to_numpy())
    print(f'accuracy for {i} trees:{np.mean((test_df.iloc[:, -1].to_numpy() == results) + np.zeros((1)))}')


accuracy for 5 trees:0.6615208690680389
accuracy for 10 trees:0.6955403087478559
accuracy for 20 trees:0.7721555174385363
accuracy for 50 trees:0.8413379073756432


## xgboost

In [45]:
warnings.filterwarnings('ignore')
model = XGBClassifier(n_estimators = 600, max_depth = 2, n_jobs = 3, gamma = 0.001, base_score = 10)
model.fit(train_df.iloc[:, :-1].to_numpy(), train_df.iloc[:,-1].to_numpy())
result_xg = model.predict(test_df.iloc[:, :-1])
print(f'accuracy for xgboost model: {np.sum(result_xg==test_df.iloc[:,-1])/len(result_xg)}')


accuracy for xgboost model: 0.967409948542024
