In [137]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

# Load Data

### Mushroom

In [231]:
mushroom_data = pd.read_csv('agaricus-lepiota.data', header=None, sep=',')
mushroom_data = pd.get_dummies(mushroom_data, columns=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])

eddible_nums = {0: {'e': 1, 'p': 0}}
mushroom_data.replace(eddible_nums, inplace=True)

print(mushroom_data.shape)

(8124, 118)


In [232]:
mushroom_X_train_80 = [None] * 3
mushroom_X_test_20 = [None] * 3    
mushroom_Y_train_80 = [None] * 3
mushroom_Y_test_20 = [None] * 3

for i in range(3):
    mushroom_data = mushroom_data.sample(frac=1).reset_index(drop=True)
    mushroom_X = mushroom_data.iloc[:,1:]   
    mushroom_Y = mushroom_data.iloc[:,0]
    
    mushroom_X_train_80[i], mushroom_X_test_20[i], mushroom_Y_train_80[i], mushroom_Y_test_20[i] = train_test_split(mushroom_X, mushroom_Y, test_size=0.2, random_state=0)

print(mushroom_X_train_80[0].shape, mushroom_X_test_20[0].shape, mushroom_Y_train_80[0].shape, mushroom_Y_test_20[0].shape)

(6499, 117) (1625, 117) (6499,) (1625,)


In [233]:
mushroom_X_train_50 = [None] * 3
mushroom_X_test_50 = [None] * 3    
mushroom_Y_train_50 = [None] * 3
mushroom_Y_test_50 = [None] * 3

for i in range(3):
    mushroom_data = mushroom_data.sample(frac=1).reset_index(drop=True)
    mushroom_X = mushroom_data.iloc[:,1:]   
    mushroom_Y = mushroom_data.iloc[:,0]
    
    mushroom_X_train_50[i], mushroom_X_test_50[i], mushroom_Y_train_50[i], mushroom_Y_test_50[i] = train_test_split(mushroom_X, mushroom_Y, test_size=0.5, random_state=0)

print(mushroom_X_train_50[0].shape, mushroom_X_test_50[0].shape, mushroom_Y_train_50[0].shape, mushroom_Y_test_50[0].shape)

(4062, 117) (4062, 117) (4062,) (4062,)


In [48]:
mushroom_X_train_20 = [None] * 3
mushroom_X_test_80 = [None] * 3   
mushroom_Y_train_20 = [None] * 3
mushroom_Y_test_80 = [None] * 3

for i in range(3):
    mushroom_data = mushroom_data.sample(frac=1).reset_index(drop=True)
    mushroom_X = mushroom_data.iloc[:,1:]   
    mushroom_Y = mushroom_data.iloc[:,0]

    mushroom_X_train_20[i], mushroom_X_test_80[i], mushroom_Y_train_20[i], mushroom_Y_test_80[i] = train_test_split(mushroom_X, mushroom_Y, test_size=0.8, random_state=0)

print(mushroom_X_train_20[0].shape, mushroom_X_test_80[0].shape, mushroom_Y_train_20[0].shape, mushroom_Y_test_80[0].shape)

(1624, 117) (6500, 117) (1624,) (6500,)


### Wine

In [90]:
wine_data = pd.read_csv('winequality-white.csv', sep=';')
wine_data = wine_data.iloc[:2000,:]

# mushroom_data = pd.read_csv('agaricus-lepiota.data', header=None, sep=',')
# mushroom_data = pd.get_dummies(mushroom_data, columns=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])

# eddible_nums = {0: {'e': 1, 'p': 0}}
# mushroom_data.replace(eddible_nums, inplace=True)

print(wine_data.shape)

(2000, 12)


In [91]:
wine_X_train_80 = [None] * 3
wine_X_test_20 = [None] * 3 
wine_Y_train_80 = [None] * 3
wine_Y_test_20 = [None] * 3

for i in range(3):
    wine_data = wine_data.sample(frac=1).reset_index(drop=True)
    wine_X = wine_data.iloc[:,:-1]   
    wine_Y = wine_data.iloc[:,-1]

    wine_X_train_80[i], wine_X_test_20[i], wine_Y_train_80[i], wine_Y_test_20[i] = train_test_split(wine_X, wine_Y, test_size=0.2, random_state=0)

print(wine_X_train_80[0].shape, wine_X_test_20[0].shape, wine_Y_train_80[0].shape, wine_Y_test_20[0].shape)

(1600, 11) (400, 11) (1600,) (400,)


In [92]:
wine_X_train_50 = [None] * 3
wine_X_test_50 = [None] * 3   
wine_Y_train_50 = [None] * 3
wine_Y_test_50 = [None] * 3

for i in range(3):
    wine_data = wine_data.sample(frac=1).reset_index(drop=True)
    wine_X = wine_data.iloc[:,:-1]   
    wine_Y = wine_data.iloc[:,-1]

    wine_X_train_50[i], wine_X_test_50[i], wine_Y_train_50[i], wine_Y_test_50[i] = train_test_split(wine_X, wine_Y, test_size=0.5, random_state=0)

print(wine_X_train_50[0].shape, wine_X_test_50[0].shape, wine_Y_train_50[0].shape, wine_Y_test_50[0].shape)

(1000, 11) (1000, 11) (1000,) (1000,)


In [110]:
wine_X_train_20 = [None] * 3
wine_X_test_80 = [None] * 3    
wine_Y_train_20 = [None] * 3
wine_Y_test_80 = [None] * 3

for i in range(3):
    wine_data = wine_data.sample(frac=1).reset_index(drop=True)
    wine_X = wine_data.iloc[:,:-1]   
    wine_Y = wine_data.iloc[:,-1]
    
    wine_X_train_20[i], wine_X_test_80[i], wine_Y_train_20[i], wine_Y_test_80[i] = train_test_split(wine_X, wine_Y, test_size=0.8, random_state=0)

print(wine_X_train_20[0].shape, wine_X_test_80[0].shape, wine_Y_train_20[0].shape, wine_Y_test_80[0].shape)

(400, 11) (1600, 11) (400,) (1600,)


### Dota

In [11]:
dota_data = pd.read_csv('dota2Train.csv', header=None, sep=',')
dota_data = dota_data.iloc[:5000,:]

print(dota_data.shape)

(5000, 117)


In [54]:
dota_X_train_80 = [None] * 3
dota_X_test_20 = [None] * 3    
dota_Y_train_80 = [None] * 3
dota_Y_test_20 = [None] * 3

for i in range(3):
    dota_data = dota_data.sample(frac=1).reset_index(drop=True)
    dota_X = dota_data.iloc[:,1:]   
    dota_Y = dota_data.iloc[:,0]

    dota_X_train_80[i], dota_X_test_20[i], dota_Y_train_80[i], dota_Y_test_20[i] = train_test_split(dota_X, dota_Y, test_size=0.2, random_state=0)

print(dota_X_train_80[0].shape, dota_X_test_20[0].shape, dota_Y_train_80[0].shape, dota_Y_test_20[0].shape)

(4000, 116) (1000, 116) (4000,) (1000,)


In [53]:
dota_X_train_50 = [None] * 3
dota_X_test_50 = [None] * 3    
dota_Y_train_50 = [None] * 3
dota_Y_test_50 = [None] * 3

for i in range(3):
    dota_data = dota_data.sample(frac=1).reset_index(drop=True)
    dota_X = dota_data.iloc[:,1:]   
    dota_Y = dota_data.iloc[:,0]

    dota_X_train_50[i], dota_X_test_50[i], dota_Y_train_50[i], dota_Y_test_50[i] = train_test_split(dota_X, dota_Y, test_size=0.5, random_state=0)

print(dota_X_train_50[0].shape, dota_X_test_50[0].shape, dota_Y_train_50[0].shape, dota_Y_test_50[0].shape)

(2500, 116) (2500, 116) (2500,) (2500,)


In [52]:
dota_X_train_20 = [None] * 3
dota_X_test_80 = [None] * 3    
dota_Y_train_20 = [None] * 3
dota_Y_test_80 = [None] * 3

for i in range(3):
    dota_data = dota_data.sample(frac=1).reset_index(drop=True)
    dota_X = dota_data.iloc[:,1:]   
    dota_Y = dota_data.iloc[:,0]

    dota_X_train_20[i], dota_X_test_80[i], dota_Y_train_20[i], dota_Y_test_80[i] = train_test_split(dota_X, dota_Y, test_size=0.8, random_state=0)

print(dota_X_train_20[0].shape, dota_X_test_80[0].shape, dota_Y_train_20[0].shape, dota_Y_test_80[0].shape)

(1000, 116) (4000, 116) (1000,) (4000,)


# Helper Methods

In [234]:
def draw_heatmap_linear(acc, acc_desc, C_list):
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', yticklabels=C_list, xticklabels=[])
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$C$')
    plt.title(acc_desc + ' w.r.t $C$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.show()

# Classifiers

In [249]:
C_list = [10**-3, 10**-2, 10**-1, 10**0]
G_list = [0.001, 0.005,0.01, 0.05]

## Linear SVM

### Mushroom

#### 80 / 20

In [250]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(mushroom_X_train_80[i].values, mushroom_Y_train_80[i].values)
    clfs.append(clf)
    print(i)

0
1
2


In [253]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(mushroom_X_train_80[i],mushroom_Y_train_80[i]).score(mushroom_X_test_20[i], mushroom_Y_test_20[i]))
    print(i)


0




1




2


In [254]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: [1.]


#### 50 / 50

In [19]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(mushroom_X_train_50[i].values, mushroom_Y_train_50[i].values)
    clfs.append(clf)
    print(i)

0
1
2


In [20]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(mushroom_X_train_50[i],mushroom_Y_train_50[i]).score(mushroom_X_test_50[i], mushroom_Y_test_50[i]))
    print(i)




0
1




2


In [21]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: [1.]


#### 20  / 80

In [22]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(mushroom_X_train_20[i].values, mushroom_Y_train_20[i].values)
    clfs.append(clf)
    print(i)

0
1
2


In [23]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(mushroom_X_train_20[i],mushroom_Y_train_20[i]).score(mushroom_X_test_80[i], mushroom_Y_test_80[i]))
    print(i)


0




1
2




In [24]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: [1.]


### Wine

#### 80 / 20

In [None]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(wine_X_train_80[i].values, wine_Y_train_80[i].values)
    clfs.append(clf)
    print(i)



In [None]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(wine_X_train_80[i],wine_Y_train_80[i]).score(wine_X_test_20[i], wine_Y_test_20[i]))
    print(i)


In [None]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

#### 50 / 50

In [None]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(wine_X_train_50[i].values, wine_Y_train_50[i].values)
    clfs.append(clf)
    print(i)

In [None]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(wine_X_train_50[i],wine_Y_train_50[i]).score(wine_X_test_50[i], wine_Y_test_50[i]))
    print(i)


In [None]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

#### 20  / 80

In [None]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(wine_X_train_20[i].values, wine_Y_train_20[i].values)
    clfs.append(clf)
    print(i)

In [None]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(wine_X_train_20[i],wine_Y_train_20[i]).score(wine_X_test_80[i], wine_Y_test_80[i]))
    print(i)


In [None]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

### Dota

#### 80 / 20

In [235]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(dota_X_train_80[i].values, dota_Y_train_80[i].values)
    clfs.append(clf)
    print(i)

0
1
2


In [236]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(dota_X_train_80[i],dota_Y_train_80[i]).score(dota_X_test_20[i], dota_Y_test_20[i]))
    print(i)




0




1




2


In [237]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: [0.62337498]


#### 50 / 50

In [None]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(dota_X_train_50[i].values, dota_Y_train_50[i].values)
    clfs.append(clf)
    print(i)

In [None]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(dota_X_train_50[i],dota_Y_train_50[i]).score(dota_X_test_50[i], dota_Y_test_50[i]))
    print(i)


In [None]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

#### 20  / 80

In [None]:
clfs = []

for i in range(3):
    clf = GridSearchCV(cv=5, estimator=svm.SVC(), param_grid=[{'C': C_list, 'kernel': ['linear'], 'gamma': G_list}], n_jobs=-1)
    clf.fit(dota_X_train_20[i].values, dota_Y_train_20[i].values)
    clfs.append(clf)
    print(i)

In [None]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    val_accs.append(max(clfs[i].cv_results_['mean_train_score'].reshape((16, 1))))
    test_accs.append(svm.SVC(C=clfs[i].best_estimator_.C, kernel='linear', gamma=clfs[i].best_estimator_.gamma).fit(dota_X_train_20[i],dota_Y_train_20[i]).score(dota_X_test_80[i], dota_Y_test_80[i]))
    print(i)


In [None]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

# Logistic Regression

### Mushroom

#### 80 / 20

In [147]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(mushroom_X_train_80[i].values, mushroom_Y_train_80[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [148]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(mushroom_X_train_80[i], mushroom_Y_train_80[i]))
    val_accs.append(logregs[i].score(mushroom_X_train_80[i], mushroom_Y_train_80[i]))
    test_accs.append(logregs[i].score(mushroom_X_test_20[i], mushroom_Y_test_20[i]))
    print(i)


0
1
2


In [149]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 1.0


#### 50 / 50

In [182]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(mushroom_X_train_50[i].values, mushroom_Y_train_50[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [186]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(mushroom_X_train_50[i], mushroom_Y_train_50[i]))
    val_accs.append(logregs[i].score(mushroom_X_train_50[i], mushroom_Y_train_50[i]))
    test_accs.append(logregs[i].score(mushroom_X_test_50[i], mushroom_Y_test_50[i]))
    print(i)


0
1
2


In [187]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 1.0


#### 20 / 80

In [153]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(mushroom_X_train_20[i].values, mushroom_Y_train_20[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [154]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(mushroom_X_train_20[i], mushroom_Y_train_20[i]))
    val_accs.append(logregs[i].score(mushroom_X_train_20[i], mushroom_Y_train_20[i]))
    test_accs.append(logregs[i].score(mushroom_X_test_80[i], mushroom_Y_test_80[i]))
    print(i)


0
1
2


In [155]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.9993846153846153


### Wine

#### 80 / 20

In [156]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(wine_X_train_80[i].values, wine_Y_train_80[i].values)
    logregs.append(logreg)
    print(i)



0




1
2


In [157]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(wine_X_train_80[i], wine_Y_train_80[i]))
    val_accs.append(logregs[i].score(wine_X_train_80[i], wine_Y_train_80[i]))
    test_accs.append(logregs[i].score(wine_X_test_20[i], wine_Y_test_20[i]))
    print(i)


0
1
2


In [158]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.5058333333333334


#### 50 / 50

In [159]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(wine_X_train_50[i].values, wine_Y_train_50[i].values)
    logregs.append(logreg)
    print(i)



0




1




2


In [160]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(wine_X_train_50[i], wine_Y_train_50[i]))
    val_accs.append(logregs[i].score(wine_X_train_50[i], wine_Y_train_50[i]))
    test_accs.append(logregs[i].score(wine_X_test_50[i], wine_Y_test_50[i]))
    print(i)


0
1
2


In [161]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.509


#### 20 / 80

In [162]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(wine_X_train_20[i].values, wine_Y_train_20[i].values)
    logregs.append(logreg)
    print(i)



ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: -1.0

In [163]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(wine_X_train_20[i], wine_Y_train_20[i]))
    val_accs.append(logregs[i].score(wine_X_train_20[i], wine_Y_train_20[i]))
    test_accs.append(logregs[i].score(wine_X_test_80[i], wine_Y_test_80[i]))
    print(i)


IndexError: list index out of range

In [164]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

IndexError: list index out of range

### Dota

#### 80 / 20

In [165]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(dota_X_train_80[i].values, dota_Y_train_80[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [166]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(dota_X_train_80[i], dota_Y_train_80[i]))
    val_accs.append(logregs[i].score(dota_X_train_80[i], dota_Y_train_80[i]))
    test_accs.append(logregs[i].score(dota_X_test_20[i], dota_Y_test_20[i]))
    print(i)


0
1
2


In [167]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.6168333333333333


#### 50 / 50

In [168]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(dota_X_train_50[i].values, dota_Y_train_50[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [169]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(dota_X_train_50[i], dota_Y_train_50[i]))
    val_accs.append(logregs[i].score(dota_X_train_50[i], dota_Y_train_50[i]))
    test_accs.append(logregs[i].score(dota_X_test_50[i], dota_Y_test_50[i]))
    print(i)


0
1
2


In [170]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.6244


#### 20 / 80

In [171]:
logregs = []

for i in range(3):
    logreg = LogisticRegressionCV(Cs= C_list, solver='liblinear', multi_class='ovr', n_jobs=-1, cv=5)
    logreg.fit(dota_X_train_20[i].values, dota_Y_train_20[i].values)
    logregs.append(logreg)
    print(i)

0
1
2


In [172]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(logregs[i].score(dota_X_train_20[i], dota_Y_train_20[i]))
    val_accs.append(logregs[i].score(dota_X_train_20[i], dota_Y_train_20[i]))
    test_accs.append(logregs[i].score(dota_X_test_80[i], dota_Y_test_80[i]))
    print(i)


0
1
2


In [173]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.6466666666666666


# Random Forest

### Mushroom

#### 80 / 20

In [174]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(mushroom_X_train_80[i].values, mushroom_Y_train_80[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [189]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(mushroom_X_train_80[i], mushroom_Y_train_80[i]))
    val_accs.append(rfs[i].score(mushroom_X_train_80[i], mushroom_Y_train_80[i]))
    test_accs.append(rfs[i].score(mushroom_X_test_20[i], mushroom_Y_test_20[i]))
    print(i)


0
1
2


In [191]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 1.0


#### 50 / 50

In [192]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(mushroom_X_train_50[i].values, mushroom_Y_train_50[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [193]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(mushroom_X_train_50[i], mushroom_Y_train_50[i]))
    val_accs.append(rfs[i].score(mushroom_X_train_50[i], mushroom_Y_train_50[i]))
    test_accs.append(rfs[i].score(mushroom_X_test_50[i], mushroom_Y_test_50[i]))
    print(i)


0
1
2


In [194]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 1.0


#### 20 / 80

In [195]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(mushroom_X_train_20[i].values, mushroom_Y_train_20[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [196]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(mushroom_X_train_20[i], mushroom_Y_train_20[i]))
    val_accs.append(rfs[i].score(mushroom_X_train_20[i], mushroom_Y_train_20[i]))
    test_accs.append(rfs[i].score(mushroom_X_test_80[i], mushroom_Y_test_80[i]))
    print(i)


0
1
2


In [197]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 1.0


### Wine

#### 80 / 20

In [198]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(wine_X_train_80[i].values, wine_Y_train_80[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [200]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(wine_X_train_80[i], wine_Y_train_80[i]))
    val_accs.append(rfs[i].score(wine_X_train_80[i], wine_Y_train_80[i]))
    test_accs.append(rfs[i].score(wine_X_test_20[i], wine_Y_test_20[i]))
    print(i)


0
1
2


In [201]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.8975


#### 50 / 50

In [202]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(wine_X_train_50[i].values, wine_Y_train_50[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [203]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(wine_X_train_50[i], wine_Y_train_50[i]))
    val_accs.append(rfs[i].score(wine_X_train_50[i], wine_Y_train_50[i]))
    test_accs.append(rfs[i].score(wine_X_test_50[i], wine_Y_test_50[i]))
    print(i)


0
1
2


In [204]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.868


#### 20 / 80

In [205]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(wine_X_train_20[i].values, wine_Y_train_20[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [208]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(wine_X_train_20[i], wine_Y_train_20[i]))
    val_accs.append(rfs[i].score(wine_X_train_20[i], wine_Y_train_20[i]))
    test_accs.append(rfs[i].score(wine_X_test_80[i], wine_Y_test_80[i]))
    print(i)


0
1
2


In [209]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.8470833333333333


### Dota

#### 80 / 20

In [228]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(dota_X_train_80[i].values, dota_Y_train_80[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [229]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(dota_X_train_80[i], dota_Y_train_80[i]))
    val_accs.append(rfs[i].score(dota_X_train_80[i], dota_Y_train_80[i]))
    test_accs.append(rfs[i].score(dota_X_test_20[i], dota_Y_test_20[i]))
    print(i)


0
1
2


In [230]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.8546666666666667


#### 50 / 50

In [213]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(dota_X_train_50[i].values, dota_Y_train_50[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [214]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(dota_X_train_50[i], dota_Y_train_50[i]))
    val_accs.append(rfs[i].score(dota_X_train_50[i], dota_Y_train_50[i]))
    test_accs.append(rfs[i].score(dota_X_test_50[i], dota_Y_test_50[i]))
    print(i)


0
1
2


In [215]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.8525333333333333


#### 20 / 80

In [216]:
rfs = []

for i in range(3):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(dota_X_train_20[i].values, dota_Y_train_20[i].values)
    rfs.append(rf)
    print(i)

0
1
2


In [217]:
train_accs = []
val_accs = []
test_accs = []

for i in range(3):
    train_accs.append(rfs[i].score(dota_X_train_20[i], dota_Y_train_20[i]))
    val_accs.append(rfs[i].score(dota_X_train_20[i], dota_Y_train_20[i]))
    test_accs.append(rfs[i].score(dota_X_test_80[i], dota_Y_test_80[i]))
    print(i)


0
1
2


In [218]:
accs = []

for i in range(3):
    accs.append((train_accs[i] + val_accs[i] + test_accs[i])/3)

print('Best Accuracy: ' + str(max(accs)))

Best Accuracy: 0.8496666666666667
