# Import Statements and Data

In [84]:
import pandas
import time
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acc
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.preprocessing import MinMaxScaler as MMS
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.linear_model import SGDClassifier as SGDC


dota_data = pandas.read_csv('dota2Train.csv', header = None)
data = dota_data.iloc[:,1:]
target = dota_data.iloc[:,0]
(training_data, validation_data, training_target, validation_target) = tts(data, target, test_size = .15)

# MinMaxScaler

In [69]:
mms = MMS()
training_data_temp = mms.fit_transform(training_data)
validation_data_temp = mms.fit_transform(validation_data)
# training_target_temp = mms.fit_transform(training_target)
# validation_target_temp = mms.fit_transform(validation_target)

training_data_mms = pandas.DataFrame(data=training_data_temp[:,:])
validation_data_mms = pandas.DataFrame(data=validation_data_temp[:,:])
training_target_mms = training_target.reshape(-1,1)
validation_target_mms = validation_target.reshape(-1,1)
# training_target_mms = pandas.DataFrame(data=training_target_temp[:,:])
# validation_target_mms = pandas.DataFrame(data=validation_target_temp[:,:])
training_target_mms

array([[-1],
       [ 1],
       [-1],
       ..., 
       [ 1],
       [-1],
       [ 1]], dtype=int64)

# PCA

In [51]:
# http://stackoverflow.com/questions/32857029/python-scikit-learn-pca-explained-variance-ratio-cutoff
pca = PCA(n_components=10, whiten=True)
pca.fit(training_data)
# print(pca.explained_variance_)
# print(pca.explained_variance_ratio_)
# print(pca.explained_variance_ratio_.cumsum())

training_data_temp2 = pca.transform(training_data)
validation_data_temp2 = pca.transform(validation_data)

training_data_pca = pandas.DataFrame(data=training_data_temp2[:,:])
validation_data_pca = pandas.DataFrame(data=validation_data_temp2[:,:])

# Random Forest Classifier

In [52]:
rfc = RFC(n_jobs=-1, n_estimators=100, max_features=0.5)
rfc.fit(training_data,training_target)
rfc_pred = rfc.predict(validation_data)
print(acc(rfc_pred, validation_target))

0.569146639804


# Perceptron

In [53]:
perc = Perceptron(n_jobs=-1, n_iter=1001,eta0=0.1)
perc.fit(training_data,training_target)
perc_pred = perc.predict(validation_data)
print(acc(perc_pred, validation_target))

0.467045618075


# Multilayer Perceptron

In [54]:
mlpc = MLPC(hidden_layer_sizes=(115,115,115,115,115,115), activation='tanh', max_iter=500)
mlpc.fit(training_data,training_target)
mlpc_pred = mlpc.predict(validation_data)
print(acc(mlpc_pred, validation_target))

0.596272845014


# Naive Bayes

## Bernoulli

In [55]:
bnb = BNB()
bnb.fit(training_data,training_target)
bnb_pred = bnb.predict(validation_data)
print(acc(bnb_pred, validation_target))

0.573104043747


## Gaussian

In [56]:
gnb = GNB()
gnb.fit(training_data,training_target)
gnb_pred = gnb.predict(validation_data)
print(acc(gnb_pred, validation_target))

0.553101165635


## Multinomial

In [74]:
mnb = MNB()
mnb.fit(training_data_mms,training_target_mms)
mnb_pred = mnb.predict(validation_data_mms)
print(acc(mnb_pred, validation_target_mms))

0.535041013095


  y = column_or_1d(y, warn=True)


# Support Vector Machine

## RBF

In [None]:
start_time = time.time()
svc = SVC(C=0.25)
svc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
svc_pred = svc.predict(validation_data)
print(acc(svc_pred, validation_target))
print("--- %s seconds ---" % (time.time() - start_time))

--- Training took 978.3524339199066 seconds ---
0.583393293999
--- 1071.2892246246338 seconds ---


## Sigmoid

In [6]:
start_time = time.time()
sigmoid_svc = SVC(kernel='sigmoid')
sigmoid_svc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
sigmoid_svc_pred = sigmoid_svc.predict(validation_data)
print(acc(sigmoid_svc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 1085.9452102184296 seconds ---
0.519715066916
--- Total time: 1223.8227014541626 seconds ---


## Linear

In [5]:
start_time = time.time()
linear_svc = SVC(kernel='linear', C=0.05)
linear_svc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
linear_svc_pred = linear_svc.predict(validation_data)
print(acc(linear_svc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 1164.3281786441803 seconds ---
0.603396172111
--- Total time: 1243.3781158924103 seconds ---


### Dumping linear kernel

In [6]:
# joblib.dump(linear_svc, 'linear_svc_c0.05.pkl')
# linear_svc = joblib.load('linear_svc.pkl')

['linear_svc_c0.05.pkl']

## Polynomial

In [13]:
start_time = time.time()
poly_svc = SVC(kernel='poly', C=0.1, degree=2, gamma=0.005)
# print(poly_svc.get_params())
poly_svc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
poly_svc_pred = poly_svc.predict(validation_data)
print(acc(poly_svc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 1408.5175392627716 seconds ---
0.598215570586
--- Total time: 1489.2745802402496 seconds ---


### Dumping poly kernel

In [14]:
joblib.dump(poly_svc, 'poly_svc_d2_g.005_c.1.pkl')

['poly_svc_d2_g.005_c.1.pkl']

# Working with PCA data

## Linear SVM

In [31]:
start_time = time.time()
lin_svc_pca = SVC(kernel='linear', C=0.1, gamma=0.001)
lin_svc_pca.fit(training_data_pca,training_target_for_pca)
print("--- Training took %s seconds ---" % (time.time() - start_time))
lin_svc_pred_pca = lin_svc_pca.predict(validation_data_pca)
print(acc(lin_svc_pred_pca, validation_target_for_pca))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 121.50322651863098 seconds ---
0.529500647575
--- Total time: 129.50003266334534 seconds ---


## Poly SVM

In [32]:
start_time = time.time()
poly_svc_pca = SVC(kernel='poly', degree=2, C=0.1, gamma=0.001)
poly_svc_pca.fit(training_data_pca,training_target_for_pca)
print("--- Training took %s seconds ---" % (time.time() - start_time))
poly_svc_pred_pca = poly_svc_pca.predict(validation_data_pca)
print(acc(poly_svc_pred_pca, validation_target_for_pca))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 157.09088730812073 seconds ---
0.529500647575
--- Total time: 167.30319595336914 seconds ---


## RBF SVM

In [33]:
start_time = time.time()
rbf_svc_pca = SVC(kernel='rbf', C=0.1, gamma=0.001)
rbf_svc_pca.fit(training_data_pca, training_target_for_pca)
print("--- Training took %s seconds ---" % (time.time() - start_time))
rbf_svc_pred_pca = rbf_svc_pca.predict(validation_data_pca)
print(acc(rbf_svc_pred_pca, validation_target_for_pca))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 218.41278553009033 seconds ---
0.529500647575
--- Total time: 236.5156762599945 seconds ---


# Logistic Regression

In [26]:
start_time = time.time()
log_reg = LR(solver='sag', max_iter=500, C=0.05)
# log_reg = LR(C=0.05)
log_reg.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
log_reg_pred = log_reg.predict(validation_data)
print(acc(log_reg_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))
# 0.598287523385, 0.598143617787

--- Training took 32.70646333694458 seconds ---
0.598287523385
--- Total time: 32.71646428108215 seconds ---


# K Nearest Neighbor

In [48]:
start_time = time.time()
knc = KNC(weights='distance')
knc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
knc_pred = knc.predict(validation_data)
print(acc(knc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 2.882268190383911 seconds ---
0.525831054828
--- Total time: 44.47872018814087 seconds ---


# Loading in Pickled kernels

In [9]:
lin_svc_1 = joblib.load('Pickled Kernels/linear_svc.pkl')
lin_svc_2 = joblib.load('Pickled Kernels/linear_svc_c0.05.pkl')
lin_svc_3 = joblib.load('Pickled Kernels/linear_svc_c0.10.pkl')
lin_svc_4 = joblib.load('Pickled Kernels/linear_svc_c0.25.pkl')

poly_svc_1 = joblib.load('Pickled Kernels/poly_svc_c0.10.pkl')
poly_svc_2 = joblib.load('Pickled Kernels/poly_svc_c0.25.pkl')
poly_svc_3 = joblib.load('Pickled Kernels/poly_svc_c0.10_d2.pkl')
poly_svc_4 = joblib.load('Pickled Kernels/poly_svc_d2_g.001_c.1.pkl')
poly_svc_5 = joblib.load('Pickled Kernels/poly_svc_d2_g.005_c.1.pkl')

# Predicting and testing pickled kernels

In [11]:
# l1 = lin_svc_1.predict(validation_data)
l2 = lin_svc_2.predict(validation_data)
l3 = lin_svc_3.predict(validation_data)
l4 = lin_svc_4.predict(validation_data)

p1 = poly_svc_1.predict(validation_data)
p2 = poly_svc_2.predict(validation_data)
p3 = poly_svc_3.predict(validation_data)
p4 = poly_svc_4.predict(validation_data)
p5 = poly_svc_5.predict(validation_data)

# print('Linear SVC 1:', acc(l1, validation_target))
print('Linear SVC 2:', acc(l2, validation_target))
print('Linear SVC 3:', acc(l3, validation_target))
print('Linear SVC 4:', acc(l4, validation_target))

# print('Polynomial SVC 1:', acc(p1, validation_target))
# print('Polynomial SVC 2:', acc(p2, validation_target))
print('Polynomial SVC 3:', acc(p3, validation_target))
print('Polynomial SVC 4:', acc(p4, validation_target))
# print('Polynomial SVC 5:', acc(p5, validation_target))

Linear SVC 2: 0.60210102173
Linear SVC 3: 0.601885163333
Linear SVC 4: 0.60210102173
Polynomial SVC 3: 0.601165635343
Polynomial SVC 4: 0.600158296158


# Grid Search on Logistic Regression

In [36]:
# log_grid = {'C':[0.1,0.01,0.001], 'penalty':['l1','l2'], 'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag']}
# log_grid = {'C':[1.0,0.1,0.01,0.001], 'penalty':['l1','l2'], 'solver':['liblinear']}
# log_grid = {'C':[9,10.0,11], 'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag']}
log_grid = {'C':[9.96875,10.0,10.03125], 'solver':['lbfgs']}

log_reg = GSCV(LR(max_iter=1000, n_jobs=-1), log_grid)
log_reg.fit(training_data,training_target)

print('C:',log_reg.best_estimator_.C)                    #1.0,1.0,10,10,10,10,10
print('Penalty:',log_reg.best_estimator_.penalty)        #l1,l2,l2,l2,l2,l2,l2
print('Solver:',log_reg.best_estimator_.solver)          #liblinear,lbfgs,lbfgs,lbfgs,lbfgs,lbfgs,lbfgs

C: 10.0
Penalty: l2
Solver: lbfgs


# Logistic Regressor Final

In [42]:
logistic_regressor = LR(C=10,solver='lbfgs',max_iter=400,n_jobs=-1)
logistic_regressor.fit(training_data,training_target)
lr_pred = logistic_regressor.predict(validation_data)
print(acc(lr_pred, validation_target))

0.601597352137


# Grid Search on Multilayer Perceptron

In [62]:
# Here's a good resource I found on the hidden layer sizes
# http://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
start_time = time.time()
mlpc_grid = {'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam']}
mlpc_testing = GSCV(MLPC(hidden_layer_sizes=59), mlpc_grid) #hidden layer sizes = input layer + output layer /2 (116+1 /2)
# mlpc_testing = MLPC(activation='identity', solver='lbfgs', hidden_layer_sizes=59)
mlpc_testing.fit(training_data,training_target)

# print('Activation:',mlpc_testing.best_estimator_.activation)
# print('Solver:',mlpc_testing.best_estimator_.solver)
mlpc_final_pred = mlpc_testing.predict(validation_data)
print(acc(mlpc_final_pred, validation_target))

print("--- Total time: %s seconds ---" % (time.time() - start_time))

0.592531299468
--- Total time: 564.4012639522552 seconds ---


# Linear Discriminant Analysis

In [79]:
start_time = time.time()
lda = LDA(solver='lsqr')
lda.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
lda_pred = lda.predict(validation_data)
print(acc(lda_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 0.29528331756591797 seconds ---
0.598143617787
--- Total time: 0.3037912845611572 seconds ---


# AdaBoost Classifier

In [87]:
start_time = time.time()
abc = ABC(n_estimators=110)
abc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
abc_pred = abc.predict(validation_data)
print(acc(abc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))

--- Training took 6.566806793212891 seconds ---
0.590012951504
--- Total time: 6.754987716674805 seconds ---


# SGD Classifier

In [None]:
start_time = time.time()
sgdc = SGDC()
sgdc.fit(training_data,training_target)
print("--- Training took %s seconds ---" % (time.time() - start_time))
sgdc_pred = sgdc.predict(validation_data)
print(acc(sgdc_pred, validation_target))
print("--- Total time: %s seconds ---" % (time.time() - start_time))