# In [10]:
import csv
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import numpy as np
from sklearn import preprocessing
#I installed the xgboost module with conda install -c conda-forge xgboost

# ---------------------------------------------------------------------------
# Load the data. The first CSV column is used as the row index in both files.
# ---------------------------------------------------------------------------
trainSet = pd.read_csv('TrainOnMe.csv', index_col=0)
evalSet = pd.read_csv('EvaluateOnMe.csv', index_col=0)

# ---------------------------------------------------------------------------
# Clean the training set.
# ---------------------------------------------------------------------------
# Manual patch of one cell (row position 208, column position 12).
trainSet.iat[208, 12] = "False"

# Drop rows with missing values, then rows carrying the misspelled category
# values "Bayesian Interference" (x6) and "Flase" (x12).
trainSet = trainSet.dropna()
keep = (trainSet["x6"] != "Bayesian Interference") & (trainSet["x12"] != "Flase")
trainSet = trainSet[keep]

# ---------------------------------------------------------------------------
# Label encoders. Each one is fit on an explicit list of classes rather than
# on the data itself.
# ---------------------------------------------------------------------------
le1 = preprocessing.LabelEncoder().fit(["GMMs and Accordions", "Bayesian Inference"])  # x6 (train)
le2 = preprocessing.LabelEncoder().fit(["Shoogee", "Atsuto", "Bob", "Jorg"])           # target y
le3 = preprocessing.LabelEncoder().fit(["True", "False"])                              # x12 (train, strings)
e_le1 = preprocessing.LabelEncoder().fit(["GMMs and Accordions", "Bayesian Inference"])  # x6 (eval)
# The evaluation encoder for x12 is fit on booleans, not strings.
# NOTE(review): presumably x12 parses as bool in the evaluation file - confirm.
e_le2 = preprocessing.LabelEncoder().fit([True, False])

# ---------------------------------------------------------------------------
# Training matrices: column 0 is the target, columns 1..12 are the features.
# ---------------------------------------------------------------------------
y_train = le2.transform(trainSet.iloc[:, 0].values)
X_train = trainSet.iloc[:, 1:13].values

# Replace the two categorical feature columns with their integer codes.
# Feature positions inside X_train are shifted by one relative to trainSet
# because the target column has been sliced off.
x6 = le1.transform(trainSet.iloc[:, 6].values)
x12 = le3.transform(trainSet.iloc[:, 12].values)
X_train[:, 5] = x6
X_train[:, 11] = x12

# ---------------------------------------------------------------------------
# Evaluation matrix: same two columns are encoded (the evaluation frame has
# no target column, so the positions are 5 and 11 directly).
# ---------------------------------------------------------------------------
eval_train = evalSet.values
e_x6 = e_le1.transform(evalSet.iloc[:, 5].values)
e_x12 = e_le2.transform(evalSet.iloc[:, 11].values)
eval_train[:, 5] = e_x6
eval_train[:, 11] = e_x12
 
# Baseline classifier. n_estimators=1000 is only the upper bound handed to the
# cross-validation run below; it is replaced once that run has finished.
estimator = XGBClassifier(
    objective='multi:softmax',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=4,
    seed=29,
    verbosity=0,
)

# Cross-validate with the native xgboost API to choose n_estimators.
# The native API needs the class count spelled out; le2 is fit on four labels.
xgb_params = estimator.get_xgb_params()
xgb_params['num_class'] = 4

xgb_train = xgb.DMatrix(X_train, label=y_train)

# 5-fold CV on multiclass error, stopping early after 50 rounds.
cv = xgb.cv(
    xgb_params,
    xgb_train,
    num_boost_round=estimator.get_params()['n_estimators'],
    nfold=5,
    metrics='merror',
    early_stopping_rounds=50,
)

# The number of rows in the returned CV history is used as the round count.
number_of_est = len(cv.index)
estimator.set_params(n_estimators=number_of_est)



def _grid_search(base_estimator, param_grid):
    """Fit and return a 5-fold, accuracy-scored GridSearchCV over ``param_grid``.

    The search is fitted on the module-level ``X_train`` / ``y_train``.
    ``base_estimator`` itself is not modified; callers copy the winning
    values onto it via ``search.best_params_``.
    """
    search = GridSearchCV(
        estimator=base_estimator,
        scoring='accuracy',
        cv=5,
        n_jobs=1,
        param_grid=param_grid,
    )
    search.fit(X_train, y_train)
    return search


# Stage 1: coarse search over tree complexity.
params1 = {'min_child_weight': range(1, 6, 2), 'max_depth': range(3, 10, 2)}
gd_search1 = _grid_search(estimator, params1)
# Recorded result of gd_search1.best_params_: min_child_weight=3, max_depth=5
estimator.set_params(min_child_weight=gd_search1.best_params_['min_child_weight'],
                     max_depth=gd_search1.best_params_['max_depth'])

# Stage 2: finer search around the recorded stage-1 optimum.
params2 = {'min_child_weight': [2, 3, 4], 'max_depth': [4, 5, 6]}
gd_search2 = _grid_search(estimator, params2)
# Recorded result of gd_search2.best_params_: min_child_weight=2, max_depth=6
estimator.set_params(min_child_weight=gd_search2.best_params_['min_child_weight'],
                     max_depth=gd_search2.best_params_['max_depth'])

# Stage 3: gamma in [2.00, 2.05] in steps of 0.01.
params3 = {'gamma': [i / 100.0 + 2.0 for i in range(0, 6)]}
gd_search3 = _grid_search(estimator, params3)
estimator.set_params(gamma=gd_search3.best_params_['gamma'])

# Stage 4: row and column subsampling, each in {0.6, 0.7, 0.8, 0.9}.
params4 = {'subsample': [i / 10.0 for i in range(6, 10)],
           'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
gd_search4 = _grid_search(estimator, params4)
# Recorded optimum: 'colsample_bytree': 0.8, 'subsample': 0.8
# Apply the stage-4 result like every earlier stage does; previously the
# search was fitted and its result was never copied onto the estimator.
estimator.set_params(subsample=gd_search4.best_params_['subsample'],
                     colsample_bytree=gd_search4.best_params_['colsample_bytree'])
#print(gd_search4.best_score_)

# Final, manually chosen boosting schedule. This overrides the round count
# selected earlier by xgb.cv.
# Recorded CV accuracy: 0.7368661489264504 with n_estimators=300, learning_rate=0.09
estimator.set_params(n_estimators=300,
                     learning_rate=0.09)

# Shuffled K-fold accuracy of the final configuration (fixed seed so the
# score is reproducible).
res = cross_val_score(estimator, X=X_train, y=y_train, scoring="accuracy",
                      cv=KFold(shuffle=True, random_state=23333))

#I use le2.inverse_transform to revert labels in y



#print("### estimator parameters and cross val score###")
#print(estimator.get_params)
#print(res.mean())
#print("##########")

# Train the tuned classifier on the full cleaned training set.
estimator.fit(X_train, y_train)

# Predict integer class codes for the evaluation rows, then map them back to
# the original label strings with the target encoder.
final_result = estimator.predict(eval_train)
final_result = le2.inverse_transform(final_result)

# Write one label per line with no trailing newline after the last label.
# This produces the same file as the previous savetxt -> read -> strip ->
# rewrite round trip, in a single write, and the context manager guarantees
# the handle is closed even if the write fails.
with open("labels.txt", "w", encoding="utf-8") as myfile:
    myfile.write("\n".join(str(label) for label in final_result))




