In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.model_selection import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import csv

**Clean training data set**

In [2]:
train = pd.read_csv('TrainOnMe.csv')
test = pd.read_csv('EvaluateOnMe.csv')

#drop id column
train.pop('id')
test.pop(test.columns[0])

# remove None values from training and test set.
train = train.dropna()
test = test.dropna()

# remove rows that have ?
train = train.drop(train[train.values  == '?'].index)
test = test.drop(test[test.values  == '?'].index)

*Data encoded with OneHotEncoder*

In [3]:
train_onehot = pd.get_dummies(train, columns = ['x5', 'x6'])
test_onehot = pd.get_dummies(test, columns = ['x5', 'x6'])

*Data encoded with label LabelEncoder*

In [4]:
le = preprocessing.LabelEncoder()
train_LabelEncode = train.copy()

le.fit(train_LabelEncode['x5'])
train_LabelEncode['x5'] = le.transform(train['x5'])

le.fit(train_LabelEncode['x6'])
train_LabelEncode['x6'] = le.transform(train_LabelEncode['x6'])

*Choose between the two ways of encoding*

In [5]:
#train = train_LabelEncode
train = train_onehot
y = train.pop('y')

test = test_onehot



**Random forest classifier**

In [6]:
classifier_random_forest = RandomForestClassifier(n_estimators= 150)
scores = cross_val_score(classifier_random_forest, train, y, cv=10)
print("random forest accuracy = ", scores.mean())

random forest accuracy =  0.8513333333333334


**Gradient Boosting Classifier**

In [7]:
classifier_gradient = GradientBoostingClassifier(n_estimators= 1000)
scores = cross_val_score(classifier_gradient, train, y, cv=10)
print("gradient boosting accuracy = ", scores.mean())

gradient boosting accuracy =  0.8733535353535353


In [8]:
# classifier_random_forest = GradientBoostingClassifier()
# parameters = [{'n_estimators' : [500, 100, 50], 'max_depth' : [10, 20, 30 , None]}]
# rand_search = GridSearchCV(estimator = classifier_random_forest,
#                            param_grid = parameters,
#                            scoring = 'accuracy',
#                            cv = 2,
#                            n_jobs = -1)
# rand_search = grid_search.fit(train, y)
# accuracy = rand_search.best_score_
# rand_search.best_params_

**Neural Network**

In [9]:
X_train_NN = train.copy()
# normalization:

scale = StandardScaler(with_mean=0, with_std=1)
scale.fit(X_train_NN, y)
X_train_NN = scale.transform(X_train_NN)
X_train_NN = np.asarray(train).astype('float64')

In [10]:

def create_model():
  model = Sequential()
  model.add(Dense(17, input_dim=17, activation='relu'))
  model.add(Dense(50, activation='sigmoid'))
  model.add(Dropout(0.2))
  model.add(Dense(3, activation='softmax'))
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model
classifier_NN = KerasClassifier(build_fn=create_model, 
                                 epochs=200, 
                                 batch_size=50, 
                                 verbose=0)
scores = cross_val_score(classifier_NN, X_train_NN, y, cv=10)
print("NN accuracy = ", scores.mean())

NN accuracy =  0.8402828276157379


KNN

In [11]:
# Create the knn model.
classifier_KNN = KNeighborsClassifier(n_neighbors = 15)
scores = cross_val_score(classifier_KNN, train, y, cv=10)
print("KNN accuracy = ", scores.mean())

KNN accuracy =  0.8272121212121212




**Vote of different models**

In [None]:
#train all models
classifier_gradient.fit(train, y)
y_gradient = classifier_gradient.predict(test)

classifier_random_forest.fit(train, y)
y_random_forest = classifier_random_forest.predict(test)

classifier_NN.fit(train.astype('float64'), y)
y_NN = classifier_NN.predict(test.astype('float64'))

classifier_KNN.fit(train, y)
y_KNN = classifier_KNN.predict(test)

#vote
y_pred = []
for i in range(len(y)):
  ys = [y_gradient[i], y_random_forest[i], y_NN[i], y_KNN[i]]
  y_pred.append(max(set(ys), key = ys.count))

**Final model : Gradient boosting classifier**

In [13]:
classifier_gradient = GradientBoostingClassifier(n_estimators= 1000)
classifier_gradient.fit(train, y)
y_classes = classifier_gradient.predict(test)

**List to CSV**

In [14]:
pd.DataFrame(y_classes).to_csv("105990.txt", header = False, index = False)