In [52]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans


In [53]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data_ = scaler.transform(train_data)
train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels, test_size=0.2, random_state=42)
test_data = scaler.transform(test_data)

# print shapes
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


((795, 24), (795,), (199, 24), (199,), (426, 24))

In [70]:
# knn imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
all_imp = imputer.fit_transform(train_data_)
train_data_imp = imputer.transform(train_data)
val_data_imp = imputer.transform(val_data)
test_data_imp = imputer.transform(test_data)

In [55]:
# pca
from sklearn.decomposition import PCA
pca = PCA(n_components=14)

train_data_pca = pca.fit_transform(train_data_imp)
val_data_pca = pca.fit_transform(val_data_imp)

In [56]:
# knn classifier

# grid search
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(train_data_imp, train_labels)
nn = knn_gscv.best_params_

# find accuracy
knn = KNeighborsClassifier(n_neighbors=nn['n_neighbors'])
knn.fit(train_data_imp, train_labels)
knn_pred = knn.predict(val_data_imp)
accuracy_score(val_labels, knn_pred)



0.7889447236180904

In [57]:
# knn on pca data

# grid search
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(train_data_pca, train_labels)
nn = knn_gscv.best_params_

# find accuracy
knn = KNeighborsClassifier(n_neighbors=nn['n_neighbors'])
knn.fit(train_data_pca, train_labels)
knn_pred = knn.predict(val_data_pca)
accuracy_score(val_labels, knn_pred)



0.5879396984924623

In [58]:
# softmax classifier

logreg = LogisticRegression(max_iter=10000)
logreg.fit(train_data_imp, train_labels)
logreg_pred = logreg.predict(val_data_imp)
accuracy_score(val_labels, logreg_pred)

0.7989949748743719

In [59]:
logreg.fit(train_data_pca, train_labels)
logreg_pred = logreg.predict(val_data_pca)
accuracy_score(val_labels, logreg_pred)

0.5728643216080402

Clearly, PCA is not useful.

In [69]:
# try with random forests

# grid search
rf = RandomForestClassifier()
param_grid = {'n_estimators': np.arange(1, 25)}
rf_gscv = GridSearchCV(rf, param_grid, cv=5)
rf_gscv.fit(train_data_imp, train_labels)
nn = rf_gscv.best_params_

est = nn['n_estimators']
# find accuracy
rf = RandomForestClassifier(n_estimators=nn['n_estimators'])
rf.fit(train_data_imp, train_labels)
rf_pred = rf.predict(val_data_imp)
accuracy_score(val_labels, rf_pred)



0.8793969849246231

In [61]:
# SVC

# grid search
svc = SVC()
param_grid = {'C': np.arange(1, 25)}
svc_gscv = GridSearchCV(svc, param_grid, cv=5)
svc_gscv.fit(train_data_imp, train_labels)
nn = svc_gscv.best_params_

# find accuracy
svc = SVC(C=nn['C'])
svc.fit(train_data_imp, train_labels)
svc_pred = svc.predict(val_data_imp)
accuracy_score(val_labels, svc_pred)



0.8291457286432161

In [62]:
# decision tree

# grid search
dt = DecisionTreeClassifier()
param_grid = {'max_depth': np.arange(1, 25)}
dt_gscv = GridSearchCV(dt, param_grid, cv=5)
dt_gscv.fit(train_data_imp, train_labels)
nn = dt_gscv.best_params_

# find accuracy
dt = DecisionTreeClassifier(max_depth=nn['max_depth'])
dt.fit(train_data_imp, train_labels)
dt_pred = dt.predict(val_data_imp)
accuracy_score(val_labels, dt_pred)



0.8442211055276382

In [63]:
# gradient boosting

# grid search
gb = GradientBoostingClassifier()
param_grid = {'n_estimators': np.arange(1, 25)} 
gb_gscv = GridSearchCV(gb, param_grid, cv=5)
gb_gscv.fit(train_data_imp, train_labels)
nn = gb_gscv.best_params_

# find accuracy
gb = GradientBoostingClassifier(n_estimators=nn['n_estimators'])
gb.fit(train_data_imp, train_labels)
gb_pred = gb.predict(val_data_imp)
accuracy_score(val_labels, gb_pred)



0.8442211055276382

In [66]:
# neural network

# grid search
nn = MLPClassifier(max_iter=10000)
param_grid = {'hidden_layer_sizes': np.arange(1, 25)}
nn_gscv = GridSearchCV(nn, param_grid, cv=5)
nn_gscv.fit(train_data_imp, train_labels)
nn = nn_gscv.best_params_

# find accuracy
nn = MLPClassifier(hidden_layer_sizes=nn['hidden_layer_sizes'], max_iter=10000)
nn.fit(train_data_imp, train_labels)
nn_pred = nn.predict(val_data_imp)
accuracy_score(val_labels, nn_pred)



0.8291457286432161

In [74]:
# final: random forests

rf = RandomForestClassifier(n_estimators=est)
all_labels = np.concatenate((train_labels, val_labels))
rf.fit(all_imp, all_labels)
rf_pred = rf.predict(test_data_imp)

# write to csv
rf_pred = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(rf_pred)]))
rf_pred.columns = ['Id', 'Category']
rf_pred.to_csv('rf_pred.csv', index=False)
