In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200) #to see all columns in dataframe


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
x_train = pd.read_csv('train.csv', header=None)
y_train = pd.read_csv('trainLabels.csv', header=None)
x_test = pd.read_csv('test.csv', header=None)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_train = y_train.ravel()
print('training_x Shape:',x_train.shape,',training_y Shape:',y_train.shape, ',testing_x Shape:',x_test.shape)

training_x Shape: (1000, 40) ,training_y Shape: (1000,) ,testing_x Shape: (9000, 40)


In [4]:
x_all = np.r_[x_train,x_test]
print('x_all shape :',x_all.shape)

x_all shape : (10000, 40)


In [6]:
lowest_bic = np.infty
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
	for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
		gmm = GaussianMixture(n_components=n_components,covariance_type=cv_type)
		gmm.fit(x_all)
		if gmm.aic(x_all) < lowest_bic:
			lowest_bic = gmm.aic(x_all)
			best_gmm = gmm

In [7]:
best_gmm.fit(x_all)
x_train = best_gmm.predict_proba(x_train)
x_test = best_gmm.predict_proba(x_test)

In [10]:
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [11]:
# GRID SEARCH for BEST TUNING PARAMETERS FOR KNN
grid_search_knn = GridSearchCV(knn,param_grid=dict( ),cv=10,scoring='accuracy').fit(x_train,y_train)
print('best estimator KNN:',grid_search_knn.best_estimator_,'Best Score', grid_search_knn.best_estimator_.score(x_train,y_train))
knn_best = grid_search_knn.best_estimator_

# GRID SEARCH for BEST TUNING PARAMETERS FOR RandomForest
grid_search_rf = GridSearchCV(rf, param_grid=dict( ), verbose=3,scoring='accuracy',cv=10).fit(x_train,y_train)
print('best estimator RandomForest:',grid_search_rf.best_estimator_,'Best Score', grid_search_rf.best_estimator_.score(x_train,y_train))
rf_best = grid_search_rf.best_estimator_

# GRID SEARCH for BEST TUNING PARAMETERS FOR XGBoost
grid_search_xgb = GridSearchCV(xgb, param_grid=dict( ), verbose=3,scoring='accuracy',cv=10).fit(x_train,y_train)
print('best estimator XGBClassifier:',grid_search_xgb.best_estimator_,'Best Score', grid_search_xgb.best_estimator_.score(x_train,y_train))
xgb_best = grid_search_xgb.best_estimator_

best estimator KNN: KNeighborsClassifier() Best Score 0.996
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END .................................., score=1.000 total time=   0.0s
[CV 2/10] END .................................., score=1.000 total time=   0.0s
[CV 3/10] END .................................., score=1.000 total time=   0.0s
[CV 4/10] END .................................., score=0.990 total time=   0.0s
[CV 5/10] END .................................., score=1.000 total time=   0.0s
[CV 6/10] END .................................., score=1.000 total time=   0.0s
[CV 7/10] END .................................., score=0.990 total time=   0.0s
[CV 8/10] END .................................., score=1.000 total time=   0.0s
[CV 9/10] END .................................., score=0.980 total time=   0.0s
[CV 10/10] END ................................., score=1.000 total time=   0.0s
best estimator RandomForest: RandomForestClassifier() Best Score 0.99

In [12]:
knn_best.fit(x_train,y_train)
print(knn_best.predict(x_test)[0:10])
rf_best.fit(x_train,y_train)
print(rf_best.predict(x_test)[0:10])
xgb_best.fit(x_train,y_train)
print(xgb_best.predict(x_test)[:10])

# SCORING THE MODELS
print('Score for KNN :',cross_val_score(knn_best,x_train,y_train,cv=10,scoring='accuracy').mean())
print('Score for Random Forest :',cross_val_score(rf_best,x_train,y_train,cv=10,scoring='accuracy').mean())
print('Score for XGBoost :',cross_val_score(xgb_best,x_train,y_train,cv=10,scoring='accuracy').mean())

[1 0 1 0 0 0 0 1 0 0]
[1 0 1 0 0 0 0 1 0 0]
[1 0 1 0 0 0 0 1 0 0]
Score for KNN : 0.9960000000000001
Score for Random Forest : 0.9960000000000001
Score for XGBoost : 0.9960000000000001
