In [None]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Read the labelled training table and the unlabelled test table.
TARGET_COLUMN = 'Target Variable (Discrete)'

train = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

# Separate the feature columns from the label column.
features_raw = train.drop(columns=[TARGET_COLUMN])
labels_all = train[TARGET_COLUMN]

# Split BEFORE fitting any preprocessing so that no statistic of the
# validation rows leaks into the scaler (fitting the scaler on every labelled
# row first makes the validation scores optimistic).
# NOTE: for the same reason, never tile/duplicate the rows before this split
# (the disabled "black magic" experiment did so): copies of one row would end
# up in both the training and the validation split.
train_raw, val_raw, train_labels, val_labels = train_test_split(
    features_raw, labels_all, test_size=0.2, random_state=42
)

# Standardize: fit on the training split only, then apply everywhere.
scaler = StandardScaler()
scaler.fit(train_raw)

train_data = scaler.transform(train_raw)
val_data = scaler.transform(val_raw)
test_data = scaler.transform(test_raw)

# Scaled copy of every labelled row (training + validation), kept under its
# historical name for backward compatibility with code that refers to it.
train_data_ = scaler.transform(features_raw)

# Show the shapes of all splits.
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


In [None]:
# Per-class sample counts over the training and validation splits combined.
# Counting the concatenated labels in one call avoids the shape mismatch that
# occurs when the two per-split histograms have different lengths (e.g. when
# the highest label is missing from the training split).
# minlength=18 keeps the output at least 18 entries long, as before
# (presumably the number of classes - TODO confirm).
np.bincount(np.concatenate((train_labels, val_labels)), minlength=18)

In [None]:
# knn imputer
def knn_imputer(train_data, val_data, test_data, k, fit_data=None):
    """Impute the three splits with scikit-learn's ``KNNImputer``.

    Parameters
    ----------
    train_data, val_data, test_data : array-like
        The splits to impute; each is passed to ``imputer.transform``.
    k : int
        Forwarded to ``KNNImputer`` as ``n_neighbors``.
    fit_data : array-like, optional
        Data the imputer is fitted on. Defaults to ``train_data`` so the
        function depends only on its arguments and never sees validation
        rows unless the caller passes them in explicitly.

    Returns
    -------
    tuple
        ``(train_data_imp, val_data_imp, test_data_imp, all_imp)`` where
        ``all_imp`` is the imputed version of the data the imputer was
        fitted on.
    """
    from sklearn.impute import KNNImputer

    # Fit on the caller-supplied data instead of a notebook-level global.
    if fit_data is None:
        fit_data = train_data

    imputer = KNNImputer(n_neighbors=k)
    all_imp = imputer.fit_transform(fit_data)
    train_data_imp = imputer.transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp, all_imp

def median_imputer(train_data, val_data, test_data, fit_data=None):
    """Impute the three splits with ``SimpleImputer(strategy='median')``.

    Parameters
    ----------
    train_data, val_data, test_data : array-like
        The splits to impute; each is passed to ``imputer.transform``.
    fit_data : array-like, optional
        Data the imputer is fitted on. Defaults to ``train_data`` so the
        function depends only on its arguments and never sees validation
        rows unless the caller passes them in explicitly.

    Returns
    -------
    tuple
        ``(train_data_imp, val_data_imp, test_data_imp, all_imp)`` where
        ``all_imp`` is the imputed version of the data the imputer was
        fitted on.
    """
    from sklearn.impute import SimpleImputer

    # Fit on the caller-supplied data instead of a notebook-level global.
    if fit_data is None:
        fit_data = train_data

    imputer = SimpleImputer(strategy='median')
    all_imp = imputer.fit_transform(fit_data)
    train_data_imp = imputer.transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp, all_imp

# PCA
from sklearn.decomposition import PCA

def pca(train_data, val_data, test_data, n):
    """Project all three splits onto ``n`` principal components.

    The ``PCA`` model is fitted on ``train_data`` only and then applied to
    each split in turn.

    Returns a 3-tuple ``(train_data_pca, val_data_pca, test_data_pca)``.
    """
    reducer = PCA(n_components=n).fit(train_data)
    splits = (train_data, val_data, test_data)
    return tuple(reducer.transform(split) for split in splits)

# augment data

def augment_data(train_data, train_labels, random_state=42):
    """Oversample a data set with imbalanced-learn's ``RandomOverSampler``.

    Parameters
    ----------
    train_data : array-like
        Feature matrix passed to ``fit_resample``.
    train_labels : array-like
        Labels passed to ``fit_resample``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomOverSampler`` so that repeated runs draw
        the same samples. Pass ``None`` for unseeded sampling.

    Returns
    -------
    tuple
        ``(train_data_aug, train_labels_aug)`` as returned by
        ``fit_resample``.
    """
    from imblearn.over_sampling import RandomOverSampler

    oversample = RandomOverSampler(random_state=random_state)
    train_data_aug, train_labels_aug = oversample.fit_resample(train_data, train_labels)
    return train_data_aug, train_labels_aug

In [None]:
# Oversample the TRAINING split only. The validation split is deliberately
# left untouched: duplicating validation rows would make the accuracy and
# macro-F1 computed on it describe an artificial class distribution rather
# than the data the model will actually face.
train_data, train_labels = augment_data(train_data, train_labels)

In [None]:
# Shapes of every split after oversampling, in the order
# (train X, train y, val X, val y, test X).
tuple(part.shape for part in (train_data, train_labels, val_data, val_labels, test_data))

In [None]:
# Impute the three splits. Median imputation is the active choice; the KNN
# variant (k=7) is left disabled for comparison:
# train_data_imp, val_data_imp, test_data_imp, all_imp = knn_imputer(train_data, val_data, test_data, 7)
(train_data_imp, val_data_imp, test_data_imp, all_imp) = median_imputer(
    train_data, val_data, test_data
)
# Show the shapes of the imputed splits.
tuple(part.shape for part in (train_data_imp, val_data_imp, test_data_imp))

In [None]:
# Reduce the imputed splits to 15 principal components.
train_data_pca, val_data_pca, test_data_pca = pca(
    train_data_imp, val_data_imp, test_data_imp, n=15
)

In [None]:
# Oversampling after PCA is disabled:
# train_data_aug, train_labels_aug = augment_data(train_data_pca, train_labels)
# val_data_aug, val_labels_aug = augment_data(val_data_pca, val_labels)

# The *_aug names therefore simply alias the PCA outputs and the labels.
train_data_aug, val_data_aug = train_data_pca, val_data_pca
train_labels_aug, val_labels_aug = train_labels, val_labels

In [None]:
# Shapes of the final model inputs, in the order
# (train X, train y, val X, val y).
tuple(part.shape for part in (train_data_aug, train_labels_aug, val_data_aug, val_labels_aug))

In [None]:
# # Gaussian mixture model

# from sklearn.mixture import GaussianMixture

# def gmm(train_data, val_data, test_data, n):
#     gmm = GaussianMixture(n_components=n)
#     gmm.fit(train_data)
#     train_data_gmm = gmm.predict_proba(train_data)
#     val_data_gmm = gmm.predict_proba(val_data)
#     test_data_gmm = gmm.predict_proba(test_data)
#     return train_data_gmm, val_data_gmm, test_data_gmm

# # grid search

# from sklearn.model_selection import GridSearchCV

# def grid_search(model, params, train_data, train_labels):
#     clf = GridSearchCV(model, params, scoring='f1_macro', n_jobs=-1, cv=5)
#     clf.fit(train_data, train_labels)
#     return clf.best_estimator_

# # do grid search of gmm

# # gmm_params = {'n_components': list(range(10, 25))}
# # gmm_best = grid_search(GaussianMixture(), gmm_params, train_data_pca, train_labels)
# # gmm_best

# u, v, w = gmm(train_data_pca, val_data_pca, test_data_pca, 15)
# u.shape, v.shape, w.shape

In [None]:
# # cross validation

# from sklearn.model_selection import cross_val_predict, cross_validate
# from sklearn.metrics import make_scorer

# knn = KNeighborsClassifier(n_neighbors=10)
# # scores = cross_validate(knn, np.concatenate((train_data_aug, val_data_aug)), np.concatenate((train_labels_aug, val_labels_aug)), cv=5, scoring=make_scorer(f1_score, average='weighted'))
# # scores = cross_validate(knn, train_data_aug, train_labels_aug, cv=5, scoring=make_scorer(f1_score, average='macro'))
# knn_pred = cross_val_predict(knn, train_data_aug, train_labels_aug, cv=5)
# # knn_preds = knn.predict(val_data_aug)
# # scores
# accuracy_score(val_labels_aug, knn_pred), f1_score(val_labels_aug, knn_pred, average='macro')

In [None]:
# knn classifier

# grid search
# knn = KNeighborsClassifier()
# param_grid = {'n_neighbors': np.arange(5, 30)}
# knn_gscv = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro')
# knn_gscv.fit(train_data_aug, train_labels_aug)
# nn = knn_gscv.best_params_

# Fit a 10-neighbour KNN classifier (Minkowski power p=2) on the training
# data and score it on the validation data.
knn = KNeighborsClassifier(n_neighbors=10, p=2).fit(train_data_aug, train_labels_aug)
knn_pred = knn.predict(val_data_aug)

# Display (accuracy, macro-averaged F1) on the validation split.
(
    accuracy_score(val_labels_aug, knn_pred),
    f1_score(val_labels_aug, knn_pred, average='macro'),
)

In [None]:
# try with random forests

# grid search
# rf = RandomForestClassifier()
# param_grid = {'n_estimators': np.arange(1, 50)}
# rf_gscv = GridSearchCV(rf, param_grid, cv=5, scoring='f1_macro')
# rf_gscv.fit(train_data_aug, train_labels_aug)
# nn = rf_gscv.best_params_

# est = nn['n_estimators']
# Fit a 20-tree random forest on the training data and score it on the
# validation data. The forest is seeded (same seed as the train/validation
# split) so that re-running the notebook reproduces the same scores.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(train_data_aug, train_labels_aug)
rf_pred = rf.predict(val_data_aug)

# Display (accuracy, macro-averaged F1) on the validation split.
accuracy_score(val_labels_aug, rf_pred), f1_score(val_labels_aug, rf_pred, average='macro')

In [None]:
# est

In [None]:
# rf_pred_np = rf.predict(test_data_pca)
# rf_pred = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(rf_pred_np)]))
# rf_pred.columns = ['Id', 'Category']
# rf_pred.to_csv('rf_pred.csv', index=False)

In [None]:
# Predict the test set with the fitted KNN model and write the submission.
knn_pred_np = knn.predict(test_data_pca)

# Build the frame column-wise: 'Id' is a 1-based row number and 'Category'
# the predicted label. Each column keeps its own dtype instead of both being
# coerced to one common dtype through an intermediate 2-D array.
# NOTE: this rebinds `knn_pred` (the validation predictions of the KNN
# evaluation cell) to the submission DataFrame.
knn_pred = pd.DataFrame({
    'Id': np.arange(1, len(knn_pred_np) + 1),
    'Category': knn_pred_np,
})
knn_pred.to_csv('knn_pred.csv', index=False)