In [72]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [162]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data_ = scaler.transform(train_data)

# black magic
train_data_ = np.concatenate((train_data_, train_data_, train_data_))
train_labels = np.concatenate((train_labels, train_labels, train_labels))

# split data into train and validation
train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels, test_size=0.2, random_state=42)
test_data = scaler.transform(test_data)

# print shapes
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


((2385, 24), (2385,), (597, 24), (597,), (426, 24))

In [163]:
np.bincount(train_labels) + np.bincount(val_labels, minlength=18)

array([ 747, 1464,  327,    9,    9,  123,  210,   15,   21,    6,    3,
          3,    3,    9,   15,   12,    3,    3])

In [164]:
# knn imputer
def knn_imputer(train_data, val_data, test_data, k):
    from sklearn.impute import KNNImputer
    imputer = KNNImputer(n_neighbors=k)
    all_imp = imputer.fit_transform(train_data_)
    train_data_imp = imputer.transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp, all_imp

# PCA
from sklearn.decomposition import PCA

def pca(train_data, val_data, test_data, n):
    pca = PCA(n_components=n)
    pca.fit(train_data)
    train_data_pca = pca.transform(train_data)
    val_data_pca = pca.transform(val_data)
    test_data_pca = pca.transform(test_data)
    return train_data_pca, val_data_pca, test_data_pca

# augment data

def augment_data(train_data, train_labels):
    from imblearn.over_sampling import RandomOverSampler
    oversample = RandomOverSampler()
    train_data_aug, train_labels_aug = oversample.fit_resample(train_data, train_labels)
    return train_data_aug, train_labels_aug

In [165]:
train_data_imp, val_data_imp, test_data_imp, all_imp = knn_imputer(train_data, val_data, test_data, 7)
train_data_imp.shape, val_data_imp.shape, test_data_imp.shape

((2385, 24), (597, 24), (426, 24))

In [166]:
train_data_pca, val_data_pca, test_data_pca = pca(train_data_imp, val_data_imp, test_data_imp, 15)

In [167]:
train_data_aug, train_labels_aug = augment_data(train_data_pca, train_labels)
val_data_aug, val_labels_aug = augment_data(val_data_pca, val_labels)

In [168]:
train_data_aug.shape, train_labels_aug.shape, val_data_aug.shape, val_labels_aug.shape

((21312, 15), (21312,), (3920, 15), (3920,))

In [169]:
# knn classifier

# grid search
# knn = KNeighborsClassifier()
# param_grid = {'n_neighbors': np.arange(5, 30)}
# knn_gscv = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro')
# knn_gscv.fit(train_data_aug, train_labels_aug)
# nn = knn_gscv.best_params_

# find accuracy
knn = KNeighborsClassifier(n_neighbors=10, p=2)
knn.fit(train_data_aug, train_labels_aug)
knn_pred = knn.predict(val_data_aug)
accuracy_score(val_labels_aug, knn_pred), f1_score(val_labels_aug, knn_pred, average='macro')

(0.9839285714285714, 0.9183616583106672)

In [128]:
# # try with random forests

# # grid search
# # rf = RandomForestClassifier()
# # param_grid = {'n_estimators': np.arange(1, 50)}
# # rf_gscv = GridSearchCV(rf, param_grid, cv=5, scoring='f1_macro')
# # rf_gscv.fit(train_data_aug, train_labels_aug)
# # nn = rf_gscv.best_params_

# # est = nn['n_estimators']
# # find accuracy
# rf = RandomForestClassifier(n_estimators=20)
# rf.fit(train_data_aug, train_labels_aug)
# rf_pred = rf.predict(val_data_aug)
# accuracy_score(val_labels_aug, rf_pred), f1_score(val_labels_aug, rf_pred, average='macro')

(1.0, 1.0)

In [113]:
# est

74

In [131]:
# rf_pred_np = rf.predict(test_data_pca)
# rf_pred = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(rf_pred_np)]))
# rf_pred.columns = ['Id', 'Category']
# rf_pred.to_csv('rf_pred.csv', index=False)

In [132]:
knn_pred_np = knn.predict(test_data_pca)
knn_pred = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(knn_pred_np)]))
knn_pred.columns = ['Id', 'Category']
knn_pred.to_csv('knn_pred.csv', index=False)

In [133]:
for q in enumerate(zip(rf_pred_np, knn_pred_np)):
    if q[1][0] != q[1][1]:
        print(q)

(4, (1, 5))
(8, (1, 5))
(14, (1, 2))
(15, (0, 5))
(16, (2, 5))
(23, (2, 5))
(27, (0, 5))
(31, (1, 0))
(38, (0, 2))
(40, (1, 14))
(44, (0, 2))
(50, (0, 5))
(78, (1, 3))
(98, (1, 2))
(100, (0, 6))
(101, (2, 0))
(102, (1, 0))
(106, (1, 5))
(107, (0, 2))
(108, (0, 4))
(109, (1, 3))
(110, (1, 12))
(111, (0, 2))
(113, (2, 8))
(114, (0, 15))
(121, (1, 0))
(147, (1, 15))
(148, (2, 5))
(153, (1, 5))
(170, (0, 5))
(178, (1, 0))
(181, (1, 5))
(187, (0, 2))
(192, (2, 3))
(195, (0, 6))
(217, (1, 2))
(233, (1, 5))
(244, (1, 6))
(251, (0, 15))
(259, (1, 0))
(261, (1, 12))
(268, (0, 2))
(283, (8, 2))
(286, (2, 0))
(290, (0, 5))
(295, (0, 13))
(296, (0, 2))
(304, (2, 5))
(306, (1, 0))
(317, (1, 2))
(323, (0, 2))
(327, (0, 3))
(328, (1, 3))
(333, (1, 0))
(345, (1, 2))
(347, (1, 2))
(356, (0, 5))
(391, (0, 13))
(395, (1, 0))
(398, (1, 2))
(417, (1, 5))
(420, (0, 2))
(425, (1, 0))
