In [15]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [16]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data_ = scaler.transform(train_data)

# black magic
train_data_ = np.concatenate((train_data_, train_data_))
train_labels = np.concatenate((train_labels, train_labels))

# split data into train and validation
# train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels, test_size=0.2, random_state=42)

test_data = scaler.transform(test_data)

# print shapes
# train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


In [17]:
# knn imputer
def knn_imputer(train_data, val_data, test_data, k):
    from sklearn.impute import KNNImputer
    imputer = KNNImputer(n_neighbors=k)
    all_imp = imputer.fit_transform(train_data_)
    train_data_imp = imputer.transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp, all_imp

def median_imputer(train_data, test_data):
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='median')
    train_data_imp = imputer.fit_transform(train_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, test_data_imp

# PCA
from sklearn.decomposition import PCA

def pca(train_data, val_data, test_data, n):
    pca = PCA(n_components=n)
    pca.fit(train_data)
    train_data_pca = pca.transform(train_data)
    val_data_pca = pca.transform(val_data)
    test_data_pca = pca.transform(test_data)
    return train_data_pca, val_data_pca, test_data_pca

# augment data

def augment_data(train_data, train_labels):
    from imblearn.over_sampling import RandomOverSampler
    oversample = RandomOverSampler()
    train_data_aug, train_labels_aug = oversample.fit_resample(train_data, train_labels)
    return train_data_aug, train_labels_aug

In [18]:
train_data_, test_data = median_imputer(train_data_, test_data)
train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels, test_size=0.2, random_state=42)

train_data_aug, train_labels_aug = augment_data(train_data, train_labels)
# train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape
# train_data_imp.shape, val_data_imp.shape, test_data_imp.shape

In [28]:
train_data_pca, val_data_pca, test_data_pca = pca(train_data_aug, val_data, test_data, 15)

In [39]:
knn = KNeighborsClassifier(n_neighbors=7, p=3)
knn.fit(train_data_pca, train_labels_aug)
knn_pred = knn.predict(val_data_pca)
accuracy_score(val_labels, knn_pred), f1_score(val_labels, knn_pred, average='macro')

(0.9020100502512562, 0.835699821039159)

In [41]:
y_pred = knn.predict(test_data_pca)
np.bincount(y_pred, minlength=18)

array([ 86, 208,  54,   7,   3,  27,  30,   0,   2,   0,   0,   0,   2,
         2,   2,   3,   0,   0])

In [None]:
y_pred = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(y_pred)]))
y_pred.columns = ['Id', 'Target Variable (Discrete)']
y_pred.to_csv('knn_7_p3.csv', index=False)