In [1]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [2]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels_ = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
train_data_ = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# set numpy random seed
np.random.seed(42)

# black magic
# train_data_ = np.concatenate((train_data_, train_data_, train_data_))
# train_labels = np.concatenate((train_labels, train_labels, train_labels))


# impute missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data_)
train_data_ = imputer.transform(train_data_)
test_data = imputer.transform(test_data)
    
# split data into train and validation
train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels_, test_size=0.2)
# train_data, train_labels = train_data_, train_labels_


# # print shapes
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


((795, 24), (795,), (199, 24), (199,), (426, 24))

In [3]:
np.bincount(train_labels) + np.bincount(val_labels, minlength=18)

array([249, 488, 109,   3,   3,  41,  70,   5,   7,   2,   1,   1,   1,
         3,   5,   4,   1,   1])

In [4]:
# knn imputer
def knn_imputer(train_data, val_data, test_data, k):
    from sklearn.impute import KNNImputer
    imputer = KNNImputer(n_neighbors=k)
    all_imp = imputer.fit_transform(train_data_)
    train_data_imp = imputer.transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp, all_imp

# PCA
from sklearn.decomposition import PCA

def pca(train_data, val_data, test_data, n):
    pca = PCA(n_components=n)
    train_data_pca = pca.fit_transform(train_data)
    val_data_pca = pca.transform(val_data)
    test_data_pca = pca.transform(test_data)
    return train_data_pca, val_data_pca, test_data_pca

# augment data

def augment_data(train_data, train_labels):
    from imblearn.over_sampling import RandomOverSampler
    oversample = RandomOverSampler()
    train_data_aug, train_labels_aug = oversample.fit_resample(train_data, train_labels)
    return train_data_aug, train_labels_aug

In [5]:
train_data, train_labels = augment_data(train_data, train_labels)
# val_data, val_labels = augment_data(val_data, val_labels)

# train_data, val_data, test_data = pca(train_data, val_data, test_data, 15)

In [6]:
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape

((6749, 24), (6749,), (199, 24), (199,))

In [7]:
# deep copy labels_tmp
labels_tmp = train_labels.copy()

keeps = [0, 1, 2, 5, 6, 9, 10, 11, 12, 14, 16, 17]
mask = np.isin(labels_tmp, keeps)

# if label not in keeps, set to -1
labels_tmp[np.logical_not(mask)] = -1

# train knn
# knn_keeps = KNeighborsClassifier(n_neighbors=5)
# knn_keeps.fit(train_data, labels_tmp)

# train random forest

param_grid = {
    'n_estimators': [80, 90, 100, 110, 120],
    'max_depth': [9, 10, 11, 12, 13]
}

rf_keeps = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_keeps, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1_macro')
grid_search.fit(train_data, labels_tmp)
print(grid_search.best_params_)

rf_keeps = grid_search.best_estimator_
rf_keeps.fit(train_data, labels_tmp)

# predict
# preds_knn = knn_keeps.predict(val_data)
preds_rf = rf_keeps.predict(val_data)

val_labels_tmp = val_labels.copy()
mask = np.isin(val_labels_tmp, keeps)
val_labels_tmp[np.logical_not(mask)] = -1

# accuracy
# acc_knn = accuracy_score(val_labels_tmp, preds_knn)
# f1_knn = f1_score(val_labels_tmp, preds_knn, average='macro')
# print(f'KNN accuracy: {acc_knn}')
# print(f'KNN f1: {f1_knn}')
acc_rf = accuracy_score(val_labels_tmp, preds_rf)
f1_rf = f1_score(val_labels_tmp, preds_rf, average='macro')
print(f'RF accuracy: {acc_rf}')
print(f'RF f1: {f1_rf}')

{'max_depth': 15, 'n_estimators': 90}
RF accuracy: 0.8743718592964824
RF f1: 0.6925632691225412


In [8]:
labels = train_labels.copy()

# if label not in keeps, ignore
mask = np.isin(labels_tmp, keeps)
mask = np.logical_not(mask)
train_tmp = train_data[mask]
labels = labels[mask]

# train knn
# knn_other = KNeighborsClassifier(n_neighbors=5)
# knn_other.fit(train_tmp, labels)

# train random forest
param_grid = {
    'n_estimators': [80, 90, 100, 110, 120],
    'max_depth': [9, 10, 11, 12, 13]
}

rf_other = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_other, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1_macro')
grid_search.fit(train_tmp, labels)
print(grid_search.best_params_)

rf_other = grid_search.best_estimator_
# rf_other = RandomForestClassifier(n_estimators=100, max_depth=10)
rf_other.fit(train_tmp, labels)

val_mask = np.isin(val_labels, keeps)
val_mask = np.logical_not(val_mask)
val_tmp = val_data[val_mask]
val_labels_tmp = val_labels[val_mask]

# predict
# preds_knn = knn_other.predict(val_tmp)
preds_rf = rf_other.predict(val_tmp)

# accuracy
# acc_knn = accuracy_score(val_labels_tmp, preds_knn)
# f1_knn = f1_score(val_labels_tmp, preds_knn, average='macro')
# print(f'KNN accuracy: {acc_knn}')
# print(f'KNN f1: {f1_knn}')
acc_rf = accuracy_score(val_labels_tmp, preds_rf)
f1_rf = f1_score(val_labels_tmp, preds_rf, average='macro')
print(f'RF accuracy: {acc_rf}')
print(f'RF f1: {f1_rf}')

{'max_depth': 9, 'n_estimators': 80}
RF accuracy: 0.45454545454545453
RF f1: 0.5


In [9]:
# combine rf_keeps and rf_other
rf_keeps_preds = rf_keeps.predict(val_data)
rf_other_preds = rf_other.predict(val_data)
rf_preds = np.where(rf_keeps_preds == -1, rf_other_preds, rf_keeps_preds)

# accuracy
acc_rf = accuracy_score(val_labels, rf_preds)
f1_rf = f1_score(val_labels, rf_preds, average='macro')
print(f'RF accuracy: {acc_rf}')
print(f'RF f1: {f1_rf}')

RF accuracy: 0.864321608040201
RF f1: 0.4788243599458553


In [10]:
# combine rf_keeps and rf_other on test data
rf_keeps_preds = rf_keeps.predict(test_data)
rf_other_preds = rf_other.predict(test_data)
rf_preds = np.where(rf_keeps_preds == -1, rf_other_preds, rf_keeps_preds)

# print(rf_preds)
print(np.bincount(rf_preds))

# save predictions
rf_preds = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(rf_preds)]))
rf_preds.columns = ['Id', 'Category']
rf_preds.to_csv('rf_hierarchical.csv', index=False)

[107 217  42   2   4  14  27   2   4   0   0   0   0   1   2   4]
