In [55]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [56]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels_ = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
train_data_ = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# set numpy random seed
np.random.seed(42)

# impute missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data_)
train_data_ = imputer.transform(train_data_)
test_data = imputer.transform(test_data)
    
# split data into train and validation
train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels_, test_size=0.2)
# train_data, train_labels = train_data_, train_labels_

# oversample
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler()
train_data, train_labels = oversample.fit_resample(train_data, train_labels)

# # print shapes
train_data.shape, train_labels.shape, val_data.shape, val_labels.shape, test_data.shape


((6749, 24), (6749,), (199, 24), (199,), (426, 24))

In [57]:
np.bincount(train_labels) + np.bincount(val_labels, minlength=18)

array([447, 488, 423, 398, 398, 405, 408, 399, 402, 397, 397,   1, 397,
       398, 398, 398, 397, 397])

In [58]:
subclasses = [
    [0, 1, 2, 5, 6, 9, 10, 11, 12, 14, 16, 17],
    [3, 4, 7, 8, 13, 15]
]

def train_differentiator(subclasses):
    # decides which subclass to use

    labels_tmp = train_labels.copy()
    for i in range(len(subclasses)):
        keeps = subclasses[i]
        mask = np.isin(train_labels, keeps)
        labels_tmp[mask] = i

    # train knn
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_data, labels_tmp)

    # train random forest
    rf = RandomForestClassifier(n_estimators=100, max_depth=10)
    rf.fit(train_data, labels_tmp)

    return knn, rf


def train_subclass(subclass):

    mask = np.isin(train_labels, subclass)

    train_data_tmp = train_data[mask]
    labels_tmp = train_labels[mask]

    # train knn
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_data_tmp, labels_tmp)

    # train random forest
    rf = RandomForestClassifier(n_estimators=100, max_depth=10)
    rf.fit(train_data_tmp, labels_tmp)

    return knn, rf

differentiator_knn, differentiator_rf = train_differentiator(subclasses)
subclass_models = []
for subclass in subclasses:
    subclass_models.append(train_subclass(subclass))

def predict_combined(data):
    diff_preds = differentiator_rf.predict(data)
    subclass_preds = []
    for i in range(len(subclass_models)):
        knn, rf = subclass_models[i]
        subclass_preds.append(rf.predict(data))

    final_preds = np.zeros(len(data))

    for i in range(len(subclasses)):
        subclass = subclasses[i]
        mask = np.isin(diff_preds, i)
        final_preds[mask] = subclass_preds[i][mask]
        
    return final_preds.astype(int)

In [59]:
preds = predict_combined(val_data)
acc = accuracy_score(val_labels, preds)
f1 = f1_score(val_labels, preds, average='macro')
print(f'Combined accuracy: {acc}')
print(f'Combined f1: {f1}')

Combined accuracy: 0.8442211055276382
Combined f1: 0.4549076716631934


In [60]:
preds = predict_combined(test_data)

print(preds)
print(np.bincount(preds))

# save predictions
preds = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(preds)]))
preds.columns = ['Id', 'Category']
preds.to_csv('3split_preds.csv', index=False)



[ 6  2  1  1  1  2  1  5  1  0  1  6  0  1  2  0  2  1  2  1  5  5  1  2
  0  2  6  0  1  0  1  1  2  6  1  0  6  0  0  5  1  1  1  1  0  4  1  0
  6  1  2  1  1  1  1  1  6  2  0  0  1  1  1  6  1  1  2  2  1  2  1  2
  1  1  6  1  1  1  1  1  1  2  0  4  6  0  1  1  1  1  1  0  8  2  1  2
  0  1  2  0  1  0  0  1  1  1  1  2  0  8  1  0  0  0  0  6  0  1  2  1
  0  1  1  1  6  1  1  2  1  1  1  1  1  0  0  2  2  2  1  0  0  1  0  2
  1  1  6  2  3  0  1  1  6  1  1  1  1  0  1  1  0  1  0  0  2  1  0  0
  0  1  0  0  1  1  1  1  1  1  0  0  2  5  1  1  2  1  1  5  4  1  5  0
  1  6  1  1  0  2  1  1  1  0  1  6  1  1  1  0  1 14  0  0  0  0  2  1
  1  0  1  1  1  1  0  1  1  6  1  1  0  2  2  1  1  5  1  0  2  1  0  0
  0  1  1  5  6  5  1  6  0  1  0 15  1  1  1  2  0  0  0  1  6  1  1  0
  0  0  0  0  0  0  0  0  1  8  5  6  0  0  3  1  1  1  1  5  5  1  2  1
  0  1  0  1  1  6  0  5  0  1 14  0  1  1  1  1  2  1  1  1  6  1  2  1
  2  0  1  1  2  2  2  1  1  1  0  0  2  0  1  0  1