In [2]:
# import everything
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix


In [3]:
# read data

train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# get data and labels
train_data = train.drop(['Target Variable (Discrete)'], axis=1)
train_labels = train['Target Variable (Discrete)']

# standardize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data_ = scaler.transform(train_data)
test_data = scaler.transform(test_data)

train_data, val_data, train_labels, val_labels = train_test_split(train_data_, train_labels, test_size=0.2, random_state=42)


In [4]:
def median_imputer(train_data, val_data, test_data):
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='median')
    train_data_imp = imputer.fit_transform(train_data)
    val_data_imp = imputer.transform(val_data)
    test_data_imp = imputer.transform(test_data)
    return train_data_imp, val_data_imp, test_data_imp


def augment_data(train_data, train_labels):
    from imblearn.over_sampling import RandomOverSampler
    oversample = RandomOverSampler()
    train_data_aug, train_labels_aug = oversample.fit_resample(train_data, train_labels)
    return train_data_aug, train_labels_aug

In [5]:
train_data_imp, val_data_imp, test_data_imp = median_imputer(train_data, val_data, test_data)
train_data_aug, train_labels_aug = augment_data(train_data_imp, train_labels)

In [15]:
rf = RandomForestClassifier(n_estimators=150)
rf.fit(train_data_aug, train_labels_aug)
rf_pred = rf.predict(val_data_imp)
accuracy_score(val_labels, rf_pred), f1_score(val_labels, rf_pred, average='macro')

(0.8693467336683417, 0.4692401598061975)

In [16]:
np.bincount(rf_pred)

array([56, 94, 29,  0,  1,  7, 11,  0,  0,  0,  0,  0,  0,  0,  1])

In [11]:
ovas = []

for i in range(18):
    # ovas.append(KNeighborsClassifier(n_neighbors=i))
    model = RandomForestClassifier(n_estimators=150, random_state=42)
    # model = SVC(kernel='rbf', C=1, gamma=0.1)
    # model = MLPClassifier(hidden_layer_sizes=(20), max_iter=500, alpha=0.01, solver='sgd', random_state=42,)
    train_labels_aug_ = [i if x == i else -1 for x in train_labels_aug]
    model.fit(train_data_aug, train_labels_aug_)
    ovas.append(model)



In [17]:
# predict
preds = []
for ova in ovas:
    preds.append(ova.predict(val_data_imp))

# print(preds)
preds = np.max(np.array(preds), axis=0)

# preds.shape
preds[preds == -1] = rf_pred[preds == -1]

accuracy_score(val_labels, preds), f1_score(val_labels, preds, average='macro')
# bad_bins = np.where(bins == 0)[0]

# bad_bins


[49 96 18  0  1  2 11  0  0  0  0  0  0  0  1  1  0  0]


(0.8592964824120602, 0.463370308777329)

In [18]:
np.bincount(preds)

array([53, 98, 29,  0,  1,  5, 11,  0,  0,  0,  0,  0,  0,  0,  1,  1])

In [8]:
weird_val_data = val_data_imp[preds == -1]
weird_val_data.shape

(0, 24)

In [22]:
# predict on test data

test_preds = []
for ova in ovas:
    test_preds.append(ova.predict(test_data_imp))

test_preds = np.max(np.array(test_preds), axis=0)
rf_test_pred = rf.predict(test_data_imp)
test_preds[test_preds == -1] = rf_test_pred[test_preds == -1]
np.bincount(test_preds, minlength=18)




array([102, 220,  54,   1,   3,  15,  27,   0,   1,   1,   0,   0,   0,
         0,   2,   0,   0,   0])

In [20]:
# write to csv
test_preds = pd.DataFrame(np.array([(i + 1, v) for i, v in enumerate(test_preds)]))
test_preds.columns = ['Id', 'Category']
test_preds.to_csv('ova_pred.csv', index=False)