# Choose Classifier Based on Nearest Neighbour

In [17]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [18]:
def find_nearest_neighbor_index(row, dataset):
    """Return the position of the sample in ``dataset`` that is closest to ``row``.

    A ``MinMaxScaler`` is fitted on ``dataset`` and applied to both inputs, so
    every feature contributes on a comparable scale. Closeness is the squared
    Euclidean distance in that scaled space.

    Parameters
    ----------
    row : array-like of shape (n_features,)
        The query sample.
    dataset : array-like of shape (n_samples, n_features)
        The reference samples to search.

    Returns
    -------
    numpy integer
        Index of the nearest reference sample (the first one on ties).

    Notes
    -----
    The scaler is refitted on every call, so calling this in a loop over many
    query rows repeats that work each time.
    """
    scaler = MinMaxScaler()
    dataset_scaled = scaler.fit_transform(dataset)
    # transform() expects a 2-D input; wrap the row and unwrap the result.
    row_scaled = scaler.transform([row])[0]

    # One vectorised pass over all reference rows instead of a Python-level loop.
    squared_distances = np.sum((dataset_scaled - row_scaled) ** 2, axis=1)
    return np.argmin(squared_distances)

Test implementation:

In [31]:
# Read the three pipe-separated splits in one pass.
df_train, df_val, df_test = [
    pd.read_csv(csv_path, sep="|")
    for csv_path in (
        "../data/train_new.csv",
        "../data/val_new.csv",
        "../data/test.csv",
    )
]

# Separate the "fraud" target column from the remaining feature columns.
train_X, train_y = df_train.drop(columns="fraud"), df_train["fraud"]
val_X, val_y = df_val.drop(columns="fraud"), df_val["fraud"]

# The test frame is used as features as-is; copy it so df_test stays untouched.
test_X = df_test.copy()


# Fixed seed so that repeated runs produce the same fitted models. This matters
# most for the SVC: with probability=True its probability estimates come from
# an internal cross-validation whose data shuffling is otherwise random.
SEED = 42

classifiers = {
    'xgb': XGBClassifier(random_state=SEED),
    'svm': SVC(probability=True, gamma="auto", random_state=SEED)
}

# Fit every classifier on the training split. fit() returns the estimator
# itself, so classifiers_fitted maps each name to the same (now fitted) object
# that is stored in `classifiers`.
classifiers_fitted = {name: clf.fit(train_X.values, train_y.values) for name, clf in classifiers.items()}

# Predict val_X to get class probabilities.
df_val_ext = df_val.copy()

# predict_proba returns one column per class; transposing gives one row per
# class, i.e. shape (n_classes, n_samples).
xgp_probas = classifiers_fitted['xgb'].predict_proba(val_X.values).T
svm_probas = classifiers_fitted['svm'].predict_proba(val_X.values).T

# Confidence of each classifier = probability of its most likely class.
xgb_proba = xgp_probas.max(axis=0)
svm_proba = svm_probas.max(axis=0)

# Predicted label: threshold the probability of class 1, NOT the confidence
# (the confidence is always >= 0.5 for two classes, so thresholding it would
# predict 1 almost everywhere).
# Assumes binary labels 0/1, so that row 1 holds P(label == 1) -- TODO confirm.
xgb_pred = np.where(xgp_probas[1] > 0.5, 1, 0)
svm_pred = np.where(svm_probas[1] > 0.5, 1, 0)

# Only credit a classifier with its confidence where its prediction was
# right; a wrong prediction counts as confidence 0.
xgb_proba_true = np.where(xgb_pred == val_y.values, xgb_proba, 0)
svm_proba_true = np.where(svm_pred == val_y.values, svm_proba, 0)

df_val_ext["xgb_prob"] = xgb_proba_true
df_val_ext["svm_prob"] = svm_proba_true

# Predict each row with the classifier that did best on its nearest validation neighbour.
def predict(X, max_rows=1000):
    """Predict labels row by row, choosing the classifier per row.

    For every processed row of ``X`` the nearest row of ``val_X`` is looked up
    with ``find_nearest_neighbor_index``. If that validation row has a strictly
    higher ``xgb_prob`` than ``svm_prob`` in ``df_val_ext``, the fitted 'xgb'
    classifier predicts the row; otherwise (including ties) 'svm' does.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows to predict; only ``X.values`` is used.
    max_rows : int or None, default 1000
        Only the first ``max_rows`` rows of ``X`` are processed. Pass ``None``
        to process every row. The neighbour lookup runs once per row, so large
        inputs take correspondingly long.

    Returns
    -------
    list
        One predicted label per processed row, in input order.
    """
    rows = X.values if max_rows is None else X.values[:max_rows]

    # Loop-invariant: for each validation row, did xgb score higher than svm?
    xgb_is_better = (df_val_ext["xgb_prob"] > df_val_ext["svm_prob"]).values

    test_y = []
    for test_row_X in rows:
        idx = find_nearest_neighbor_index(test_row_X, val_X.values)
        clfid = 'xgb' if xgb_is_better[idx] else 'svm'
        test_row_y_pred = classifiers_fitted[clfid].predict([test_row_X])
        test_y.append(test_row_y_pred[0])
    return test_y

Validate on val set

In [25]:
def calc_scores(y_test, y_pred):
    """Compute accuracy, the cost-weighted score and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels (expected to be 0/1).
    y_pred : array-like
        Predicted labels, same length as ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, dmc_score, confusion_matrix)`` where ``confusion_matrix``
        is the 2x2 matrix with true labels as rows and predicted labels as
        columns, and ``dmc_score`` is the sum of its cells weighted by the
        cost matrix ``[[0, -25], [-5, 5]]``: true 0/pred 0 -> 0,
        true 0/pred 1 -> -25, true 1/pred 0 -> -5, true 1/pred 1 -> +5.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # sample containing a single class yields a 1x1 matrix that would be
    # silently broadcast against the cost matrix and give a wrong score.
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return accuracy, dmc_score, confusion_matrix

val_y_pred = predict(val_X)
# predict() caps the number of rows it processes, so it can return fewer
# predictions than val_X has rows. Trim the true labels to the same length
# so the metrics compare like with like instead of raising on a mismatch.
calc_scores(val_y.iloc[:len(val_y_pred)], val_y_pred)

Test on Test set

In [None]:
# Predict the test split with the neighbour-based classifier selection.
# Note: predict() only processes the first 1000 rows of its input, so
# test_y_pred can be shorter than test_X.
test_y_pred = predict(test_X)
# Submit to DMC: test_y_pred

Ideas for the future:

In [None]:
# Alternatively, predict normally with several classifiers and either
# - take the most frequent result by majority vote, or
# - take the result of the most confident classifier