In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the pre-split training and test tables from the local Data folder.
train, test = map(pd.read_csv, ('Data/train.csv', 'Data/test.csv'))

In [3]:
# For each table, separate the features (every column except 'target') from
# the labels (the 'target' column), keeping only the underlying `.values`.
x_train, y_train = train.drop(columns=['target']).values, train['target'].values
x_test, y_test = test.drop(columns=['target']).values, test['target'].values

# 1. Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then reuse that same fitted
# scaler for the test features so both splits share one transform.
scaler = StandardScaler()
scaler.fit(x_train)

# NOTE: x_train / x_test are overwritten here, so the unscaled arrays are no
# longer reachable under these names after this cell has run.
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# 2. Augmentation

In [5]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter

## Random Oversampling

In [6]:
# Resample the (already scaled) training split with RandomOverSampler; the
# fixed random_state keeps the resampled rows reproducible across runs.
ros = RandomOverSampler(random_state=42)
x_over, y_over = ros.fit_resample(x_train, y_train)

# Print (label, count) pairs in ascending label order to check class balance.
print(sorted(Counter(y_over).items()))

[(0, 8340692), (1, 8340692), (2, 8340692), (3, 8340692), (4, 8340692), (5, 8340692)]


## SMOTE (Synthetic Minority Over-sampling Technique)

In [7]:
# Resample the (already scaled) training split with SMOTE; the fixed
# random_state keeps the generated rows reproducible across runs.
sm = SMOTE(random_state=42)
x_smote, y_smote = sm.fit_resample(x_train, y_train)

# Print (label, count) pairs in ascending label order to check class balance.
print(sorted(Counter(y_smote).items()))

[(0, 8340692), (1, 8340692), (2, 8340692), (3, 8340692), (4, 8340692), (5, 8340692)]


In [8]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, x_tr, x_te, y_tr, y_te):
    """Print accuracy and per-class F1 of a fitted model on two splits.

    Exactly two lines are printed: the first for the training split
    (``x_tr`` / ``y_tr``), the second for the evaluation split
    (``x_te`` / ``y_te``). Each line holds the accuracy followed by the
    result of ``f1_score(..., average=None)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    x_tr, x_te : array-like
        Inputs handed to ``model.predict`` for each split.
    y_tr, y_te : array-like
        True labels for the corresponding inputs.

    Returns
    -------
    None
        Results are only printed, so calling this as the last expression of
        a cell does not produce an extra ``Out[]`` value.
    """
    for features, labels in ((x_tr, y_tr), (x_te, y_te)):
        predicted = model.predict(features)
        print(accuracy_score(labels, predicted),
              f1_score(labels, predicted, average=None))


# Backward-compatible alias: existing cells call `eval(...)`. That name shadows
# Python's builtin `eval`, so new code should call `evaluate(...)` instead.
eval = evaluate

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a seeded decision tree trained on the scaled, un-resampled data.
# NOTE(review): the variable is called `rf` but holds a DecisionTreeClassifier.
rf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# NOTE(review): the recorded output shows 1.0 on both train and test; worth
# confirming that no feature leaks the target.
eval(rf, x_train, x_test, y_train, y_test)

1.0 [1. 1. 1. 1. 1. 1.]
1.0 [1. 1. 1. 1. 1. 1.]


In [10]:
# Same seeded decision tree, trained on the randomly over-sampled data.
# NOTE(review): the variable is called `rf` but holds a DecisionTreeClassifier.
rf = DecisionTreeClassifier(random_state=42).fit(x_over, y_over)

# The first printed line is scored on the resampled training data, the
# second on the untouched test split.
eval(rf, x_over, x_test, y_over, y_test)

1.0 [1. 1. 1. 1. 1. 1.]
1.0 [1. 1. 1. 1. 1. 1.]


In [11]:
# Same seeded decision tree, trained on the SMOTE-resampled data.
# NOTE(review): the variable is called `rf` but holds a DecisionTreeClassifier.
rf = DecisionTreeClassifier(random_state=42).fit(x_smote, y_smote)

# The first printed line is scored on the resampled training data, the
# second on the untouched test split.
eval(rf, x_smote, x_test, y_smote, y_test)

1.0 [1. 1. 1. 1. 1. 1.]
1.0 [1. 1. 1. 1. 1. 1.]
