# Importing Training and Testing Data

In [1]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('data/train.csv')

X_t = train_data.drop(['category', 'ID'], axis=1)
y_t = train_data['category']

test_data = pd.read_csv('data/test.csv')

# Approach 1: Using `sklearn`

## Training Data without any preprocessing

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

print(rf.score(X_test, y_test))


0.680327868852459


## Removing outliers using Isolation Forest and training again

In [3]:
from pipeline import OutlierDetection
# import importlib
# importlib.reload(OutlierDetection)

od = OutlierDetection('isolation_forest')
X_t_iso, y_t_iso = od.transform(X_t, y_t)

X_train, X_test, y_train, y_test = train_test_split(X_t_iso, y_t_iso, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)
print("Accuracy:", rf.score(X_test, y_test))


Outlier percentage: 0.01069078947368421
Accuracy: 0.6556016597510373


## Removing outliers using LOF and training again

In [4]:
from pipeline import OutlierDetection

lof = OutlierDetection('lof')
X_t_lof, y_t_lof = lof.transform(X_t, y_t)

X_train, X_test, y_train, y_test = train_test_split(X_t_lof, y_t_lof, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)
print("Accuracy:", rf.score(X_test, y_test))

Outlier percentage: 0.0024671052631578946
Accuracy: 0.7448559670781894


## Now also doing dimensionality reduction using LDA

In [5]:
from pipeline import DimReduction

dr = DimReduction('pca', 19)
X_t_pca, y_t_pca = dr.transform(X_t, y_t)


clf = IsolationForest().fit(X_t_lda)
y_pred = clf.predict(X_t_lda)

X_t_lda_iso = X_t_lda[y_pred == 1]
y_t_lda_iso = y_t[y_pred == 1]

X_train, X_test, y_train, y_test = train_test_split(X_t_lda_iso, y_t_lda_iso, test_size=0.2, random_state=0)

rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(X_t.shape)

NameError: name 'IsolationForest' is not defined

## Generating submission file

In [None]:
from datetime import datetime

X_test = test_data.drop(['ID'], axis=1)

X_test_lda = lda.transform(X_test)

rf.fit(X_t_lda_iso, y_t_lda_iso)
y_pred = rf.predict(X_test_lda)

submission = pd.DataFrame({'ID': test_data['ID'], 'Category': y_pred})
submission.to_csv(f"submissions/submission_{(datetime.now()).strftime('%Y_%m_%d-%H_%M')}.csv", index=False)