# Importing Training and Testing Data

In [13]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('data/train.csv')

X_t = train_data.drop(['category', 'ID'], axis=1)
y_t = train_data['category']

test_data = pd.read_csv('data/test.csv')

# Approach 1: Using `sklearn`

## Training Data without any preprocessing

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

print(rf.score(X_test, y_test))


0.680327868852459


## Removing outliers using Isolation Forest and training again

In [16]:
from sklearn.ensemble import IsolationForest

clf = IsolationForest().fit(X_t)
y_pred = clf.predict(X_t)

# percentage of outliers
print(sum(y_pred == -1) / len(y_pred))

# removing outliers
X_t_iso = X_t[y_pred == 1]
y_t_iso = y_t[y_pred == 1]

X_train, X_test, y_train, y_test = train_test_split(X_t_iso, y_t_iso, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))


0.018092105263157895
0.6861924686192469


Not much noticeable improvement

## Now also doing dimensionality reduction using LDA

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_t, y_t)
X_t_lda = lda.transform(X_t)

clf = IsolationForest().fit(X_t_lda)
y_pred = clf.predict(X_t_lda)

X_t_lda_iso = X_t_lda[y_pred == 1]
y_t_lda_iso = y_t[y_pred == 1]

X_train, X_test, y_train, y_test = train_test_split(X_t_lda, y_t, test_size=0.2, random_state=0)

rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))

0.9959016393442623
