# Importing Training and Testing Data

In [10]:
import numpy as np
import pandas as pd

# Read the labelled training set from disk.
train_data = pd.read_csv('data/train.csv')

# The target is the 'category' column; the feature matrix is every
# remaining column except the 'ID' identifier.
y_t = train_data['category']
X_t = train_data.drop(columns=['category', 'ID'])

# Read the test set; it is kept whole here (its 'ID' column is needed later
# when the submission file is built).
test_data = pd.read_csv('data/test.csv')

# Approach 1: Using `sklearn`

## Training on the data without any preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



# Hold out 20% of the labelled data for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)

# Seed the forest as well: an unseeded RandomForestClassifier draws different
# bootstrap samples / feature subsets on every run, so the reported accuracy
# would not be reproducible or comparable with the later experiments that
# refit this same `rf` object.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Mean accuracy on the held-out 20%.
print(rf.score(X_test, y_test))


## Removing outliers using Isolation Forest and training again

In [None]:
from pipeline import OutlierDetection
# import importlib
# importlib.reload(OutlierDetection)

# Run the project's isolation-forest outlier step over the full labelled set.
# Presumably transform() returns the features and labels with the flagged
# rows removed — TODO confirm against pipeline.py.
od = OutlierDetection('isolation_forest')
X_t_iso, y_t_iso = od.transform(X_t, y_t)

# 80/20 split of the filtered data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_t_iso,
    y_t_iso,
    test_size=0.2,
    random_state=42,
)

# Refit the existing `rf` estimator on the new training split, then report
# its accuracy on the new hold-out split.
rf.fit(X_train, y_train)
iso_accuracy = rf.score(X_test, y_test)
print("Accuracy:", iso_accuracy)


## Removing outliers using LOF and training again

In [None]:
from pipeline import OutlierDetection

# Same experiment with the project's 'lof' outlier step.
# Presumably transform() returns the features and labels with the flagged
# rows removed — TODO confirm against pipeline.py.
lof = OutlierDetection('lof')
X_t_lof, y_t_lof = lof.transform(X_t, y_t)

# 80/20 split of the filtered data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_t_lof,
    y_t_lof,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so fitting and scoring can be chained.
print("Accuracy:", rf.fit(X_train, y_train).score(X_test, y_test))

## Now also doing dimensionality reduction using LDA

In [None]:
from pipeline import DimReduction

# Local imports: this cell needs two scikit-learn estimators that no earlier
# cell brings into scope.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import IsolationForest

# Fit LDA on the full labelled set and project the features.
# scikit-learn's LDA is used directly (rather than pipeline.DimReduction) so
# that the fitted `lda` object exists for transforming the test set when the
# submission is generated.
# n_components is left at its default; pass it explicitly if a specific
# output dimensionality is wanted.
lda = LinearDiscriminantAnalysis()
X_t_lda = lda.fit_transform(X_t, y_t)

# Flag outliers in the reduced space. predict() labels inliers as 1, so the
# mask below keeps only those rows. The seed makes the flagged set repeatable.
clf = IsolationForest(random_state=42).fit(X_t_lda)
y_pred = clf.predict(X_t_lda)

inlier_mask = y_pred == 1
X_t_lda_iso = X_t_lda[inlier_mask]
y_t_lda_iso = y_t[inlier_mask]

X_train, X_test, y_train, y_test = train_test_split(X_t_lda_iso, y_t_lda_iso, test_size=0.2, random_state=0)

# Refit the existing `rf` estimator on the reduced, filtered data and report
# hold-out accuracy, followed by the shape of the original feature matrix.
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(X_t.shape)

## Generating submission file

In [None]:
from datetime import datetime
from pathlib import Path

# Features for the competition test set: every column except the 'ID'
# identifier. NOTE: this rebinds `X_test`, which earlier cells use for the
# hold-out split.
X_test = test_data.drop(['ID'], axis=1)

# Project the test features with the `lda` object fitted in an earlier cell.
X_test_lda = lda.transform(X_test)

# Refit on all of the reduced, outlier-filtered training data (not just the
# 80% training split) before predicting.
rf.fit(X_t_lda_iso, y_t_lda_iso)
y_pred = rf.predict(X_test_lda)

submission = pd.DataFrame({'ID': test_data['ID'], 'Category': y_pred})

# to_csv does not create missing directories, so make sure the output folder
# exists before writing the timestamped file.
submission_dir = Path("submissions")
submission_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_dir / f"submission_{datetime.now().strftime('%Y_%m_%d-%H_%M')}.csv", index=False)

# Approach 2: Pipelining the process

In [11]:
import metaclass as mc
import importlib
# Re-execute metaclass.py so edits made since it was first imported take
# effect without restarting the kernel. As the cell's last expression, the
# returned module object is displayed as the cell output.
importlib.reload(mc)

<module 'metaclass' from 'c:\\Users\\aniru\\Desktop\\College\\Statistical Machine Learning (CSE342)\\Project\\metaclass.py'>

In [12]:
import metaclass as mc
import importlib
importlib.reload(mc)

# Stage selection for the project pipeline: no clustering step, LDA for
# dimensionality reduction, isolation forest for outlier removal and a
# random forest ("rf") as the classifier.
pipeline_config = {
    "clustering_alg": None,
    "dim_reduction_alg": "lda",
    "outlier_detection_alg": "isolation_forest",
    "classification_alg": "rf",
}
pipeline = mc.Pipeline(**pipeline_config)

# NOTE(review): the saved output of this cell is
# "TypeError: DimReduction.transform() takes 2 positional arguments but 3
# were given". The call that passes the extra argument is not in this cell;
# it is presumably inside metaclass.py — verify there. It is not visible from
# here whether fit() or cross_validate() triggers it.
pipeline.fit(X_t, y_t)
cv_scores = pipeline.cross_validate(X_t, y_t)

TypeError: DimReduction.transform() takes 2 positional arguments but 3 were given

In [None]:
# Pass the raw test DataFrame (including its 'ID' column) to the pipeline's
# submission step. What it writes or returns is defined in metaclass.py and
# is not visible from this notebook.
pipeline.generate_submission(test_data)