# Importing Training and Testing Data

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training and test CSVs from the local data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Features are every training column except the 'category' target and 'ID'.
X_t = train_data.drop(columns=['category', 'ID'])
y_t = train_data['category']

# Hold out 10% of the labelled rows for validation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_t,
    y_t,
    test_size=0.1,
    random_state=42,
)

# Approach 2: Pipelining the process

In [28]:
import importlib

import metaclass as mc
import pipeline_components as pc

# Reload the already-imported local module so that edits made to metaclass.py
# take effect without restarting the kernel. Only `mc` is reloaded here.
importlib.reload(mc)

<module 'metaclass' from 'c:\\Users\\aniru\\Desktop\\College\\Statistical Machine Learning (CSE342)\\Project\\metaclass.py'>

In [29]:
# Build the project-local pipeline (metaclass.Pipeline). Its implementation is
# not part of this notebook, so the notes below describe only what is passed in:
#   clustering_alg         -> ("kmeans", 4)   (4 is presumably the cluster count; confirm in metaclass.py)
#   dim_reduction_algs     -> PCA with 350, then LDA with 19 (presumably component counts; confirm)
#   outlier_detection_alg  -> None, i.e. no outlier detector is selected
#   classification_alg     -> "logistic"
#   ensemble_algs          -> ["bagging"]
pipeline = mc.Pipeline(
    clustering_alg=("kmeans", 4),
    dim_reduction_algs=[("pca", 350), ("lda", 19)],
    outlier_detection_alg=None,
    classification_alg="logistic",
    ensemble_algs=["bagging"],
)
# Disabled alternative: fit on the full labelled set instead of the training split.
# pipeline.fit(X_t, y_t)
# Fit on the training split only, so X_test / y_test stay unseen for validation.
pipeline.fit(X_train, y_train)
print("Pipeline done")

# Disabled: 5-split cross-validation on the full labelled set via the pipeline's own helper.
# cv_scores = pipeline.cross_validate(X_t, y_t, n_splits=5)
# print("Cross validation scores: ", cv_scores)

Standardizing data...
Currently at clustering: ('kmeans', 4)
Shape of X_t: (1094, 4096)
Shape of y_pred: (1094,)
Shape of X_t_kmeans: (1094, 4097)
Currently at dim reduction
X_t shape: (1094, 19)
Currently at outlier removal: None
Currently at classifier: logistic
Currently at ensembling
Done!
Pipeline done


In [30]:
# Validation accuracy: fraction of held-out rows (X_test / y_test) whose
# predicted label matches the true label.
y_pred = pipeline.predict(X_test)
print("Validation accuracy: ", np.mean(y_pred == y_test))

# 255 pca, 1 lda, logistic, no ensemble
# cv_scores

Shape of X_t: (122, 4096)
Shape of y_pred: (122,)
Shape of X_t_kmeans: (122, 4097)
Validation accuracy:  0.7868852459016393


In [31]:
# pipeline.generate_submission(test_data)

# Approach 3 - New Pipeline

# Imports

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from pipeline_components import *
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [41]:
def generate_submission(pipeline, test_data):
    """Predict labels for ``test_data`` and write them to a timestamped CSV.

    Parameters
    ----------
    pipeline : object with a ``predict`` method
        Called as ``pipeline.predict(features)``, where ``features`` is
        ``test_data`` without its ``'ID'`` column.
    test_data : pandas.DataFrame
        Must contain an ``'ID'`` column; all remaining columns are passed to
        the pipeline as features.

    Returns
    -------
    None

    Side effects
    ------------
    Writes ``submissions/submission_<YYYY_MM_DD-HH_MM>.csv`` (relative to the
    current working directory) with the columns ``ID`` and ``Category``. The
    ``submissions`` directory is created if it does not exist. The file name
    has minute resolution, so a second call within the same minute overwrites
    the first file.
    """
    from datetime import datetime
    from pathlib import Path

    # Named `features` rather than `X_test` to avoid shadowing the notebook's
    # validation split of the same name.
    features = test_data.drop(['ID'], axis=1)
    y_pred = pipeline.predict(features)

    submission = pd.DataFrame({'ID': test_data['ID'], 'Category': y_pred})

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    output_dir = Path("submissions")
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    submission.to_csv(output_dir / f"submission_{timestamp}.csv", index=False)


In [36]:
# Approach 3 baseline: PCA (300 components) -> LDA (19 components) -> logistic regression.
pipeline = Pipeline([
    # Keep the step label in sync with n_components. random_state pins PCA's
    # randomized SVD solver, which the default 'auto' policy can select for an
    # input of this size, so repeated runs give the same projection and scores.
    ("PCA 300", PCA(n_components=300, random_state=42)),
    ("LDA 19", LinearDiscriminantAnalysis(n_components=19)),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
])

# Fit on the full labelled set; this fitted pipeline is the one later passed to
# generate_submission.
pipeline.fit(X_t, y_t)
print("Pipeline done")

# cross_val_score fits clones of the pipeline on each of the 5 folds, so the
# fitted pipeline above is left untouched.
cross_val_scores = cross_val_score(pipeline, X_t, y_t, cv=5)

Pipeline done


In [38]:
# Mean and standard deviation of the 5 per-fold cross-validation scores.
cross_val_scores.mean(), cross_val_scores.std()

(0.7862038723605209, 0.021079661465683932)

In [42]:
# Predict on test_data with the pipeline fitted above and write the result to a
# timestamped CSV under submissions/.
generate_submission(pipeline, test_data)