In [1]:
import seaborn as sns
sns.set()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Fashion-MNIST train and test splits from CSV files in the working directory.
train_data = pd.read_csv('fashion-mnist_train.csv')
test_data = pd.read_csv('fashion-mnist_test.csv')

In [3]:
# Column 0 is used as the target; every remaining column is a feature.
y, X = train_data.iloc[:, 0].values, train_data.iloc[:, 1:].values

In [4]:
# Split the held-out frame the same way as the training frame:
# column 0 is the target, the remaining columns are the features.
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

### Scale the data from 0 to 1

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training set only, then apply those
# same statistics to the test set. Re-fitting the scaler on X_test would scale
# the two sets differently and leak test-set information into preprocessing.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

### Apply SVD function to reduce the number of dimensions
I selected 300 components, which together retain about 97.3% of the variance (the sum of the explained variance ratios).

In [6]:
# TruncatedSVD uses a randomized solver by default, so pin random_state to make
# the components (and every score computed on them below) reproducible.
svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(X)

TruncatedSVD(n_components=300)

In [7]:
# Total fraction of variance captured by the fitted components.
retained_variance = svd.explained_variance_ratio_.sum()
print(retained_variance)

0.9733544630044759


Project X and X_test onto the fitted SVD components.

In [8]:
# Project the training features onto the components learned by svd.
X_new = svd.transform(X)

In [9]:
# Project the test features with the same fitted svd (no re-fitting on test data).
X_test_new = svd.transform(X_test)

### Train a discriminative classifier (multinomial logistic regression) on both the SVD-reduced training data and the original training data

In [10]:
# Baseline: logistic regression with default settings, fit on the SVD-reduced features.
clf = LogisticRegression().fit(X_new, y)
clf

LogisticRegression()

In [11]:
# Candidate values searched for the logistic regression:
# regularisation parameter C and the solver iteration cap.
grid_params = dict(
    C=[1, 5, 10],
    max_iter=[100, 300, 500],
)

In [12]:
# 10-fold cross-validated search over grid_params on the SVD-reduced training data.
gs_LR = GridSearchCV(estimator=clf, param_grid=grid_params, cv=10).fit(X_new, y)

print("Best Parameters : ", gs_LR.best_params_)
print("Best Score : ", gs_LR.best_score_)
# Score of the refit best estimator on the held-out test set.
print("Best Test Score : ", gs_LR.score(X_test_new, y_test))

Best Parameters :  {'C': 1, 'max_iter': 500}
Best Score :  0.8557166666666667
Best Test Score :  0.8538


In [13]:
# Build the final model from the grid-search winner instead of hard-coding
# values copied from the printed output, so this cell stays correct if the
# search result changes on a re-run. A fresh estimator is created (rather than
# reusing gs_LR.best_estimator_) because clf is refit on other data later.
clf = LogisticRegression(**gs_LR.best_params_)
clf.fit(X_new, y)

LogisticRegression(C=1, max_iter=500)

In [14]:
# Score the tuned logistic regression on the SVD-reduced test features.
print(f'accuracy on test data = {clf.score(X_test_new,y_test)}')

accuracy on test data = 0.8538


In [15]:
# original data: refit the same estimator on the scaled, full-dimensional features.
# NOTE: this overwrites the SVD-fitted model held in clf, so the SVD scoring
# cell above cannot be re-run after this one without refitting.
clf.fit(X,y)

LogisticRegression(C=1, max_iter=500)

In [16]:
# Score the logistic regression (now fit on the original features) on the original test features.
print(f'accuracy on test data = {clf.score(X_test,y_test)}')

accuracy on test data = 0.8544


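### Train a generative classifier (Naive Bayes) on both the SVD-reduced training data and the original training data

Note: the cells below fit Naive Bayes on the original (scaled) data only. MultinomialNB requires non-negative features, which the SVD projection does not guarantee.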

In [17]:
# Baseline multinomial naive Bayes with default settings, fit on the scaled
# original features (X) rather than the SVD projection.
# NOTE(review): presumably because MultinomialNB rejects negative inputs and
# SVD components can be negative - confirm.
clf2 = MultinomialNB().fit(X, y)
clf2

MultinomialNB()

In [18]:
# Candidate smoothing strengths searched for the naive Bayes model.
grid_params = dict(alpha=[1, 5, 10])

In [19]:
# 10-fold cross-validated search over grid_params on the scaled original training data.
gs_NB = GridSearchCV(estimator=clf2, param_grid=grid_params, cv=10).fit(X, y)

print("Best Parameters : ", gs_NB.best_params_)
print("Best Score : ", gs_NB.best_score_)
# Score of the refit best estimator on the held-out test set.
print("Best Test Score : ", gs_NB.score(X_test, y_test))

Best Parameters :  {'alpha': 1}
Best Score :  0.6649166666666667
Best Test Score :  0.6674


In [21]:
# Build the final model from the grid-search winner instead of hard-coding the
# value copied from the printed output, so this cell stays correct if the
# search result changes on a re-run.
clf2 = MultinomialNB(**gs_NB.best_params_)
clf2.fit(X, y)

MultinomialNB(alpha=1)

In [22]:
# Score the tuned naive Bayes model on the original (scaled) test features.
print(f'accuracy on test data = {clf2.score(X_test,y_test)}')

accuracy on test data = 0.6674


### Train a generative classifier (KNN) on both the SVD-reduced training data and the original training data

In [23]:
# Baseline k-nearest-neighbours classifier with default settings, fit on the SVD-reduced features.
clf3 = KNeighborsClassifier().fit(X_new, y)
clf3

KNeighborsClassifier()

In [24]:
# Candidate neighbourhood sizes searched for the KNN classifier.
grid_params = dict(n_neighbors=[1, 5, 10, 15, 20])

In [25]:
# 10-fold cross-validated search over grid_params on the SVD-reduced training data.
gs = GridSearchCV(estimator=clf3, param_grid=grid_params, cv=10).fit(X_new, y)

print("Best Parameters : ", gs.best_params_)
print("Best Score : ", gs.best_score_)
# Score of the refit best estimator on the held-out test set.
print("Best Test Score : ", gs.score(X_test_new, y_test))

Best Parameters :  {'n_neighbors': 5}
Best Score :  0.8622000000000002
Best Test Score :  0.8647


In [26]:
# Build the final model from the grid-search winner instead of hard-coding the
# value copied from the printed output, so this cell stays correct if the
# search result changes on a re-run. A fresh estimator is created (rather than
# reusing gs.best_estimator_) because clf3 is refit on other data later.
clf3 = KNeighborsClassifier(**gs.best_params_)
clf3.fit(X_new, y)

KNeighborsClassifier()

In [27]:
# Score the tuned KNN classifier on the SVD-reduced test features.
print(f'accuracy on test data = {clf3.score(X_test_new,y_test)}')

accuracy on test data = 0.8647


In [28]:
# Refit the same KNN estimator on the scaled, full-dimensional features.
# NOTE: this overwrites the SVD-fitted model held in clf3.
clf3.fit(X,y)

KNeighborsClassifier()

In [29]:
# Score the KNN classifier (now fit on the original features) on the original test features.
print(f'accuracy on test data = {clf3.score(X_test,y_test)}')

accuracy on test data = 0.8592


# Write a brief description to compare the performances of these classifiers in terms of accuracy on the test set.

### From the results, logistic regression and KNN show similar accuracy on the test set (roughly 0.85 and 0.86, respectively). The naive Bayes classifier, on the other hand, shows much lower accuracy (about 0.67). This is likely because the naive Bayes assumption that all pixels are independent of each other given the class does not hold for images. That is, the naive Bayes classifier is not well suited to classifying the Fashion-MNIST dataset.