In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_mldata
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')



mnist = fetch_mldata('MNIST original')
X, y = mnist["data"], mnist["target"]

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict

# ---------------- binary task: "is this digit a 5?" ----------------
# Boolean target: True where the label is 5, False otherwise.
y_train_5 = y_train == 5

sgd_clf = SGDClassifier(max_iter = 5, random_state=42)
sgd_clf.fit(X_train, y_train_5)

# Only the last expression of a cell is echoed, so a bare cross_val_score(...)
# call in the middle of the cell would be computed and then thrown away.
# Keep the scores and print them like the other results below.
sgd_scrs_5 = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring = "accuracy")
print("(SGD 5/non-5)", sgd_scrs_5)

# Cross-validated predictions for the 5 / not-5 task. Stored under their own
# name so the multiclass predictions below do not overwrite them.
y_train_pred_5 = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)


# ---------------- multiclass task: all digit labels ----------------

sgd_scrs = cross_val_score(sgd_clf, X_train, y_train, cv = 3, scoring = "accuracy")
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print("(SGD non scaled)", sgd_scrs)

# Same multiclass evaluation with a random forest for comparison.
forest_clf = RandomForestClassifier(random_state = 42)
frst_scrs = cross_val_score(forest_clf, X_train, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest)", frst_scrs)



In [None]:
# Standardise the pixel features with StandardScaler, then score the same SGD
# classifier on the scaled inputs.
# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so each fold's scaling statistics also include its validation rows.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(np.asarray(X_train, dtype=np.float64))

sgd_scrs_scaled = cross_val_score(
    sgd_clf,
    X_train_scaled,
    y=y_train,
    scoring="accuracy",
    cv=3,
)
print("(SGD scaled)", sgd_scrs_scaled)

In [None]:
# Wrap an SGDClassifier in OneVsOneClassifier to force a one-vs-one multiclass
# strategy instead of whatever strategy the bare classifier would pick.
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))
ovo_clf.fit(X_train, y_train)

# Only the last expression of a cell is echoed, so the sample prediction has to
# be printed explicitly or it is never shown. Print it next to the true label.
print("(OvO) predicted:", ovo_clf.predict([X_train[1]]), "actual:", y_train[1])

# Number of underlying binary estimators that were trained.
len(ovo_clf.estimators_)

In [None]:
# Illustration: which digits does the SGD classifier (on scaled inputs) confuse?
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)

# Divide every row by its total so each entry becomes a fraction of that row
# rather than a raw count. The explicit float cast keeps this a true division
# regardless of the count dtype. The diagonal (correct predictions) is then
# zeroed so that only the errors stand out in the plot.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx.astype(np.float64) / row_sums
np.fill_diagonal(norm_conf_mx, 0)

# Build both panels with a single subplots() call. Calling plt.subplots() and
# then plt.subplot(1, 2, k) would first create a full-figure axes and then
# layer the two panels on top of it.
fig, (ax_counts, ax_errors) = plt.subplots(1, 2, figsize=(10, 10))
ax_counts.matshow(conf_mx, cmap=plt.cm.gray)
ax_counts.set_title("confusion matrix (counts)")
ax_errors.matshow(norm_conf_mx, cmap=plt.cm.gray)
ax_errors.set_title("row-normalised errors (diagonal zeroed)")
plt.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

# Multilabel target: two boolean labels per image,
#   column 0 -> label is >= 7
#   column 1 -> label is odd
y_train_large = y_train >= 7
y_train_odd = (y_train % 2) == 1
y_multilabel = np.column_stack((y_train_large, y_train_odd))

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

# Cross-validated predictions for both labels, then the macro-averaged F1
# score as the cell's displayed result.
y_train_knn_pred = cross_val_predict(
    knn_clf,
    X_train,
    y_multilabel,
    cv=3,
    n_jobs=-1,
)
f1_score(y_multilabel, y_train_knn_pred, average="macro")


In [None]:
# Multioutput-multiclass classification: build a noisy copy of the images and
# train a model to predict the clean image (one output per pixel) from the
# noisy one.

# Seeded generator so the noisy datasets, and therefore the figure below, are
# identical on every run. The noise shape is taken from the data instead of
# hard-coding the number of pixels.
noise_rng = np.random.RandomState(42)

noise = noise_rng.randint(0, 100, X_train.shape)
X_train_mod = X_train + noise
noise = noise_rng.randint(0, 100, X_test.shape)
X_test_mod = X_test + noise
# Targets are the original, noise-free images.
y_train_mod = X_train
y_test_mod = X_test

# Training. This refits the existing knn_clf object, replacing whatever it was
# previously fitted on.
knn_clf.fit(X_train_mod, y_train_mod)

# Pick one training image and "clean" its noisy version with the model.
digit_index = 10
rand_digit = X_train[digit_index]
noisy_rand_digit = X_train_mod[digit_index]
cl_d = knn_clf.predict([noisy_rand_digit])

# Show original, noisy and cleaned images side by side, left to right, each
# reshaped to a 28x28 image.
fig, (ax_original, ax_noisy, ax_cleaned) = plt.subplots(1, 3)

ax_original.imshow(rand_digit.reshape(28, 28))
ax_original.set_title("original digit")

ax_noisy.imshow(noisy_rand_digit.reshape(28, 28))
ax_noisy.set_title("noisy digit")

ax_cleaned.imshow(cl_d.reshape(28, 28))
ax_cleaned.set_title("cleaned digit")

plt.show()


