In [0]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,precision_recall_curve,roc_curve,roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [0]:
mnist = pd.read_csv("https://raw.githubusercontent.com/cerndb/dist-keras/master/examples/data/mnist.csv",sep=',')

In [0]:
mnist_train,mnist_test = train_test_split(mnist,test_size = 0.2)

In [0]:
mnist_train_data = mnist_train.drop("label",axis=1)
mnist_train_label = mnist_train["label"].copy()
mnist_test_data = mnist_test.drop("label",axis=1)
mnist_test_label = mnist_test["label"].copy()

In [0]:
digit = mnist_train_data.iloc[10008:10009,:]
digit = digit.values.reshape(28,28)
plt.imshow(digit,cmap=matplotlib.cm.binary,interpolation="nearest")
plt.axis("off")
plt.show()

In [8]:
shuffle_index = np.random.permutation(33600)
mnist_train_data, mnist_train_label = mnist_train_data[shuffle_index], mnist_train_label[shuffle_index]

KeyError: ignored

In [0]:
mnist_train_label_5 = (mnist_train_label == 5)
mnist_test_label_5 = (mnist_test_label == 5)

In [0]:
sgd = SGDClassifier(random_state = 42)
sgd.fit(mnist_train_data,mnist_train_label_5)

In [0]:
sgd.predict(mnist_test_data)

In [0]:
cross_val_score(sgd,mnist_train_data,mnist_train_label_5,cv=3,scoring='accuracy')

In [0]:
predict=  cross_val_predict(sgd,mnist_train_data,mnist_train_label_5,cv=3)

In [0]:
cm = confusion_matrix(mnist_train_label_5,predict)

In [0]:
precesion = (cm[1,1]/(cm[0,1]+cm[1,1]))*100

In [0]:
 tpr =  (cm[1,1]/(cm[1,1]+cm[1,0]))*100

In [0]:
precision_score(mnist_train_label_5,predict)

In [0]:
recall_score(mnist_train_label_5,predict)

In [0]:
f1_score(mnist_train_label_5,predict)

In [0]:
y_score = sgd.decision_function(mnist_test_data)

In [0]:
y_score =  cross_val_predict(sgd,mnist_train_data,mnist_train_label_5,cv=3,method="decision_function")

In [0]:
precisions,recalls,thresholds = precision_recall_curve(mnist_train_label_5,y_score)

In [0]:
plt.plot(thresholds,precisions[:-1],"b--",label="precision")
plt.plot(thresholds,recalls[:-1],"g-",label="recall")
plt.xlabel("Threshold")
plt.legend(loc="upper left")
plt.ylim([0,1])

In [0]:
Y_train_90 = (y_score>70000)

In [0]:
precision_score(mnist_train_label_5,Y_train_90)

In [0]:
recall_score(mnist_train_label_5,Y_train_90)

In [0]:
fpr,tpr,thresholds = roc_curve(mnist_train_label_5,y_score)

In [0]:
plt.plot(fpr,tpr,linewidth=2,label = "tpr")
plt.plot([0,1],[0,1],'k--')
plt.axis([0,1,0,1])
plt.xlabel('False positive')
plt.xlabel('True positive')

In [0]:
roc_auc_score(mnist_train_label_5,y_score)

random forest

In [0]:
forest = RandomForestClassifier(random_state=42)

In [0]:
y_forest = cross_val_predict(forest,mnist_train_data,mnist_train_label_5,cv=3,method="predict_proba")

In [0]:
y_forest_scores =y_forest[:,1]

In [0]:
fpr_forest,tpr_forest,thresholds_forest = roc_curve(mnist_train_label_5,y_forest_scores)

In [0]:
plt.plot(fpr_forest,tpr_forest,linewidth=2,label = "tpr")
plt.plot([0,1],[0,1],'k--')
plt.axis([0,1,0,1])
plt.xlabel('False positive')
plt.xlabel('True positive')

multiclass sgd

In [0]:
sgd.fit(mnist_train_data,mnist_train_label)

In [0]:
predicted = sgd.predict(mnist_test_data)

In [0]:
confusion_matrix(mnist_test_label,predicted)

In [0]:
predicted_decision = sgd.decision_function(mnist_test_data)

In [0]:
sgd.classes_

In [0]:
ovo = OneVsOneClassifier(SGDClassifier(random_state = 42))

In [0]:
ovo.fit(mnist_train_data,mnist_train_label)

In [0]:
ovo.predict(mnist_test_data)

In [0]:
len(ovo.estimators_)

multiclass random forest

In [0]:
forest.fit(mnist_train_data,mnist_train_label)

In [0]:
forest.predict_proba(mnist_test_data[5:6])

In [0]:
Scaler = StandardScaler()
mnist_train_scaled = Scaler.fit_transform(mnist_train_data.astype(np.float64))
cross_val_score(sgd,mnist_train_scaled,mnist_train_label,cv=3,scoring="accuracy")

In [0]:
mnist_train_predict = cross_val_predict(sgd,mnist_train_scaled,mnist_train_label,cv=3)

In [0]:
conf_mat =confusion_matrix(mnist_train_label,mnist_train_predict)

In [0]:
plt.matshow(conf_mat,cmap=plt.cm.gray)

In [0]:
norm_cm = conf_mat/(conf_mat.sum(axis=1,keepdims=True))
np.fill_diagonal(norm_cm,0)
plt.matshow(norm_cm,cmap=plt.cm.gray)

multi label classification

In [0]:
mnist_train_large = (mnist_train_label <=7)
mnist_train_odd = (mnist_train_label % 2 == 1)
mnist_multilabel = np.c_[mnist_train_large,mnist_train_odd]

In [0]:
knn_clf = KNeighborsClassifier()

In [0]:
knn_clf.fit(mnist_train_data,mnist_multilabel)

In [0]:
knn_clf.predict(mnist_test_data)

In [0]:
mnist_train_knn_pred = cross_val_predict(knn_clf,mnist_train_data,mnist_train_label,cv=3)

In [0]:
f1_score = (mnist_train_label,mnist_train_knn_pred,average="macro")

multioutput classification

In [0]:
noise =np.random.randint(0, 100, (len(mnist_train_data), 784))
X_train_mod = mnist_train_data + noise
noise =np.random.randint(0, 100, (len(mnist_test_data), 784))
X_test_mod = mnist_test_data + noise
y_train_mod = mnist_train_label
y_test_mod = mnist_test_label

In [0]:
knn_clf.fit(X_train_mod,y_train_mod)

digit = knn_clf.predict(X_test_mod.iloc[400:401,:])
digit = digit.values.reshape(28,28)
plt.imshow(digit,cmap=matplotlib.cm.binary,interpolation="nearest")
plt.axis("off")
plt.show()