<a href="https://colab.research.google.com/github/ARKA1112/HOML_AurelienG/blob/main/chap3_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Classification


In [None]:
#import the mnist dataset

from sklearn.datasets import fetch_openml, load_files
#mnist = fetch_openml('mnist_784', cache=True)

In [None]:
#to use the cached data
import os

from joblib import Memory

# joblib does not expand "~" on its own: passing the raw string would create a
# directory literally named "~" under the current working directory.
memory = Memory(os.path.expanduser('~/scikit_learn_data'))
# Disk-memoised wrapper around fetch_openml: repeated calls with the same
# arguments are served from the joblib cache.
fetch_openml_cached = memory.cache(fetch_openml)

In [None]:
mnist = fetch_openml_cached('mnist_784')

In [None]:
mnist.data.shape

In [None]:
#separating the features and the target
X_, y_ = mnist['data'], mnist['target']

In [None]:
X = X_.to_numpy()

In [None]:
X

In [None]:
X[0].reshape(28,28)

In [None]:
#Each row has 784 features, i.e. one per pixel of a 28x28 image
#Hence we need to reshape a row to 28x28 to display it as an image

import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)

plt.imshow(some_digit_image, cmap='binary')
plt.axis("off")
plt.show()

In [None]:
y_[0]   #The data label says us so

In [None]:
#since the label is not a number but a string
#we will convert it to a number
import numpy as np
y = y_.astype(np.uint8)

In [None]:
#Create a training and the test dataset 
#Data is already split into 60000,10000

In [None]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000],y[60000:]

#### Training a binary classifier

In [None]:
#at first we will identify only 5

y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [None]:
y_test_5

#### Implementing SGD

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
#now comes the prediction

sgd_clf.predict([some_digit])

#### Performance Measures

In [None]:
##Measuring Accuracy Using Cross-Validation

In [None]:
### Implementing Cross Validation


#same as using cross_val_score

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Stratified splitting: each fold is built from both the features and the
# 5 / not-5 target, shuffled with a fixed seed so the split is repeatable.
skfolds = StratifiedKFold(n_splits = 3, random_state = 42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fresh, unfitted copy of the classifier for every fold
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of correct predictions (no Python-level loop over the mask)
    n_correct = (y_pred == y_test_fold).sum()
    # Accuracy on the held-out fold
    print(n_correct / len(y_pred))
    

In [None]:
### Now using cross val score

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring='accuracy')

In [None]:
#The accuracy (ratio of correct predictions) is well above 93%, which looks impressive at first sight

In [None]:
#So let's build a dumb classifier that classifies every single image as "not 5"

In [None]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline classifier that labels every instance as "not 5".

    It learns nothing from the data; it only exists to show how high plain
    accuracy can be on a skewed dataset.
    """

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning ``self`` follows the scikit-learn estimator convention, so
        calls such as ``clf.fit(X, y).predict(X)`` work.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean column of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)

In [None]:
np.zeros((len(range(12)),1),dtype='bool')
y_train_5

In [None]:
X_test_fold.flatten()

In [None]:
n5c = Never5Classifier()
n5c.fit(X_train,y_train_5)
# Pass the fold as a 2-D (images x pixels) array: flattening it first would make
# predict() return one row per *pixel* instead of one per image.
# The total is the number of images predicted as 5, which is always 0 here.
n5c.predict(X_test_fold).sum()


In [None]:
never_5_clf = Never5Classifier()

In [None]:
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')

In [None]:
#Because only about 10% of the images are 5s, always guessing "not 5" is right about 90% of the time

In [None]:
#This is why accuracy metrics are not preferred in classification methods
#instead confusion matrix is used

#### Confusion Matrix

In [None]:
#To compute the confusion matrix you need a set of predictions so that they can
#be compared to the actual targets. You could make predictions on the test set, but let's keep it untouched for now and use cross_val_predict() on the training set instead.

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

In [None]:
y_train_pred

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_train_5, y_train_pred)

In [None]:
#rows are the actual classes, columns the predicted classes: [[TN, FP], [FN, TP]]
#TN: non-5 identified as non-5
#FP: non-5 identified as 5
#FN: 5 identified as non-5
#TP: 5 identified as 5


#One would ideally want the FN and the FP to be zero
#TN/(TN + FP) is specificity

#precision = TP/(TP + FP)

#and recall (also called sensitivity or true positive rate) is the ratio of positive instances that are correctly detected

#recall = TP/(TP + FN)

![Screenshot from 2022-10-21 18-11-47.png](attachment:cc07a378-bc3b-42af-8fa8-e20e17607557.png)

In [None]:
#calculating precision, recall and specificity

from sklearn.metrics import confusion_matrix, precision_score, recall_score

print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))
# Specificity = TN / (TN + FP). Both counts sit in the first row of the
# confusion matrix (the actual negatives), so read them from there instead of
# hard-coding numbers copied from an earlier run.
tn, fp = confusion_matrix(y_train_5, y_train_pred)[0]
print(f'specificity {tn/(tn+fp)}')

In [None]:
#It is often convenient to combine precision and recall into a single metric 
#called F1_score in particular if you need a simple way to compare two classifiers

#f1score is the harmonic mean of the precision and recall and gives much more weight
#to the lower values hence f1 score is only high when precision and recall both
#are high

In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

![Screenshot from 2022-10-21 19-13-51.png](attachment:0871fcfa-f2d2-46a5-b59c-b3a45025d271.png)

#### Precision/Recall Trade-off

In [None]:
#SGD classifier computes a score based on a decision function for each instance
#if that score is above a threshold
#SGD assigns the instance to the positive class, otherwise to the negative class



#to compute that one can call the decision function 

y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
#raising the threshold to 5000
threshold  = 5000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred   #assigns false for the threshold is above the score

In [None]:
#How do you decide which threshold to use
#lets return the decision scores instead of the predictions

y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,method='decision_function')

In [None]:
y_scores

In [None]:
len(y_scores)

In [None]:
#with these scores use the precision_recall_curve() function to compute precision and recall for all
#possible thresholds

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    ``precisions`` and ``recalls`` are expected to hold one more element than
    ``thresholds`` (their last entry is dropped before plotting), which is the
    layout returned by ``precision_recall_curve``.

    The red guide lines and dots highlight a threshold of 5000.
    NOTE(review): the marker heights (about 0.915 and 0.4) are hard-coded and
    look hand-tuned to one particular run - confirm they still match the curves.
    """
    plt.plot(thresholds, precisions[:-1],'b--',label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.xlabel('Threshold')
    plt.legend()
    plt.xlim([-50000,50000])
    plt.grid(linestyle='--')
    # Vertical guide at the highlighted threshold, horizontal guides at the two curves
    plt.axvline(5000,ymax=0.88, color='red', linestyle=':')
    plt.axhline(0.915,xmax=0.56, color='red', linestyle=':')
    plt.axhline(0.4,xmax=0.55, color='red', linestyle=':')
    plt.scatter(5000,0.917,color='red')
    plt.scatter(5000,0.4,color='red')
    plt.show()

In [None]:
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

In [None]:
recalls.shape, precisions.shape, thresholds.shape

In [None]:
#another way to assess is to plot precision directly against recall

In [None]:
# Recall goes on the x-axis and precision on the y-axis, so the data matches
# the axis labels and the annotations below.
plt.plot(recalls[:-1], precisions[:-1])
plt.xlim((0.1,1.0))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
# Guide lines and markers for the two highlighted operating points
plt.axvline(0.71,ymax=0.77, color='red',linestyle='--')
plt.axhline(0.81,xmax=0.67, color='red',linestyle='--')
plt.axvline(0.51,ymax=0.88, color='magenta',linestyle=':')
plt.axhline(0.91,xmax=0.45, color='magenta',linestyle=':')
plt.scatter(0.71,0.81, color='red')
plt.scatter(0.51,0.91, color='red')
plt.annotate(text='90% Precision',xy=(0.51,0.91),xytext=(0.52,0.95))
plt.annotate(text='80% Precision',xy=(0.71,0.81), xytext=(0.72, 0.85))

plt.show()

In [None]:
#To find the lowest threshold that gives at least 90% precision we can use np.argmax, which on a boolean array returns the index of the first True value
thresholds_90_percent = thresholds[np.argmax(precisions[:-1] >= 0.9)]
thresholds_90_percent

In [None]:
y_train_pred_90 = (y_scores >= thresholds_90_percent)

y_train_pred_90


In [None]:
# Only the last expression of a cell is displayed, so print the earlier metrics explicitly
print(precision_score(y_train_5, y_train_pred_90))
print(recall_score(y_train_5, y_train_pred_90))
f1_score(y_train_5, y_train_pred_90)

#### The ROC Curve (Receiver Operating Characteristics)

In [None]:
#similar to Precision vs Recall but plots TPR vs FPR
#FPR is the ratio of negatives that are falsely classified as positives
#FPR = FP/(FP + TN) = 1 - TNR
#TNR = TN/(TN + FP) (specificity)
#TPR = TP/(TP + FN) (sensitivity)/(recall)
#precision = TP/(TP + FP)

In [None]:
from sklearn.metrics import roc_curve
# Use a separate name so the `thresholds` returned by precision_recall_curve()
# earlier is not overwritten (cells that use it stay re-runnable).
fpr, tpr, thresholds_roc = roc_curve(y_train_5, y_scores)

In [None]:
def roc_plot(fpr, tpr, label=None):
    """Draw a ROC curve (TPR against FPR) on the current matplotlib figure.

    fpr, tpr -- x and y coordinates of the curve
    label    -- optional legend label for the curve

    A dashed black diagonal from (0, 0) to (1, 1) is drawn as a reference.
    The function does not call plt.show(); that is left to the caller.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, 'k--')  # reference diagonal
    plt.title("ROC Curve Plot")
    plt.ylabel("Recall/ True Positive Rate")
    plt.xlabel('False Positive Rate')
    plt.grid(linestyle='--')


roc_plot(fpr, tpr)
plt.show()

In [None]:
#The dotted line shows Roc curve of a purely random classifier
#A good classifier stays away from that line

In [None]:
#One way to compare classifiers is to measure the area under the curve (AUC) a perfect one will have AUC of 1 whereas a purely random one will have 0.5. sklearn provides a function to compute the roc auc

from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

In [None]:
#Prefer the precision/recall curve when the positive class is rare or when you care more about false positives than false negatives
#otherwise use the ROC curve

#### Train a RandomForestClassifier

###### Train a RandomForestClassifier and compare its ROC curve and ROC AUC with the SGDClassifier's

In [None]:
##Also the random forest classifier doesnt have decision_function() instead it has a predict_proba() method

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

forest_clf = RandomForestClassifier(random_state=42)

y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,method='predict_proba')

In [None]:
#The roc_curve() function expects labels and scores, but instead one can also provide class's probability

y_scores_forest = y_probas_forest[:,1]

fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

In [None]:
plt.grid()
plt.plot(fpr_forest, tpr_forest, label='Random Forest')
plt.plot(fpr,tpr,label='SGD',linestyle='--')
plt.legend(loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Random Forest and SGD plot")
plt.plot([0,1],[0,1],color='k',alpha=0.1)
plt.show()

In [None]:
y_scores_forest_bool = y_scores_forest > 0.5

In [None]:
y_scores_forest_bool

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score,confusion_matrix
roc_auc_score(y_train_5, y_scores_forest)

In [None]:
y_scores_forest_bool

In [None]:
confusion_matrix(y_train_5,y_scores_forest_bool)

In [None]:
print(precision_score(y_train_5, y_scores_forest_bool))
print(recall_score(y_train_5, y_scores_forest_bool))

#### Multiclass Classification

In [None]:
#Some algorithms (such as Support Vector classifiers) scale poorly with the size of the training set. For these algorithms OvO is preferred because it is faster to train many classifiers on small training sets than to train few  classifiers on large training sets. For most binary classification algorithms, however, OvR is preferred.

#SVC performs poorly on large datasets

#### Trying with SVC

In [None]:
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)  #y_train, not y_train_5
svm_clf.predict([some_digit])


In [None]:
some_digit_score = svm_clf.decision_function([some_digit])   #shows 10 scores for 0 to 9
#And the score with the highest value ie 9.313 is for the number five hence it predicts five

In [None]:
#To check which number it is

print(np.argmax(some_digit_score))
svm_clf.classes_

#### If you want to force OneVsRestClassifier to SVC

In [None]:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(X_train, y_train)
ovr_clf.predict([some_digit])

In [None]:
len(ovr_clf.estimators_)   #There are 10 estimators

#### Now using SGD classifier

# New Section

In [None]:
sgd_clf.fit(X_train, y_train)

In [None]:
sgd_clf.predict([some_digit])

In [None]:
sgd_clf.decision_function([some_digit])

In [None]:
image = some_digit.reshape(28,28)
plt.imshow(image, cmap='BuGn')
plt.axis('off')

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
# Seeded (same seed as the first SGDClassifier in this notebook) so the
# cross-validation results below are reproducible from run to run.
sgd_clf = SGDClassifier(random_state=42)

In [None]:
#Now testing its accuracy

cross_val_score(sgd_clf, X_train[:2000], y_train[:2000], cv=3, scoring='accuracy')

In [None]:
#Also by scaling the training data the accuracy can be improved

from sklearn.preprocessing import StandardScaler

scaler  = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))


In [None]:
y_train_pred = cross_val_score(sgd_clf, X_train_scaled[:2000], y_train[:2000], cv = 3, scoring='accuracy')

In [None]:
y_train_pred  #89% accuracy achieved

In [None]:

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled[:2000], y_train[:2000], cv = 3)

In [None]:
import numpy as np
np.savez_compressed('/content/y_train_pred_fille.npz', y_train_pred)

In [None]:
# The archive only holds a plain array of predicted labels, so pickle support is
# not needed; leaving allow_pickle at its default (False) avoids executing
# arbitrary code if the file were ever replaced.
f = np.load('/content/y_train_pred_fille.npz')

In [None]:
lst = f.files

In [None]:
for item in lst:
  print(item)
  f[item]

In [None]:
A = f[item]

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [None]:
conf_mx = confusion_matrix(y_train[:2000], y_train_pred)

In [None]:
conf_mx

In [None]:
#To make it more meaningful we will use matplotlib's matshow function
# plt.matshow() opens its own figure by default, so a preceding plt.figure(...)
# would only leave an empty canvas behind; draw on an explicitly sized Axes instead.
fig, ax = plt.subplots(figsize=(17,17))
ax.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)

In [None]:
norm_conf_mx = conf_mx / row_sums
norm_conf_mx

#### Now fill the diagonal with zeroes to keep only the errors

In [None]:
np.fill_diagonal(norm_conf_mx, 0)

In [None]:
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)

In [None]:
#Column 8 has many bright cells: columns are predicted classes, so this means many other digits have been misclassified as 8s

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# from_predictions expects (y_true, y_pred): true labels first, so that rows are
# the actual classes and columns the predicted ones.
conf_matrix_disp = ConfusionMatrixDisplay.from_predictions(y_train[:2000], y_train_pred)

In [None]:
#As in the above image we can clearly see that the number of 8 s that have been missclassified is pretty large

In [None]:
#Now we will show it in percentage

plt.rc('font', size=10)  #sets the fontsize for all
ConfusionMatrixDisplay.from_predictions(y_train[:2000],  y_train_pred, normalize='true', values_format=".0%")
plt.show()

In [None]:
#Now lets plot for which the predictions were erroneous

sample_weight = (y_train_pred != y_train[:2000])

ConfusionMatrixDisplay.from_predictions(y_train[:2000], y_train_pred, sample_weight=sample_weight,normalize='true',values_format=".0%")
plt.show()


#This makes clear many numbers were misclassified as 8s

In [None]:
#Lets put all the plots in a couple of figures for the book:

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(19,6))
plt.rc('font',size= 9)
ConfusionMatrixDisplay.from_predictions(y_train[:2000],y_train_pred[:2000], ax=axs[0])
axs[0].set_title("Confusion Matrix")
plt.rc('font', size=10)
ConfusionMatrixDisplay.from_predictions(y_train[:2000], y_train_pred[:2000], ax=axs[1], normalize='true', values_format='.0%')
axs[1].set_title('CM Normalized by row')
plt.show()

In [None]:
#Plot of errors by row

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(19,6))
plt.rc('font',size=9)
ConfusionMatrixDisplay.from_predictions(y_train[:2000], y_train_pred,ax = axs[0], sample_weight=sample_weight,normalize='true', values_format='.0%')
axs[0].set_title("errors normalized by row")

plt.rc('font', size=10)
ConfusionMatrixDisplay.from_predictions(y_train[:2000], y_train_pred, ax=axs[1], normalize='pred', sample_weight=sample_weight,values_format='.0%')
axs[1].set_title("Errors normalized by columns")

plt.show()
plt.rc('font', size=12)

In [None]:
X_train[y_train == 5]

In [None]:
cl_a, cl_b  = 3, 5
X_aa = X_train[:2000][(y_train[:2000] == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[:2000][(y_train[:2000] == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[:2000][(y_train[:2000] == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[:2000][(y_train[:2000] == cl_b) & (y_train_pred == cl_b)]
plt.matshow(X_bb[1].reshape(28,28), cmap='binary')
plt.axis('off')

In [None]:
#creating a figure

X_ba

In [None]:
plt.matshow(X_ba[:5*5])

In [None]:
# 2x2 grid of digit mosaics laid out like a confusion matrix for cl_a / cl_b:
# columns are the predicted class, rows the true class.
size = 5      # each mosaic shows at most size x size digits
pad = 0.2     # gap between neighbouring mosaics, in image units
plt.figure(figsize=(size, size))
for images, (label_col, label_row) in [(X_ba, (0, 0)), (X_bb, (1, 0)),(X_aa, (0, 1)), (X_ab, (1, 1))]:
    for idx, image_data in enumerate(images[:size*size]):
        # Named left/bottom (not x/y) so the notebook-level label variable `y` is not overwritten
        left = idx % size + label_col * (size + pad)
        bottom = idx // size + label_row * (size + pad)
        plt.imshow(image_data.reshape(28, 28), cmap="binary",
                   extent=(left, left + 1, bottom, bottom + 1))
plt.xticks([size / 2, size + pad + size / 2], [str(cl_a), str(cl_b)])
plt.yticks([size / 2, size + pad + size / 2], [str(cl_b), str(cl_a)])
# Dotted separators between the four mosaics
plt.plot([size + pad / 2, size + pad / 2], [0, 2 * size + pad], "k:")
plt.plot([0, 2 * size + pad], [size + pad / 2, size + pad / 2], "k:")
# imshow() resets the limits on every call, so fix the full extent at the end
plt.axis([0, 2 * size + pad, 0, 2 * size + pad])
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

#refer the handson_ml3 repo for more 

## Multilabel Classification

In [None]:
#a classification system that outputs multiple binary tags is called a multilabel classification system

In [None]:
np.c_?

In [None]:
from sklearn.neighbors import KNeighborsClassifier
#each digit gets two binary labels: is it large (7, 8 or 9) and is it odd
#np.c_ stacks the two boolean arrays column-wise
#hence a 5 is labelled [False, True] (not large, odd)
#so this is a multilabel problem: one binary output per label
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf  = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
knn_clf.predict([some_digit])

In [None]:
plt.matshow(some_digit.reshape(28,28))
plt.axis('off')

#since some_digit is a 5 (not large, odd) the expected prediction is [False, True]


In [None]:
#Evaluating the model

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)