# Perceptrons
You should build an end-to-end machine learning pipeline using a perceptron model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Build an end-to-end machine learning pipeline, including a [perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html) model.
- Optimize your pipeline by validating your design decisions.
- Test the best pipeline on the test set and report various [evaluation metrics](https://scikit-learn.org/0.15/modules/model_evaluation.html).  
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
df = pd.read_csv('https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv')
df.head()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(df_train.shape)
print(df_test.shape)

(3200, 786)
(800, 786)


In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 786 entries, id to pixel784
dtypes: int64(786)
memory usage: 24.0 MB


In [9]:
df_train.dtypes

Unnamed: 0,0
id,int64
class,int64
pixel1,int64
pixel2,int64
pixel3,int64
...,...
pixel780,int64
pixel781,int64
pixel782,int64
pixel783,int64


In [10]:
df.describe()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,34415.17925,4.4395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07675,0.01525,0.013,0.0015,0.0,0.0,0.0,0.0,0.0,0.0
std,20508.890104,2.879655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.616022,0.964495,0.822192,0.094868,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16575.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34435.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,52111.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.drop("id",axis = 1, inplace = True)
df

Unnamed: 0,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3996,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3998,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df_train.isnull().sum().value_counts()


Unnamed: 0,count
0,786


In [15]:
df_train.duplicated()

Unnamed: 0,0
3994,False
423,False
2991,False
1221,False
506,False
...,...
1130,False
1294,False
860,False
3507,False


In [20]:
X_train = df_train.drop('class', axis=1)
y_train = df_train['class']
X_test = df_test.drop('class', axis=1)
y_test = df_test['class']
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3200, 785)
(3200,)
(800, 785)
(800,)


In [32]:
Perceptron(random_state=42)

In [33]:
y1 = Perceptron(random_state=42)
y1.fit(X_train, y_train)

In [34]:
acc1 = y1.score(X_test, y_test)
print(acc1)

0.7575


In [27]:
Perceptron


In [31]:
Pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('perceptron', Perceptron())

])

In [35]:
Pipeline.fit(X_train, y_train)

In [36]:
Y_pred = Pipeline.predict(X_test)

In [37]:
print(accuracy_score(y_test, Y_pred))
print(confusion_matrix(y_test, Y_pred))
print(classification_report(y_test, Y_pred))

0.8325
[[68  0  0  0  0  1  0  0  1  0]
 [ 0 93  1  1  1  1  0  0  3  0]
 [ 0  2 57  2  2  0  5  1  3  1]
 [ 0  0  5 66  0 10  1  0  3  1]
 [ 0  0  0  0 72  0  3  3  2  0]
 [ 0  0  2  4  0 50  1  0  4  3]
 [ 0  0  4  0  2  3 78  0  1  2]
 [ 0  0  1  0  1  1  0 63  0  1]
 [ 1  3  3  4  1 10  1  1 69  1]
 [ 0  0  5  1  4  3  0  8  5 50]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        70
           1       0.95      0.93      0.94       100
           2       0.73      0.78      0.75        73
           3       0.85      0.77      0.80        86
           4       0.87      0.90      0.88        80
           5       0.63      0.78      0.70        64
           6       0.88      0.87      0.87        90
           7       0.83      0.94      0.88        67
           8       0.76      0.73      0.75        94
           9       0.85      0.66      0.74        76

    accuracy                           0.83       800
   macro avg