In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing
# each one as a full path so it can be copied into a read call.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , Flatten

In [None]:
# Load the MNIST digits through Keras; the loader returns a
# (train, test) pair, each of which is an (images, labels) pair.
train_split, test_split = keras.datasets.mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Show the dimensions of the training image array.
X_train.shape

In [None]:
import matplotlib.pyplot as plt
# Display the first training sample as an image.
first_train_image = X_train[0]
plt.imshow(first_train_image)

In [None]:
# Display the training sample at index 2 as an image.
third_train_image = X_train[2]
plt.imshow(third_train_image)

### Now scale the pixel values from the 0–255 range down to 0–1

In [None]:
# Scale pixel intensities from [0, 255] down to [0, 1].
# The arrays are overwritten under the same names, so the division is
# guarded on the dtype: the raw images are integer-typed, and once scaled
# they become floats. Re-running this cell is therefore a no-op instead of
# dividing the already-scaled data by 255 a second time.
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train / 255
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test / 255

In [None]:
# Print the first training image's pixel values to check the result of the
# scaling step above.
X_train[0]

### MNIST images are shape (28, 28) (2D), but a Dense layer in Keras needs a 1D vector like (784,).

### Input layer -> 1st one
### Hidden layer -> 2nd one
### Output layer -> last one. **There are more than two output classes**, so we use ***activation = softmax***

✅ Flatten(input_shape=(28,28))
This converts each image from shape (28, 28) → (784,).

So yes, the input size of 784 is fixed because MNIST images are always 28×28 = 784 pixels.


✅ Dense(128, activation='relu')
This is your hidden layer, and the number 128 is chosen by you — it's a hyperparameter.

You could choose 64, 256, 512, etc.

More neurons = more capacity to learn, but also higher risk of overfitting.


✅ Dense(10, activation='softmax')
The number 10 is fixed because MNIST has 10 classes: digits 0 to 9.

The softmax function gives a probability distribution over these 10 classes.



In [None]:
# Baseline network: flatten each 28x28 image to a vector, pass it through
# one 128-unit ReLU hidden layer, and emit a 10-way softmax.
model = Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax'),
])

### 🧠 Parameter Calculation for the Model

#### 📐 Layer 1: Flatten → Dense(128)

- Input size after flattening: 784  
- Hidden layer size: 128  
- Each hidden neuron connects to all 784 inputs  
- Each hidden neuron has 1 bias

**Calculation:**

- Weights = 784 × 128 = 100,352  
- Biases = 128  
- **Total parameters = 100,352 + 128 = 100,480**

---

#### 📐 Layer 2: Dense(128) → Dense(10)

- Hidden layer size: 128  
- Output layer size: 10  
- Each output neuron connects to all 128 hidden neurons  
- Each output neuron has 1 bias

**Calculation:**

- Weights = 128 × 10 = 1,280  
- Biases = 10  
- **Total parameters = 1,280 + 10 = 1,290**

---

### ✅ Total Trainable Parameters

**100,480 + 1,290 = 101,770**

You can verify this in your code using:

```python
model.summary()
```


In [None]:
# Print the layer-by-layer summary of the baseline model, including
# parameter counts.
model.summary()

### 🧠 Why `sparse_categorical_crossentropy`?

We use it because MNIST labels are integers (0–9), not one-hot encoded.

- `sparse_categorical_crossentropy` handles integer labels directly.
- Saves memory and avoids using `to_categorical`.

```python
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
```


In [None]:
# Configure training for the baseline model: cross-entropy over integer
# class labels, optimized with Adam.
# `metrics=['accuracy']` is required for fit() to report accuracy and
# val_accuracy each epoch; without it only the loss is logged.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline model for 10 epochs, holding out 20% of the training
# samples as a validation set.
model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=10,
)

In [None]:
# Preview the raw softmax output for the first five test images only.
# The full test-set prediction is computed and stored in the next cell, so
# running inference over the whole test set here would just duplicate it.
model.predict(X_test[:5])

In [None]:
# Class probabilities from the baseline model for every test image: one row
# per image, one column per output unit of the 10-way softmax layer.
y_prob = model.predict(X_test)

#### 👉 What `y_prob.argmax(axis=1)` means
It finds the index of the maximum value along `axis=1` (i.e., across each row), which gives the predicted class for each sample.



```
 y_prob = [
    [0.1, 0.3, 0.05, 0.55, 0.0, ..., 0.0],  # sample 1
    [0.0, 0.0, 0.98, 0.02, 0.0, ..., 0.0],  # sample 2
]

y_pred = y_prob.argmax(axis=1)
# Output: [3, 2]
```
> 🔹 3 is the class with highest prob for sample 1
> 🔹 2 is the class with highest prob for sample 2

In [None]:
# Index of the largest probability in each row = predicted digit per image.
np.argmax(y_prob, axis=1)

In [None]:
# Collapse the baseline model's probability rows into predicted class labels.
y_pred = np.argmax(y_prob, axis=1)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test images whose predicted digit matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

### Now let's try to improve the accuracy score by adding one more hidden layer

In [None]:
# Deeper variant of the baseline: same flatten + 128-unit ReLU layer,
# followed by an extra 32-unit ReLU hidden layer before the 10-way softmax.
model_new = Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(10, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the two-hidden-layer model.
model_new.summary()

In [None]:
# Configure training for the deeper model: integer-label cross-entropy,
# Adam optimizer, and accuracy tracked alongside the loss.
model_new.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the deeper model for 25 epochs with a 20% validation hold-out and
# keep the returned History object so the curves can be plotted later.
history = model_new.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=25,
)

In [None]:
# Class probabilities from the two-hidden-layer model for every test image.
y_prob_new = model_new.predict(X_test)

In [None]:
# Predicted digit per test image according to the deeper model.
np.argmax(y_prob_new, axis=1)

In [None]:
# Predicted labels for the two-hidden-layer model.
# These must come from `y_prob_new` (the output of model_new); taking the
# argmax of `y_prob` would silently re-score the baseline model instead.
y_pred_new = y_prob_new.argmax(axis=1)

In [None]:
# Test-set accuracy computed from the `y_pred_new` labels.
accuracy_score(y_true=y_test, y_pred=y_pred_new)

### We can see that after some epochs the `val_loss` starts to increase
### > This is a sign of overfitting

In [None]:
# Training vs. validation loss per epoch for the deeper model.
# Each curve is labelled and a legend is drawn so the two lines can be told
# apart; a widening gap between them is what indicates overfitting.
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Val_Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

### Overfitting

In [None]:
# Training vs. validation accuracy per epoch, one labelled curve per metric.
accuracy_curves = (
    ('accuracy', 'Accuracy'),
    ('val_accuracy', 'Val_Accuracy'),
)
for metric_key, curve_label in accuracy_curves:
    plt.plot(history.history[metric_key], label=curve_label)
plt.legend()

In [None]:
# Display the first test image, which is the sample predicted on below.
first_test_image = X_test[0]
plt.imshow(first_test_image)

### Now let's make a prediction with our trained model

### 🤔 Why Use `reshape(1, 28, 28)` and `argmax(axis=1)`?

---

#### 🧱 1. Reshaping the Input

When predicting a single image:

```python
X_test[0]  # shape: (28, 28)
```

But the model expects a **batch**, even if it contains just one image.
So we reshape it to:

```python
X_test[0].reshape(1, 28, 28)  # shape: (1, 28, 28)
```

This makes it a **batch of 1 sample**, which the model accepts.

---

#### 🧠 2. Why Use `argmax(axis=1)`

After prediction:

```python
y_prob = model.predict(X_test[0].reshape(1, 28, 28))
```

You get output like:

```python
[[0.01, 0.02, 0.85, 0.03, 0.04, ..., 0.01]]
```

Each number is the **probability for a class (0–9)**.

We use:

```python
y_prob.argmax(axis=1)
```

To get the **index of the highest probability** — that is, the **predicted class**.

---

#### ✅ Final Code:

```python
model.predict(X_test[0].reshape(1, 28, 28)).argmax(axis=1)
```

This gives you the **predicted digit** (e.g., `2`) for that test image.


In [None]:
# Wrap the single 28x28 test image in a batch of one, run it through the
# baseline model, and take the index of the highest probability as the
# predicted digit.
single_image_batch = X_test[0].reshape(1, 28, 28)
model.predict(single_image_batch).argmax(axis=1)