In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
os.getcwd()

'/kaggle/working'

### Load and preprocess data

In [3]:
data = pd.read_csv("../input/digit-recognizer/train.csv")
test_data = pd.read_csv("../input/digit-recognizer/test.csv")

In [4]:
X = data.drop("label", axis=1).values
y = data["label"].values

In [5]:
X = X / 255.0
test_data = test_data / 255.0

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# One-hot encode the labels
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))



In [8]:
# Dimensions for X and y
num_samples, num_features = X_train.shape
num_classes = y_train_encoded.shape[1]

### Initialize Weights and Biases

In [9]:
W = np.random.randn(num_features, num_classes) * 0.01  # Small random weights
b = np.zeros((1, num_classes))  # Bias is initially zero

### Define sigmoid function and cost function (Binary Cross-Entropy)

In [10]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [11]:
def compute_cost(A, y):
    m = y.shape[0]
    cost = -1/m * np.sum(y * np.log(A) + (1 - y) * np.log(1 - A))
    return cost

### Implement Gradient Descent

In [12]:
def gradient_descent(X, y, W, b, learning_rate=0.01, num_iterations=1000):
    m = X.shape[0]
    
    for i in range(num_iterations):
        # Forward propagation
        Z = np.dot(X, W) + b
        A = softmax(Z)

        cost = compute_cost(A, y)
        
        # Backward propagation
        dZ = A - y
        dW = np.dot(X.T, dZ) / m
        db = np.sum(dZ) / m
        
        W -= learning_rate * dW
        b -= learning_rate * db
        
        if i % 100 == 0:
            print(f"Cost after iteration {i}: {cost:.4f}")
    
    return W, b

### Training model

In [13]:
# Set hyperparameters
learning_rate = 0.2
num_iterations = 2000

# Train the model
W_trained, b_trained = gradient_descent(X_train, y_train_encoded, W, b, learning_rate, num_iterations)

Cost after iteration 0: 3.2560
Cost after iteration 100: 0.8148
Cost after iteration 200: 0.6864
Cost after iteration 300: 0.6313
Cost after iteration 400: 0.5988
Cost after iteration 500: 0.5766
Cost after iteration 600: 0.5602
Cost after iteration 700: 0.5473
Cost after iteration 800: 0.5370
Cost after iteration 900: 0.5283
Cost after iteration 1000: 0.5210
Cost after iteration 1100: 0.5146
Cost after iteration 1200: 0.5091
Cost after iteration 1300: 0.5041
Cost after iteration 1400: 0.4997
Cost after iteration 1500: 0.4956
Cost after iteration 1600: 0.4920
Cost after iteration 1700: 0.4886
Cost after iteration 1800: 0.4855
Cost after iteration 1900: 0.4827


### Test model

In [14]:
def predict(X, W, b):
    Z = np.dot(X, W) + b
    A = softmax(Z)
    return np.argmax(A, axis=1)

In [20]:
y_pred = predict(X_test, W_trained, b_trained)

In [21]:
accuracy = np.mean(y_pred == y_test)
print(f"Test accuracy: {accuracy:.4f}")

Test accuracy: 0.9154


### Make submission

In [22]:
results = predict(test_data, W_trained, b_trained)

In [23]:
submission = pd.DataFrame(results)
submission.index.name='ImageId'
submission.index+=1
submission.columns=['Label']

In [24]:
submission.to_csv('submission.csv', header=True)