In [3]:
import pandas as pd

# Locations of the competition CSV files
TRAIN_CSV = "/kaggle/input/digit-rec/train.csv"
TEST_CSV = "/kaggle/input/digit-rec/test.csv"

# Read both files into DataFrames
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Preview the first five training rows
train_data.head()


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We'll check the distribution of labels in the training data to ensure that there's no significant imbalance. This is important because an imbalanced dataset might bias the model towards the class that has more samples.

In [4]:
# Count how many training samples there are per digit, ordered by digit
label_counts = train_data["label"].value_counts().sort_index()
label_counts


0    4132
1    4684
2    4177
3    4351
4    4072
5    3795
6    4137
7    4401
8    4063
9    4188
Name: label, dtype: int64

The distribution of labels in the training data seems quite balanced. There's no significant discrepancy in the number of samples for each digit, which is good for training our model.

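The next step is to preprocess the data: scale the pixel values to the range [0, 1], reshape each row of 784 pixels into a 28x28x1 image, one-hot encode the labels, and hold out 20% of the training data as a validation set.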

For this problem, a Convolutional Neural Network (CNN) would be a good choice, given its strong performance on image classification tasks.

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Preprocess into NEW arrays instead of overwriting `train_data` / `test_data`.
# Mutating the frames in place made this cell non-idempotent (a second run
# would divide the pixels by 255 again), and assigning floats into the integer
# pixel columns via `.iloc[...] = ...` triggers a pandas warning.

# Normalize the pixel values to [0, 1]; column 0 of train_data is the label
train_pixels = train_data.iloc[:, 1:].to_numpy() / 255.0
test_pixels = test_data.to_numpy() / 255.0

# Reshape each flat row of pixels into a 28x28 single-channel image
train_images = train_pixels.reshape(-1, 28, 28, 1)
test_images = test_pixels.reshape(-1, 28, 28, 1)

# One-hot encode the labels. The encoder's default sparse result is densified
# with .toarray() rather than requested via `sparse=False`: that keyword was
# renamed to `sparse_output` and later removed in newer scikit-learn releases.
encoder = OneHotEncoder()
train_labels = encoder.fit_transform(
    train_data['label'].values.reshape(-1, 1)
).toarray()

# Split the training data into a training set and a validation set
# (80/20, with a fixed seed so the split is reproducible)
train_images, val_images, train_labels, val_labels = train_test_split(
    train_images, train_labels, test_size=0.2, random_state=42
)

train_images.shape, train_labels.shape, val_images.shape, val_labels.shape


  train_data.iloc[:, 1:] = train_data.iloc[:, 1:] / 255.0


((33600, 28, 28, 1), (33600, 10), (8400, 28, 28, 1), (8400, 10))

Let's proceed to building and training the model.

We'll use Keras to construct a Convolutional Neural Network (CNN). This type of network is commonly used in image classification tasks and has shown great performance.

Here's a simple CNN architecture to start with:

1. Convolutional layer with 32 filters, a 3x3 kernel, and 'relu' activation function.
2. MaxPooling layer with 2x2 pool size.
3. Convolutional layer with 64 filters, a 3x3 kernel, and 'relu' activation function.
4. MaxPooling layer with 2x2 pool size.
5. Flatten layer to transform the 2D data to 1D.
6. Dense layer (fully connected) with 256 units and 'relu' activation function.
7. Output Dense layer with 10 units (for the 10 classes) and 'softmax' activation function.

We'll compile the model with the 'categorical_crossentropy' loss function, which is suitable for multi-class classification with one-hot encoded labels, and the 'adam' optimizer, which is a commonly used default.

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Declare the whole layer stack up front: two convolution/pooling stages
# followed by a fully connected classifier head with a 10-way softmax output
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(10, activation='softmax'),
])

# Configure training: Adam optimizer, categorical cross-entropy loss,
# and accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview
model.summary()


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 5, 5, 64)         0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 256)               4

In [7]:
# Training hyperparameters
EPOCHS = 10
BATCH_SIZE = 32

# Fit on the training split, passing the held-out split as validation data
history = model.fit(
    x=train_images,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_images, val_labels),
)

# Write the trained model to disk so it can be reloaded later
model.save("digit_recognizer_model.h5")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Once the model is trained, we can evaluate it on the validation data and make predictions on the test data.

In [14]:
# Score the trained model on the held-out validation split
val_loss, val_acc = model.evaluate(x=val_images, y=val_labels)

# Run the model on the test images to get its output for each one
predictions = model.predict(x=test_images)




The predictions array will contain the predicted probabilities of each class (digit) for each image in the test set. To get the predicted class, you take the index of the maximum probability:

In [18]:
# Pick, for every row, the column index holding the largest value
predicted_labels = np.asarray(predictions).argmax(axis=1)

Creating a submission file

In [19]:
# ImageIds are 1-based and follow the order of the test images
image_ids = range(1, len(test_images) + 1)

# Pair every ImageId with its predicted label
submission_columns = {"ImageId": image_ids, "Label": predicted_labels}
submission = pd.DataFrame(submission_columns)

# Write the table to CSV, leaving out the DataFrame index
submission.to_csv("submission.csv", index=False)
