In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Abstract

This notebook uses the TMNIST (Typography MNIST) Glyphs dataset, a collection of over 500,000 MNIST-style images covering 2,990 different font styles and 1,812 distinct glyphs.

The csv file is as follows:

1. Column headers in the first row are ['font_name', 'glyph_name', 'label', '1', '2', ..., '784'].
2. 'Acme-Regular' and 'ZillaSlab-Bold' are only a couple of the font file names in the 'font_name' column.
3. The unicodedata name for the glyph, such as "LATIN CAPITAL LETTER A" and "DEVANAGARI LETTER AA," is found in the "glyph_name" column.
4. For glyphs that are represented by more than one Unicode character, the 'glyph_name' column provides the names of both characters, joined together with a '+' symbol. For instance, such a glyph can have the glyph_name "DEVANAGARI SIGN ANUSVARA + DEVANAGARI LETTER A".
5. The "label" column contains the character itself, such as ",", "E", or "."; the 784 additional columns hold the pixel values of the 28×28 image.




# Reading Dataset

In [None]:
# Load the TMNIST Glyphs CSV into a DataFrame and preview its first rows
csv_path = '/kaggle/input/tmnist-glyphs-1812-characters/Glyphs_TMNIST_updated.csv'
df = pd.read_csv(csv_path)
df.head()

# Exploratory Data Analysis

In [None]:
# Report the overall (rows, columns) shape and the number of samples (rows)
frame_shape = df.shape
print(f"The Shape of the Dataframe is: {frame_shape}")
print(f"Number of Samples: {len(df)}")

In [None]:
# Characters kept for classification: digits, ASCII letters and punctuation.
# Each group is spelled out as a string and expanded into single-character items.

digits = list('0123456789')
lowercase = list('abcdefghijklmnopqrstuvwxyz')
uppercase = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
punctuation = list('!"#$%&\'()*+,-./:;<=>?@[]\\^_`{}|~')
symbols = digits + lowercase + uppercase + punctuation

len(symbols)

In [None]:
# Keep only the samples whose label is present in the list "symbols"

keep_mask = df['label'].isin(symbols)
df = df[keep_mask]

In [None]:
# Summary statistics of the DataFrame, transposed so each described column is a row

df.describe().transpose()

In [None]:
# Print the DataFrame's column dtypes; verbose=True requests the full per-column listing

df.info(verbose=True)

In [None]:
# Count the distinct font names that remain in the DataFrame
n_fonts = len(df['font_name'].unique())
print(f"Number of unique fonts present in the Dataset: {n_fonts}")

In [None]:
# Count the distinct character labels that remain in the DataFrame
n_labels = len(df['label'].unique())
print(f"Number of unique character present in the Dataset: {n_labels}")

In [None]:
# Separate the pixel features (X) from the labels (y), then release the DataFrame

non_pixel_columns = ['font_name', 'glyph_name', 'label']
X = df.drop(columns=non_pixel_columns).to_numpy()
y = df[['label']].to_numpy()
del df

In [None]:
# Store the pixel values as unsigned 8-bit integers and confirm the resulting dtype
X = X.astype(np.uint8)
X.dtype

In [None]:
# Display the shapes of the feature array and the label array
X.shape, y.shape

In [None]:
# Show the first nine samples as 28x28 images in a 3x3 grid without axis ticks

import matplotlib.pyplot as plt

X_images = X.reshape(-1, 28, 28)
fig, axs = plt.subplots(3, 3, figsize=(9, 9))
# axs.flat visits the grid row by row, matching sample order 0..8
for idx, panel in enumerate(axs.flat):
    panel.set_xticks([])
    panel.set_yticks([])
    panel.imshow(X_images[idx])
plt.show()
del X_images

# Split the Dataframe into Training and Test Dataframe

We use `train_test_split` from scikit-learn to hold out 10% of the samples as a test set; the remaining 90% are used for training.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing. A fixed random_state makes the
# shuffle deterministic, so the split and the metrics computed from it are
# reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# Flatten the label arrays to 1-D
y_train = y_train.reshape((-1,))
y_test = y_test.reshape((-1,))

In [None]:
# Display the feature-array shapes of the training and test partitions
X_train.shape, X_test.shape

In [None]:
# Display the label-array shapes of the training and test partitions
y_train.shape, y_test.shape

# Binarize Labels

`LabelBinarizer` is used to encode the categorical labels. It converts the categorical target column into multiple binary columns, one per category, holding a 1 for the sample's category and a 0 for every other category.

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Fit the encoder on the training labels, then reuse the same fitted encoder
# for the test labels so both share one column layout.
lb = LabelBinarizer()
y_train_label = lb.fit_transform(y_train)
y_test_label = lb.transform(y_test)
# Report the shapes of the binarized labels produced above (the raw label
# vectors y_train / y_test are unchanged by this cell).
print('Train labels dimension:')
print(y_train_label.shape)
print('Test labels dimension:')
print(y_test_label.shape)

# Normalize the Training and Testing Dataset

We scale the input features so that they share a similar range and distribution, which can help to improve the performance and stability of the model.

In [None]:
# Scale the pixel values by 1/255 before feeding them to the neural networks

X_train = X_train / 255
X_test = X_test / 255

# Dense Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Dense classifier: three ReLU hidden layers followed by a softmax output
# with one unit per binarized label column.

n_features = X_train.shape[1]
n_classes = y_train_label.shape[1]

model = Sequential([
    Dense(250, input_shape=(n_features,), activation='relu'),
    Dense(125, activation='relu'),
    Dense(100, activation='relu'),
    Dense(n_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Model Summary: print an overview of the dense network's architecture

model.summary()

In [None]:
# Train the dense network for 50 epochs in batches of 150, holding out 10%
# of the training data for validation
model.fit(
    X_train,
    y_train_label,
    validation_split=0.1,
    batch_size=150,
    epochs=50,
    verbose=1,
)

In [None]:
# Evaluate the trained dense model on the held-out test set
test_results = model.evaluate(X_test, y_test_label, verbose=1)
# test_results[1] is the accuracy metric as a fraction in [0, 1]; the ".2%"
# format multiplies by 100 so the printed value really is a percentage.
print(f'Test results - Accuracy: {test_results[1]:.2%}')

# Convolutional Neural Network

A typical CNN architecture consists of three types of layers: convolutional layers, pooling layers, and fully connected layers. The convolutional layers perform feature extraction by applying a set of filters to the input image, producing a set of feature maps. The pooling layers then downsample the feature maps, reducing the spatial dimensions of the data. Finally, the fully connected layers perform classification by taking the output of the previous layers and mapping it to a set of output classes.

In [None]:
# Give every sample an explicit 28x28 single-channel image shape for the CNN
# and store the values as float32

cnn_shape = (-1, 28, 28, 1)
X_train = X_train.reshape(cnn_shape).astype('float32')
X_test = X_test.reshape(cnn_shape).astype('float32')

In [None]:
# Display the training array's shape after the reshape to (-1, 28, 28, 1)
X_train.shape

In [None]:
# CNN classifier: two convolution + max-pooling stages, dropout, then a dense
# head ending in a softmax with one unit per binarized label column.

n_classes = y_train_label.shape[1]

cnnmodel = Sequential([
    Conv2D(32, (4, 4), input_shape=(28, 28, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(n_classes, activation='softmax'),
])
cnnmodel.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# CNN Model Summary: print an overview of the convolutional network's architecture

cnnmodel.summary()

In [None]:
# Train the CNN for 50 epochs in batches of 100, holding out 10% of the
# training data for validation; keep the returned object for plotting later.

result = cnnmodel.fit(
    X_train,
    y_train_label,
    epochs=50,
    batch_size=100,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Evaluate the trained CNN on the held-out test set
test_results = cnnmodel.evaluate(X_test, y_test_label, verbose=1)
# Format the accuracy fraction with ".2%" instead of round(...)*100, which can
# print floating-point artefacts such as 94.81000000000002.
print(f'Test results - Accuracy:{test_results[1]:.2%}')

# Visualizing the model performance

In [None]:
# Function for Plotting
def Plott(data):
    """Plot the accuracy and loss curves of a training run side by side.

    Parameters
    ----------
    data : object with a ``history`` mapping
        ``data.history`` must provide the keys 'accuracy', 'val_accuracy',
        'loss' and 'val_loss', each holding one value per epoch.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(20, 7))
    panels = (
        (ax[0], 'accuracy', 'model accuracy'),
        (ax[1], 'loss', 'model loss'),
    )
    for axis, metric, title in panels:
        axis.plot(data.history[metric], label='train')
        # The val_* series are validation metrics, so label them as such
        # rather than "test".
        axis.plot(data.history[f'val_{metric}'], label='validation')
        axis.set_title(title)
        axis.set_xlabel('epoch')
        axis.set_ylabel(metric)
        axis.legend(loc='upper left')
    plt.show()


In [None]:
# Plot the accuracy and loss curves recorded while fitting the CNN model
Plott(result)

# Conclusion

The CNN model reached an accuracy of 94.81% with 50 epochs and a batch size of 100. The accuracy may be improved further by varying the number of epochs and the batch size and comparing the results.

# References

1. https://www.kaggle.com/code/nikhilkuttan/tmnist-glyphs-cnn
2. https://www.kaggle.com/code/sheshngupta/tminst-character-recognition-94-4-accuracy
3. https://www.kaggle.com/code/rushabhfegade/tmnist-glyphs/notebook#Split-the-Dataframe-into-Training-and-Test-Dataframe

# MIT License

Copyright (c) 2024 Anushka Rajesh Darade

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

