# **1. Installing required Packages**

In [4]:
!pip install gdown



# **2. Importing required packages**

In [5]:
import gdown
import zipfile
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
import shutil
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import cv2
import numpy as np
from PIL import Image

# **3. Downloading all the necessary Datasets**

In [6]:
# --- Google Drive sources and local destinations -------------------------

# Zip archive with the alphabet images used to train the OCR model
file_id = '12s2_c2hAiWTScm9jgQC9eyxTVcITiX2-'
output = 'alphabets_dataset.zip'

# CSV used to train the sentiment classifier
file_id_sentiment = '1JGHXIqwYlJmNs-ZbxVrp4ONCSceUfCCs'
output_sentiment = 'sentiment_analysis_dataset.csv'

# CSV with the expected sentiment of every target image
file_id_target = '1aRK2wcR207iuaPeYIECWd1YtHGv78Pek'
output_target = 'target_labels.csv'

# Drive folder holding the target images (path is relative to the working directory)
target_images_folder_url = 'https://drive.google.com/drive/folders/1nE29IzfQoesIQi56nBc29JxdX-YUlJO5?usp=drive_link'
target_images_folder_path = 'content/target_images'

# --- Downloads -----------------------------------------------------------

gdown.download(f'https://drive.google.com/uc?id={file_id}', output=output, quiet=False)
gdown.download(f'https://drive.google.com/uc?id={file_id_sentiment}', output=output_sentiment, quiet=False)
gdown.download(f'https://drive.google.com/uc?id={file_id_target}', output=output_target, quiet=False)

# Kept as the last expression so the cell displays the value returned by download_folder.
gdown.download_folder(target_images_folder_url, output=target_images_folder_path, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=12s2_c2hAiWTScm9jgQC9eyxTVcITiX2-
From (redirected): https://drive.google.com/uc?id=12s2_c2hAiWTScm9jgQC9eyxTVcITiX2-&confirm=t&uuid=457bbb42-bac6-41fb-9a34-178a411b7a6f
To: /content/alphabets_dataset.zip
100%|██████████| 944M/944M [00:24<00:00, 38.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JGHXIqwYlJmNs-ZbxVrp4ONCSceUfCCs
To: /content/sentiment_analysis_dataset.csv
100%|██████████| 4.00k/4.00k [00:00<00:00, 10.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1aRK2wcR207iuaPeYIECWd1YtHGv78Pek
To: /content/target_labels.csv
100%|██████████| 131/131 [00:00<00:00, 122kB/s]
Retrieving folder contents


Processing file 1gxFptf-CKg43GoWckb5dlOZpKGo8hjxK line_1.png
Processing file 1bbBrUK5MlR1e02M5Y2tmb-RIkr-AXVOn line_2.png
Processing file 1mwrjJ1NtIfaLj4mGjozeUFAzu126O8Qp line_3.png
Processing file 16WvvlGPojnFG5FTEgRNO7B434-M93wH9 line_4.png
Processing file 1T6jyUxdnTpHVUcpxDTtWkWnRoxctXkMI line_5.png
Processing file 1mve18oSThdpxMH1XHcOU7sa-hR2lyWsP line_6.png


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1gxFptf-CKg43GoWckb5dlOZpKGo8hjxK
To: /content/content/target_images/line_1.png
100%|██████████| 23.9k/23.9k [00:00<00:00, 18.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bbBrUK5MlR1e02M5Y2tmb-RIkr-AXVOn
To: /content/content/target_images/line_2.png
100%|██████████| 23.9k/23.9k [00:00<00:00, 33.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mwrjJ1NtIfaLj4mGjozeUFAzu126O8Qp
To: /content/content/target_images/line_3.png
100%|██████████| 21.6k/21.6k [00:00<00:00, 35.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=16WvvlGPojnFG5FTEgRNO7B434-M93wH9
To: /content/content/target_images/line_4.png
100%|██████████| 23.2k/23.2k [00:00<00:00, 34.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1T6jyUxdnTpHVUcpxDTtWkWnRoxctXkMI
To: /content/content/target_images/line_5.png
100%|██████████| 23.6k/

['content/target_images/line_1.png',
 'content/target_images/line_2.png',
 'content/target_images/line_3.png',
 'content/target_images/line_4.png',
 'content/target_images/line_5.png',
 'content/target_images/line_6.png']

# **4. Unzipping and Creating Dataframes**

In [7]:
# Unzip alphabets_dataset.zip (its path is held in `output`) into a working directory.
extract_dir = '/content/extracted_files'
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Identify the extracted label CSV and the image directory.
# Only real directories qualify as the image folder, so a stray non-CSV file
# in the archive cannot be picked by mistake.
extracted_files = os.listdir(extract_dir)
csv_candidates = [f for f in extracted_files if f.endswith('.csv')]
dir_candidates = [
    f for f in extracted_files
    if not f.endswith('.csv') and os.path.isdir(os.path.join(extract_dir, f))
]
if not csv_candidates:
    raise FileNotFoundError(f'No CSV file found in {extract_dir}')
if not dir_candidates:
    raise FileNotFoundError(f'No image directory found in {extract_dir}')
csv_file = csv_candidates[0]
image_dir = dir_candidates[0]

csv_file_path = os.path.join(extract_dir, csv_file)
image_folder_path = os.path.join(extract_dir, image_dir)

# Load and display the CSV file containing image labels
df = pd.read_csv(csv_file_path)
print(df.head())

# Move every image into a subdirectory named after its label, which is the
# layout flow_from_directory reads classes from.
image_labels = df['label'].astype(str)

# Create each label directory once instead of once per image row.
for label in image_labels.unique():
    os.makedirs(os.path.join(image_folder_path, label), exist_ok=True)

# Iterate the two columns directly; iterrows() builds a Series per row and is
# needlessly slow for a table of this size.
for file_name, label in zip(df['file'], image_labels):
    src = os.path.join(image_folder_path, file_name)
    shutil.move(src, os.path.join(image_folder_path, label, file_name))

# Data generators: pixel values are rescaled to [0, 1] and 20% of the images
# are held out as the validation subset.
image_size = (28, 28)
batch_size = 32

datagen = ImageDataGenerator(validation_split=0.2, rescale=1.0/255.0)

# Arguments shared by the training and validation flows; only `subset` differs.
common_flow_args = dict(
    target_size=image_size,
    color_mode='grayscale',
    batch_size=batch_size,
    class_mode='sparse',
)

train_generator = datagen.flow_from_directory(image_folder_path, subset='training', **common_flow_args)

validation_generator = datagen.flow_from_directory(image_folder_path, subset='validation', **common_flow_args)

# Load the sentiment analysis training data.
df_sentiment = pd.read_csv('sentiment_analysis_dataset.csv')
# A bare expression in the middle of a cell is discarded, so the preview has
# to be rendered explicitly.
display(df_sentiment.head())

# Load the expected sentiment label of every target image.
target_df = pd.read_csv('target_labels.csv')
target_df.head()


          file label
0  image_1.png     A
1  image_2.png     A
2  image_3.png     A
3  image_4.png     A
4  image_5.png     A
Found 297971 images belonging to 26 classes.
Found 74480 images belonging to 26 classes.


Unnamed: 0,file,sentiment
0,line_1.png,Angry
1,line_2.png,Angry
2,line_3.png,Happy
3,line_4.png,Happy
4,line_5.png,Neutral


# **5. Defining, compiling, and training the CNN model**

In [8]:
# Two convolution/pooling stages followed by a dense classifier head with one
# softmax output per class found by the training generator.
cnn_layers = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(train_generator.num_classes, activation='softmax'),
]
model = Sequential(cnn_layers)

# Sparse cross-entropy matches the integer labels produced by class_mode='sparse'.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 10 epochs, validating on the held-out subset after each epoch.
history = model.fit(train_generator, epochs=10, validation_data=validation_generator)

# Persist the trained OCR model to disk.
model.save('ocr_model.h5')



  saving_api.save_model(



# **6. Filter sentiment analysis dataset and encode labels**

In [9]:
# Keep only rows with a known sentiment and encode the labels as integers.
valid_labels = ['Happy', 'Angry', 'Neutral']
label_mapping = {'Happy': 0, 'Angry': 1, 'Neutral': 2}

# .copy() makes the filtered frame independent of the original, so the column
# assignment below is not a chained assignment on a slice.
df_sentiment = df_sentiment.loc[df_sentiment['sentiment'].isin(valid_labels)].copy()
df_sentiment['sentiment'] = df_sentiment['sentiment'].map(label_mapping).astype(int)
# NOTE: after this cell the column holds integers, so running it again without
# re-loading df_sentiment filters out every row.

# **7. Split data and vectorize text for Naive Bayes sentiment analysis**

In [10]:
# Features are the raw text lines; targets are the encoded sentiment labels.
sentiment_texts = df_sentiment['line']
sentiment_codes = df_sentiment['sentiment']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    sentiment_texts,
    sentiment_codes,
    random_state=42,
    test_size=0.2,
)

# Learn the vocabulary on the training split only, then reuse it for the test split.
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# **8. Training and Evaluating Naive Bayes**

In [11]:
# Train a multinomial Naive Bayes classifier on the bag-of-words features.
nb_model = MultinomialNB()
nb_model.fit(X_train_vect, y_train)

# Evaluate on the held-out split.
y_pred = nb_model.predict(X_test_vect)
accuracy = accuracy_score(y_test, y_pred)
print(f'Naive Bayes Sentiment Analysis Accuracy: {accuracy * 100:.2f}%')
# Pass the label codes explicitly and take the names from label_mapping, so the
# report does not fail when a class is missing from the small test split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(label_mapping.values()),
    target_names=list(label_mapping.keys()),
))

# Persist the fitted vectorizer and classifier.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')

Naive Bayes Sentiment Analysis Accuracy: 66.67%
              precision    recall  f1-score   support

       Happy       0.50      0.50      0.50         2
       Angry       0.50      0.50      0.50         2
     Neutral       1.00      1.00      1.00         2

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6



['naive_bayes_model.pkl']

# **9. Loading all the necessary models**

In [12]:
# Reload the artifacts saved by the training cells: the Keras OCR model first,
# then the joblib-pickled vectorizer and Naive Bayes classifier.
ocr_model = load_model('ocr_model.h5')
vectorizer, nb_model = (
    joblib.load(artifact_path)
    for artifact_path in ('vectorizer.pkl', 'naive_bayes_model.pkl')
)

# **10. Defining the character mapping from numeric labels to characters**

In [13]:
# Indices 0-25 map to 'A'-'Z' in order; index 26 maps to a space.
label_to_char = {index: chr(ord('A') + index) for index in range(26)}
label_to_char[26] = ' '

# **11. Function to preprocess and predict text from images**

In [14]:
def predict_text(image_path):
    """Read a grayscale image, cut it into 28x28 tiles and OCR every tile.

    Tiles are visited row by row, left to right. A tile whose pixels are all
    zero becomes a space. Every other tile is classified by the global
    ``ocr_model`` and translated through ``label_to_char`` ('?' when the
    predicted index has no entry). All non-blank tiles are classified in one
    batched ``predict`` call instead of one call per tile.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The characters of all tiles concatenated into one string (rows are
        joined without a separator), or None if any step fails; in that case
        the error is printed.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread returns None instead of raising when the file cannot be read.
            raise FileNotFoundError('image could not be read')
        height, width = img.shape
        block_size = 28

        chars = []          # one entry per tile, in reading order
        pending_tiles = []  # non-blank tiles awaiting classification
        pending_slots = []  # index in `chars` of each pending tile

        for y in range(0, height, block_size):
            for x in range(0, width, block_size):
                block = img[y:y+block_size, x:x+block_size]
                if np.all(block == 0):
                    chars.append(' ')
                    continue
                # Tiles on the right/bottom edge may be smaller; stretch them to full size.
                if block.shape[0] != block_size or block.shape[1] != block_size:
                    block = cv2.resize(block, (block_size, block_size))
                pending_slots.append(len(chars))
                chars.append('?')
                pending_tiles.append(np.array(block))

        if pending_tiles:
            # Same scaling as training: pixel values divided by 255.
            batch = np.stack(pending_tiles).reshape(-1, block_size, block_size, 1) / 255.0
            predictions = ocr_model.predict(batch, verbose=0)
            predicted_labels = np.argmax(predictions, axis=1)
            for slot, predicted_label in zip(pending_slots, predicted_labels):
                chars[slot] = label_to_char.get(int(predicted_label), '?')

        full_text = ''.join(chars)
        print(f'Predicted text from {image_path}: {full_text}')
        return full_text
    except Exception as e:
        # Best effort: report the failure and let the caller skip this image.
        print(f'Error in predicting text from {image_path}: {e}')
        return None


# **12. Function to evaluate OCR and sentiment analysis on the target images**

In [15]:
def evaluate_performance():
    """Run OCR and sentiment prediction on every target image and print the accuracy.

    For each row of the global ``target_df`` the image named in ``row['file']``
    (inside ``target_images_folder_path``) is passed to ``predict_text``. The
    recognised text is vectorized with ``vectorizer`` and classified with
    ``nb_model``. A prediction counts as correct when it equals the
    ``label_mapping`` code of ``row['sentiment']``. Images for which
    ``predict_text`` returns None count as incorrect.

    Prints the accuracy as a percentage and returns nothing. When
    ``target_df`` has no rows a notice is printed instead, because the
    accuracy would be a division by zero.
    """
    total_count = len(target_df)
    if total_count == 0:
        print('No target images to evaluate.')
        return

    correct_count = 0
    for _, row in target_df.iterrows():
        image_path = os.path.join(target_images_folder_path, row['file'])
        predicted_text = predict_text(image_path)

        if predicted_text is not None:
            sentiment_vector = vectorizer.transform([predicted_text])
            sentiment_prediction = nb_model.predict(sentiment_vector)[0]

            # Unknown sentiment names map to -1, which never matches a prediction code.
            if sentiment_prediction == label_mapping.get(row['sentiment'], -1):
                correct_count += 1

    accuracy = correct_count / total_count
    print(f'Performance on Target Images: {accuracy * 100:.2f}%')

# **13. Evaluating the performance of the OCR and sentiment analysis**

In [16]:
# Run the end-to-end evaluation on the target images; the result is printed, nothing is returned.
evaluate_performance()

Predicted text from content/target_images/line_1.png: I AM REALLY ANNOYED BY YOUR CONSTANT COMPLAINING AND YOU NEVER OFEER ANY SOLUTIONS WHICH IS VERY UNHELPFUL AND NEGATIVE 
Predicted text from content/target_images/line_2.png: IT IS FRUSTRATING THAT YOU NEVER PAY ATTENTION DURING DISCUSSIONS AND YOUR LACK OF FOCUS IS REALLY AFFECTING OUR PROGRESS         
Predicted text from content/target_images/line_3.png: I AM BELIGHTED BY BOUR ERIENDLINESS AND YOU ALWAYS MAKE EVERYONE EEEL WELCOME WHICH FOSTERS A SENSE OF COMMUNITY        
Predicted text from content/target_images/line_4.png: IT IS WONDERFUL THAT YOU ALWAYS SHOW KINDNESS ANO YOUR EMPATHY TOWARDS OTHERS IS TRULY HEARTWARMINJ AND APPRECIATEO     
Predicted text from content/target_images/line_5.png: YOUR ANALYSIS OF THE DATA WAS ACCURATE AND WELL PRESENTED PROVIDING A CLEAR UNOERSTANDING OF THE TREWDS AND PATTERNS    
Predicted text from content/target_images/line_6.png: THE MEETING MINUTES YOW PREPARED WERE DETAILED AND WELL ORGAN

# **Libraries and Methods Documentation Links**

**Libraries :**

1. Google Drive API Documentation: https://developers.google.com/drive/api/v3/about-sdk

2. gdown GitHub Repository: https://github.com/wkentaro/gdown

3. Python zipfile Module Documentation: https://docs.python.org/3/library/zipfile.html

4. Python os Module Documentation: https://docs.python.org/3/library/os.html

5. Pandas Documentation: https://pandas.pydata.org/docs/

6. TensorFlow ImageDataGenerator Documentation: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator

7. TensorFlow Sequential Model Documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Sequential

8. TensorFlow Keras Layers Documentation: https://www.tensorflow.org/api_docs/python/tf/keras/layers

9. Python shutil Module Documentation: https://docs.python.org/3/library/shutil.html

10. Scikit-learn CountVectorizer Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

11. Scikit-learn MultinomialNB Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

12. Scikit-learn train_test_split Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

13. Scikit-learn accuracy_score Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

14. Scikit-learn classification_report Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

15. Joblib Documentation: https://joblib.readthedocs.io/en/latest/

16. OpenCV Documentation: https://docs.opencv.org/4.x/

17. NumPy Documentation: https://numpy.org/doc/stable/

18. Pillow Documentation: https://pillow.readthedocs.io/en/stable/

**Methods :**

1. gdown.download: https://github.com/wkentaro/gdown#usage

2. gdown.download_folder: https://github.com/wkentaro/gdown#usage

3. os.makedirs: https://docs.python.org/3/library/os.html#os.makedirs

4. os.listdir: https://docs.python.org/3/library/os.html#os.listdir

5. os.path.join: https://docs.python.org/3/library/os.path.html#os.path.join

6. os.path.exists: https://docs.python.org/3/library/os.path.html#os.path.exists

7. zipfile.ZipFile.extractall: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.extractall

8. pandas.read_csv: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

9. pandas.DataFrame.head: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html

10. tensorflow.keras.preprocessing.image.ImageDataGenerator.flow_from_directory: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#flow_from_directory

11. tensorflow.keras.Sequential.fit: https://www.tensorflow.org/api_docs/python/tf/keras/Sequential#fit

12. tensorflow.keras.models.load_model: https://www.tensorflow.org/api_docs/python/tf/keras/models/load_model

13. sklearn.feature_extraction.text.CountVectorizer.fit_transform: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform

14. sklearn.feature_extraction.text.CountVectorizer.transform: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.transform

15. sklearn.naive_bayes.MultinomialNB.fit: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.fit

16. sklearn.naive_bayes.MultinomialNB.predict: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.predict

17. sklearn.metrics.accuracy_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

18. sklearn.metrics.classification_report: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

19. joblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html

20. joblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html

21. PIL.Image.open: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.open

22. numpy.array: https://numpy.org/doc/stable/reference/generated/numpy.array.html

23. numpy.all: https://numpy.org/doc/stable/reference/generated/numpy.all.html

24. numpy.argmax: https://numpy.org/doc/stable/reference/generated/numpy.argmax.html

25. cv2.imread: https://docs.opencv.org/4.x/d4/da8/group__imgcodecs.html#ga3a96fbf3214e6b0d9a5c6eb9e1d4cd53

26. cv2.resize: https://docs.opencv.org/4.x/da/d54/group__imgproc__transform.html#ga2f8e0a188c27d3513b0df3c2662385d4









**References :**
1. https://learner-cares.medium.com/handwritten-digit-recognition-using-convolutional-neural-network-cnn-with-tensorflow-2f444e6c4c31
2. https://www.shiksha.com/online-courses/articles/handwritten-digit-recognition-with-98-accuracy/
3. https://www.analyticsvidhya.com/blog/2022/03/building-naive-bayes-classifier-from-scratch-to-perform-sentiment-analysis/#:~:text=In%20sentiment%20analysis%2C%20Naive%20Bayes,class%20with%20the%20highest%20probability.
4. https://youtu.be/WQeoO7MI0Bs
5. https://youtu.be/wQ8BIBpya2k
6. https://youtu.be/EGKeC2S44Rs
7. https://youtu.be/6zm9NC9uRkk
8. Stack Overflow and other websites, used for debugging and understanding