<a href="https://colab.research.google.com/github/DarShabi/OCR-Company-Project/blob/main/Week%232/Easy_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q ImageHash

In [2]:
!apt-get install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [3]:
! pip install fasttext



In [4]:
from PIL import Image
from PIL import ImageEnhance, ImageOps
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
import json
from collections import Counter, defaultdict
import re
import matplotlib.image as mpimg
import seaborn as sns
import cv2
import imagehash
import pytesseract
import fasttext
from sklearn.model_selection import train_test_split
import cv2
import re
import urllib.request
import time

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
data_path = '/content/drive/MyDrive/OCR_Project/data.csv'

df = pd.read_csv(data_path)

In [7]:
NA_df = pd.read_csv('/content/drive/MyDrive/OCR_Project/N_A_images.csv')

# PreProcess - changing NaN to string 'unknown'

In [8]:
nan_count_before = df['text'].isnull().sum()
print(f"Number of NaN values before replacement: {nan_count_before}")

Number of NaN values before replacement: 1031


In [9]:
# Replace 'NaN' with 'unknown'
df['text'].fillna('unknown', inplace=True)

In [10]:
# Check the count of 'NaN' after replacement
nan_count_after = df['text'].isnull().sum()
print(f"Number of NaN values after replacement: {nan_count_after}")

Number of NaN values after replacement: 0


In [11]:
# Initial split: 80% train, 20% temp
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Split the 20% temp into half: 10% validation, 10% test
val_df, test_df = train_test_split(temp_df, test_size=2/3, random_state=42)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Train size: 3407
Validation size: 487
Test size: 974


# Step 1: Solving N/A and one-pixle images

In [12]:
# 1. Calculate hashes for all images in na_df and store their respective texts
hashes_na = {}
for index, row in NA_df.iterrows():
    image_path = row['file_path']
    with Image.open(image_path) as img:
        h = imagehash.dhash(img)
        hashes_na[h] = row['text']



In [13]:
# 2 & 3. Calculate hashes for all images in val_df and assign matching predicted_text
for index, row in val_df.iterrows():
    image_path = row['file_path']
    with Image.open(image_path) as img:
        h = imagehash.dhash(img)
        if h in hashes_na:
            val_df.at[index, 'predicted_text'] = hashes_na[h]
        else:
            val_df.at[index, 'predicted_text'] = 'TBD'



In [14]:
correct_predictions = (val_df['predicted_text'] == val_df['text']).sum()
total_predictions = len(val_df)
accuracy = (correct_predictions / total_predictions) * 100
print(f"Accuracy after accounting for visually similar images: {accuracy:.4f}%")

Accuracy after accounting for visually similar images: 6.1602%


In [15]:
def find_one_color_images(df, column_name='file_path'):
    """
    Identifies images in the given DataFrame that consist of a single color.

    Parameters:
    df (DataFrame): A DataFrame containing file paths to the images.
    column_name (str, optional): The name of the column in the DataFrame containing the file paths. Defaults to 'file_path'.

    Returns:
    list: A list of file paths to the images that consist of a single color.
    """
    one_color_images = []

    for index, row in df.iterrows():
        image_path = row[column_name]
        image = Image.open(image_path).convert("RGB")
        pixels = np.array(image)
        first_pixel = pixels[0, 0]

        if (pixels == first_pixel).all():
            one_color_images.append(image_path)

    return one_color_images

In [16]:
def set_nan_for_one_color_images(df, one_color_images):
    """
    Set the prediction to NaN for one-color images.

    Parameters:
    df (DataFrame): The DataFrame containing the image paths and their predictions.
    one_color_images (list): The list of image paths that are one-color images.

    Returns:
    DataFrame: Updated DataFrame with predictions set to NaN for one-color images.
    """
    for img_path in one_color_images:
        df.loc[df['file_path'] == img_path, 'predicted_text'] = 'unknown'
    return df


In [17]:
# Identify one-color images in the val_df
one_color_images_val = find_one_color_images(val_df)

# Update the val_df to set predictions to NaN for the identified one-color images
val_df = set_nan_for_one_color_images(val_df, one_color_images_val)

In [18]:
correct_predictions = ((val_df['predicted_text'] == val_df['text'])).sum()

total_predictions = len(val_df)

accuracy = (correct_predictions / total_predictions) * 100

print(f"Accuracy after handling 'n_a_images' and one-color images: {accuracy:.4f}%")


Accuracy after handling 'n_a_images' and one-color images: 6.9815%


# Step 2: Solving no text images.

In [19]:
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
filename = "lid.176.bin"

urllib.request.urlretrieve(url, filename)
print(f"{filename} has been downloaded successfully!")


lid.176.bin has been downloaded successfully!


In [20]:
# Load a pretrained FastText model for language detection
lang_model = fasttext.load_model('lid.176.bin')



In [21]:
counter = 0

def enhanced_ocr(image_path):
    """
    Pre-process and recognize text from an image using Tesseract with enhanced configurations.

    Parameters:
    image_path (str): Path to the image file.

    Returns:
    str: Recognized text from the image.
    """
    global counter
    counter += 1
    start_time = time.time()

    # Pre-processing
    img = Image.open(image_path).convert("L")  # Convert to grayscale
    img = ImageOps.autocontrast(img)           # Improve contrast
    enhancer = ImageEnhance.Sharpness(img)
    img = enhancer.enhance(2)                  # Sharpen the image

    # Possible rotations
    rotations = [0, 90, 180, 270]
    best_text = ''
    best_angle = 0

    # Try OCR on different rotations and choose the one with the maximum length output
    for angle in rotations:
        rotated_img = img.rotate(angle)
        text = pytesseract.image_to_string(rotated_img, config='--psm 11')
        if len(text) > len(best_text):
            best_text = text
            best_angle = angle

    # Check if there's no or very short text
    if len(best_text) < 2:
        elapsed_time = time.time() - start_time
        print(f"Processed {counter} images. Current image took: {elapsed_time:.2f} seconds. Result: unknown")
        return 'unknown'

    elapsed_time = time.time() - start_time
    print(f"Processed {counter} images. Current image took: {elapsed_time:.2f} seconds. Result length: {len(best_text)}")
    return 'TBD'

In [22]:
val_df.loc[val_df['predicted_text'] == 'TBD', 'predicted_text'] = val_df[val_df['predicted_text'] == 'TBD']['file_path'].apply(enhanced_ocr)

Processed 1 images. Current image took: 0.53 seconds. Result: unknown
Processed 2 images. Current image took: 4.84 seconds. Result length: 490
Processed 3 images. Current image took: 6.46 seconds. Result length: 281
Processed 4 images. Current image took: 3.24 seconds. Result length: 233
Processed 5 images. Current image took: 17.18 seconds. Result length: 1466
Processed 6 images. Current image took: 2.74 seconds. Result length: 266
Processed 7 images. Current image took: 0.89 seconds. Result: unknown
Processed 8 images. Current image took: 3.46 seconds. Result length: 85
Processed 9 images. Current image took: 0.95 seconds. Result length: 23
Processed 10 images. Current image took: 3.46 seconds. Result length: 211
Processed 11 images. Current image took: 3.76 seconds. Result length: 295
Processed 12 images. Current image took: 1.09 seconds. Result length: 37
Processed 13 images. Current image took: 7.64 seconds. Result length: 435
Processed 14 images. Current image took: 2.79 seconds.



Processed 15 images. Current image took: 0.56 seconds. Result length: 5
Processed 16 images. Current image took: 14.03 seconds. Result length: 914




Processed 17 images. Current image took: 0.51 seconds. Result: unknown
Processed 18 images. Current image took: 6.91 seconds. Result length: 365
Processed 19 images. Current image took: 2.40 seconds. Result length: 93
Processed 20 images. Current image took: 15.25 seconds. Result length: 718
Processed 21 images. Current image took: 0.83 seconds. Result length: 51
Processed 22 images. Current image took: 6.26 seconds. Result length: 568
Processed 23 images. Current image took: 1.37 seconds. Result length: 36
Processed 24 images. Current image took: 18.15 seconds. Result length: 1374
Processed 25 images. Current image took: 0.57 seconds. Result length: 12
Processed 26 images. Current image took: 0.50 seconds. Result: unknown
Processed 27 images. Current image took: 11.49 seconds. Result length: 1123
Processed 28 images. Current image took: 7.44 seconds. Result length: 599
Processed 29 images. Current image took: 6.48 seconds. Result length: 326
Processed 30 images. Current image took: 2.



Processed 41 images. Current image took: 0.89 seconds. Result: unknown
Processed 42 images. Current image took: 2.00 seconds. Result length: 102
Processed 43 images. Current image took: 2.74 seconds. Result length: 201
Processed 44 images. Current image took: 1.64 seconds. Result length: 49
Processed 45 images. Current image took: 2.68 seconds. Result length: 111
Processed 46 images. Current image took: 10.63 seconds. Result length: 364
Processed 47 images. Current image took: 9.66 seconds. Result length: 645
Processed 48 images. Current image took: 2.22 seconds. Result length: 75
Processed 49 images. Current image took: 1.38 seconds. Result length: 91
Processed 50 images. Current image took: 10.53 seconds. Result length: 558
Processed 51 images. Current image took: 1.41 seconds. Result length: 47
Processed 52 images. Current image took: 2.26 seconds. Result length: 149
Processed 53 images. Current image took: 1.59 seconds. Result length: 131
Processed 54 images. Current image took: 2.



Processed 75 images. Current image took: 0.57 seconds. Result length: 5
Processed 76 images. Current image took: 1.79 seconds. Result length: 116
Processed 77 images. Current image took: 4.42 seconds. Result length: 455
Processed 78 images. Current image took: 0.74 seconds. Result length: 17
Processed 79 images. Current image took: 0.53 seconds. Result: unknown
Processed 80 images. Current image took: 4.20 seconds. Result length: 767
Processed 81 images. Current image took: 0.54 seconds. Result length: 4
Processed 82 images. Current image took: 0.89 seconds. Result length: 6
Processed 83 images. Current image took: 4.42 seconds. Result length: 516
Processed 84 images. Current image took: 3.07 seconds. Result length: 265
Processed 85 images. Current image took: 13.77 seconds. Result length: 1360
Processed 86 images. Current image took: 1.25 seconds. Result length: 95
Processed 87 images. Current image took: 1.07 seconds. Result length: 98
Processed 88 images. Current image took: 10.23 s



Processed 113 images. Current image took: 0.91 seconds. Result: unknown
Processed 114 images. Current image took: 2.20 seconds. Result length: 99
Processed 115 images. Current image took: 7.82 seconds. Result length: 1112




Processed 116 images. Current image took: 0.60 seconds. Result length: 11




Processed 117 images. Current image took: 0.90 seconds. Result length: 16
Processed 118 images. Current image took: 4.35 seconds. Result length: 170
Processed 119 images. Current image took: 5.87 seconds. Result length: 535
Processed 120 images. Current image took: 2.96 seconds. Result length: 156
Processed 121 images. Current image took: 1.03 seconds. Result length: 3
Processed 122 images. Current image took: 5.30 seconds. Result length: 537
Processed 123 images. Current image took: 0.72 seconds. Result length: 4
Processed 124 images. Current image took: 1.18 seconds. Result length: 23
Processed 125 images. Current image took: 1.35 seconds. Result length: 85
Processed 126 images. Current image took: 0.60 seconds. Result length: 9
Processed 127 images. Current image took: 1.89 seconds. Result length: 38




Processed 128 images. Current image took: 0.55 seconds. Result length: 3
Processed 129 images. Current image took: 2.31 seconds. Result length: 88
Processed 130 images. Current image took: 9.06 seconds. Result length: 458
Processed 131 images. Current image took: 0.67 seconds. Result length: 7
Processed 132 images. Current image took: 0.70 seconds. Result length: 23
Processed 133 images. Current image took: 1.93 seconds. Result length: 91
Processed 134 images. Current image took: 0.81 seconds. Result length: 58
Processed 135 images. Current image took: 0.78 seconds. Result length: 29
Processed 136 images. Current image took: 12.74 seconds. Result length: 1314
Processed 137 images. Current image took: 0.82 seconds. Result length: 19
Processed 138 images. Current image took: 5.12 seconds. Result length: 397
Processed 139 images. Current image took: 2.98 seconds. Result length: 174
Processed 140 images. Current image took: 2.87 seconds. Result length: 130
Processed 141 images. Current ima

KeyboardInterrupt: ignored

In [None]:
correct_predictions = ((val_df['predicted_text'] == val_df['text'])).sum()

total_predictions = len(val_df)

accuracy = (correct_predictions / total_predictions) * 100

print(f"Accuracy after handling 'n_a_images', one-color images, and no-text images: {accuracy:.4f}%")


In [None]:
# Rows where 'text' is 'unknown' but 'predicted_text' is not
unknown_text_diff_predicted_val = val_df[(val_df['text'] == 'unknown') & (val_df['predicted_text'] != 'unknown')]

# Rows where 'predicted_text' is 'unknown' but 'text' is not
unknown_predicted_diff_text_val = val_df[(val_df['predicted_text'] == 'unknown') & (val_df['text'] != 'unknown')]

# Calculate percentages
percentage_unknown_text_diff_predicted_val = (len(unknown_text_diff_predicted_val) / len(val_df)) * 100
percentage_unknown_predicted_diff_text_val = (len(unknown_predicted_diff_text_val) / len(val_df)) * 100

print(f"Percentage where 'text' in val_df is 'unknown' but 'predicted_text' is not: {percentage_unknown_text_diff_predicted_val:.2f}%")
print(f"Percentage where 'predicted_text' in val_df is 'unknown' but 'text' is not: {percentage_unknown_predicted_diff_text_val:.2f}%")


In [None]:
def display_images_from_df(dataframe, num_images=5):
    fig, axes = plt.subplots(1, num_images, figsize=(15, 5))
    for idx, (ax, row) in enumerate(zip(axes, dataframe.iterrows())):
        image_path = row[1]['file_path']
        image = Image.open(image_path)
        ax.imshow(image)
        ax.set_title(f"Text: {row[1]['text']}\nPredicted: {row[1]['predicted_text']}")
        ax.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
display_images_from_df(unknown_text_diff_predicted_val)
display_images_from_df(unknown_predicted_diff_text_val)