In [9]:
import os
from collections import Counter

import cv2
import easyocr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw

In [10]:
### Load the image metadata, indexed by the CSV's first column, then attach the path of
### each image plus three empty object-dtype columns that later hold the OCR output.
df = pd.read_csv('./Data/dataset/european_images.csv', index_col=0)
df = df.assign(
    img_path=[f"./Data/dataset/{index}.png" for index in df.index],
    text=pd.Series(dtype='object'),
    confidence=pd.Series(dtype='object'),
    bbox=pd.Series(dtype='object'),
)

Need to determine the best OCR model for this.

Candidates:
    easyocr (built in, uses pytorch)

# OCR Text Recognition
1. It's important to note that we're looking at a plethora of European languages, which use various modifications of the Latin script, the Cyrillic script, etc.
2. As such, multiple OCR readers can be employed. We can run each reader on an image, extract the confidence and various pieces of text above some threshold, and then input the collected data to our classifier.
3. We could also use a reader with multiple languages in the list. This will remove the ability to handle different languages differently, but that may be better to avoid data leakage?
4. It may be necessary to prune the 'Google' labels from the images.


## OCR Text Recognition: Language definition

https://www.jaided.ai/easyocr/

In [11]:
class ocr_reader():
    """Callable wrapper around a single-language ``easyocr.Reader``.

    Calling an instance with a DataFrame that has an ``img_path`` column runs
    OCR on every image and returns a copy of the frame in which each row's
    ``bbox``, ``text`` and ``confidence`` cells hold the lists of values
    detected in that row's image.
    """

    # Columns filled in by __call__; every cell stores a Python list.
    _OUTPUT_COLUMNS = ("bbox", "text", "confidence")

    def __init__(self, lang):
        """Create the underlying easyocr reader for the language code ``lang``."""
        self.lang = lang
        self.reader = easyocr.Reader([lang])

    def __call__(self, df):
        """Run OCR over every row of ``df`` and return the annotated copy.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain an ``img_path`` column. The frame itself is not
            modified; the work is done on a copy.

        Returns
        -------
        pandas.DataFrame
            The copy (also kept on ``self.df_results``) with the ``bbox``,
            ``text`` and ``confidence`` columns filled with one list per row.
        """
        self.df_results = df.copy()

        # List-valued cells can only be stored in object-dtype columns, so
        # create the output columns if missing and coerce them otherwise.
        for column in self._OUTPUT_COLUMNS:
            if column in self.df_results.columns:
                self.df_results[column] = self.df_results[column].astype(object)
            else:
                self.df_results[column] = pd.Series(index=self.df_results.index, dtype=object)

        for index, img_path in self.df_results["img_path"].items():
            # Each detection is indexed as (bbox, text, confidence).
            results = self.reader.readtext(img_path)

            self.df_results.at[index, "bbox"] = [result[0] for result in results]
            self.df_results.at[index, "text"] = [result[1] for result in results]
            self.df_results.at[index, "confidence"] = [result[2] for result in results]

        return self.df_results

In [12]:
### Maps each language code handed to ocr_reader (key) to the short alias (value)
### under which that language's reader is stored in `Readers`.
languages = {
    "be": "be", "bg": "bg", "cs": "cs", "cy": "cy", "da": "da",
    "de": "de", "en": "en", "es": "es", "et": "et", "fr": "fr",
    "ga": "ga", "hr": "hr", "hu": "hu", "is": "is", "it": "it",
    "la": "la", "lt": "lt", "lv": "lv", "mt": "mt", "nl": "nl",
    "no": "no", "pl": "pl", "ro": "ro", "ru": "ru",
    "rs_latin": "rs", "rs_cyrillic": "rc",
    "sk": "sk", "sl": "sl", "sq": "sq", "sv": "sv", "uk": "uk",
}

### One ocr_reader per language, built from the language code and stored under its alias.
Readers = {}
for language_code, alias in languages.items():
    Readers[alias] = ocr_reader(language_code)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

Either only store text above a certain threshold for all, or only store text above a certain threshold for each language.
On CPU, roughly 3s to initialize the OCR reader.

### Strip Google watermarks from detected text

Generating text for each image for each reader, then using manual pruning to determine which labels to drop for each language.
These can then be associated with the correct language.

I use the English reader to identify text matching the watermark, take its bounding box, and mask that region of the image.

In [13]:
def OCR_extraction(path, reader):
    """Run ``reader.readtext`` on the image at ``path`` and return its result unchanged."""
    return reader.readtext(path)
    

In [16]:
reader_en = Readers["en"].reader

watermarks = ["google", "gccgle", "gcogle", "gocgle"]
### Upper-cased once up front so each detection is compared case-insensitively
### without rebuilding the list for every piece of text.
watermarks_upper = {watermark.upper() for watermark in watermarks}
### Detections with this many characters or fewer are treated as meaningless and masked too.
min_length = 3

### Masked copies are stored here so the masking only has to be done once.
### cv2.imwrite does not create missing directories, so make sure it exists.
masked_dir = "./Data/masked_for_ocr"
os.makedirs(masked_dir, exist_ok=True)

### For each image, generate a mask (initially all white = keep everything) and black out
### every region the English reader flags, so the other readers never see that text.
for index, row in df.iterrows():

    image = cv2.imread(row['img_path'])
    if image is None:
        ### cv2.imread returns None instead of raising when the file is missing or unreadable.
        print(f"Could not read image for img index {index}: {row['img_path']}")
        continue

    mask = np.full(image.shape[:2], 255, dtype=np.uint8)
    results = OCR_extraction(row['img_path'], reader_en)

    for result in results:
        ### Each detection is indexed as (bbox, text, confidence).
        corners, t, score = result[0], result[1], result[2]

        ### Filtering text matching a watermark, or text of min_length characters or fewer (meaningless)
        if t.upper() in watermarks_upper or len(t) <= min_length:
            ### Plain Python ints (truncated, as before) are the safest input for cv2.rectangle.
            x0, y0 = (int(value) for value in corners[0])  ### Top left corner
            x1, y1 = (int(value) for value in corners[2])  ### Bottom right corner
            cv2.rectangle(mask, (x0, y0), (x1, y1), 0, -1)
        else:
            print(f"Text {t.upper()} passed the filter for img index {index} and confidence {score}")

    ### We now have a masked image instead of the original image. Other lang readers can now use this image.
    masked_image = cv2.bitwise_and(image, image, mask=mask)

    ### cv2.imwrite reports failure through its return value rather than an exception.
    masked_path = f"{masked_dir}/{index}.png"
    if not cv2.imwrite(masked_path, masked_image):
        print(f"Could not write masked image for img index {index}: {masked_path}")

Text SENUONIS passed the filter for img index 156 and confidence 0.00015776343750967432


In [None]:
### Same metadata as `df`, but img_path points at the masked copies written to masked_for_ocr/.
masked_df = pd.read_csv('./Data/dataset/european_images.csv', index_col=0)
### Build the paths from masked_df's own index so this cell does not depend on `df`
### still being present in the kernel.
masked_df['img_path'] = [f"./Data/masked_for_ocr/{index}.png" for index in masked_df.index]
### Empty object-dtype columns that each reader fills with per-image lists.
masked_df['text'] =  pd.Series(dtype='object')
masked_df['confidence'] =  pd.Series(dtype='object')
masked_df['bbox'] =  pd.Series(dtype='object')

##### Now, with the text extracted from the masked images, we can set confidence thresholds, and only keep results above specific thresholds.

In [None]:
### Run every language-specific reader over the masked images and keep one result
### frame per language alias. This is a really inefficient way to handle this:
### each reader makes its own full OCR pass over every image.
Full_Results = {}
for lang, reader in Readers.items():
    Full_Results[lang] = reader(masked_df)
    ### TODO: filter Full_Results[lang] by confidence threshold here. The previous
    ### row-by-row loop only executed `continue`, so it had no effect and was removed.

#### Pass extracted text through LLM to predict country or language

        
        