In [None]:
!pip install paddlepaddle-gpu==2.5.1
!pip install "paddleocr>=2.0.1"
!pip install vietocr

In [None]:
# Download and install the libssl1.1 system package (Ubuntu .deb, amd64).
# NOTE(review): presumably required by the paddlepaddle-gpu build installed
# above — confirm. The package is fetched over plain HTTP from a specific
# mirror and installed without a checksum check; consider verifying it.
!wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb
!sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb

In [None]:
# Run PaddlePaddle's built-in installation check before building the OCR
# pipeline, so a broken install surfaces here rather than mid-run.
import paddle
paddle.utils.run_check()

In [None]:
!pip install scipy==1.10.1

In [None]:
# Read the Weights & Biases API key from Kaggle's secret store (secret name
# "wandb_api_key") so the key is never hardcoded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api_key")

In [None]:
import wandb

# Authenticate with the key fetched from Kaggle secrets, then open a run in
# the "hcmai" project. `run` is used later to log the zipped OCR results.
wandb.login(key = secret_value_0)
run = wandb.init(project="hcmai")

In [None]:
import gc
import os
import shutil
from typing import Any

import cv2
import pandas as pd
import torch
from paddleocr import PaddleOCR
from PIL import Image
from tqdm import tqdm
from vietocr.tool.config import Cfg
from vietocr.tool.predictor import Predictor


class OCR:
    """Two-stage OCR pipeline: PaddleOCR locates text, VietOCR reads it.

    Every detected text box is cropped out of the source image, written to a
    scratch folder under ``saved_directory`` and recognised from that crop.
    Calling the instance on a folder of images writes one CSV with the
    recognised text of all images in that folder.
    """

    def __init__(self, saved_directory: str) -> None:
        """Build the text detector and the text recogniser.

        Args:
            saved_directory: Root folder for scratch crops and result CSVs.
        """
        self.text_detector = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)
        config = Cfg.load_config_from_name("vgg_seq2seq")
        config["cnn"]["pretrained"] = True
        config["device"] = "cuda:0"
        config["predictor"]["beamsearch"] = False
        self.text_recognitor = Predictor(config)
        self.df_directory = saved_directory

    def text_detection(self, image_path: str) -> Any:
        """Run PaddleOCR on one image.

        Args:
            image_path: Path of the image to analyse.

        Returns:
            The raw PaddleOCR result. ``result[0]`` holds the detections of
            the image, each one starting with its four corner points.
        """
        return self.text_detector.ocr(image_path)

    @staticmethod
    def _crop_bounds(box, height: int, width: int) -> tuple:
        """Return the (top, bottom, left, right) crop enclosing ``box``, clamped to the image."""
        xs = [int(point[0]) for point in box]
        ys = [int(point[1]) for point in box]
        # Use the extremes of all four corners so skewed boxes are not clipped,
        # and clamp because detections may extend slightly past the image edge.
        left, right = max(min(xs), 0), min(max(xs), width)
        top, bottom = max(min(ys), 0), min(max(ys), height)
        return top, bottom, left, right

    def save_text_boxes(self, image_path: str) -> list:
        """Crop every detected text box of an image and save the crops.

        Crops are written to
        ``<saved_directory>/<parent folder of the image>/<image name>/output_<i>.jpg``.

        Args:
            image_path: Path of the image to process.

        Returns:
            List of file paths of the saved crops. Empty when the image cannot
            be read or contains no detected text.
        """
        image = cv2.imread(image_path)
        if image is None:
            # Not a readable image (corrupt file, sub-folder, other file type).
            return []

        detector_result = self.text_detection(image_path)
        # PaddleOCR reports "no text found" as None or an empty list.
        detections = detector_result[0] if detector_result else None
        if not detections:
            return []

        parent_name = os.path.basename(os.path.dirname(image_path))
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        save_directory = os.path.join(self.df_directory, parent_name, image_name)
        os.makedirs(save_directory, exist_ok=True)

        height, width = image.shape[:2]
        file_path = []
        for detection in detections:
            top, bottom, left, right = self._crop_bounds(detection[0], height, width)
            if bottom <= top or right <= left:
                # Degenerate box: cv2.imwrite would fail on an empty region.
                continue
            crop_path = os.path.join(save_directory, f"output_{len(file_path)}.jpg")
            cv2.imwrite(crop_path, image[top:bottom, left:right])
            file_path.append(crop_path)

        return file_path

    def text_recognition(self, image_path: str) -> tuple:
        """Recognise the text of every detected box in an image.

        Args:
            image_path: Path of the image to process.

        Returns:
            Tuple ``(image_path_list, text_list)`` of equal length: the source
            image path repeated once per box, and the text read from each box.
        """
        image_path_list = []
        text_list = []
        for crop_path in self.save_text_boxes(image_path):
            # Context manager so the crop's file handle is released promptly.
            with Image.open(crop_path) as crop:
                text = self.text_recognitor.predict(crop)
            image_path_list.append(image_path)
            text_list.append(text)
        return image_path_list, text_list

    def __call__(self, directory: str, key_num: str):
        """Run OCR on every image of a folder and save the result as CSV.

        The CSV is written to ``<saved_directory>/<key_num>/<folder name>.csv``
        with columns ``n`` (image file name) and ``text`` (one row per
        detected text box). The scratch crops of the folder are deleted.

        Args:
            directory: Folder that contains the images.
            key_num: Name of the sub-folder of ``saved_directory`` that
                receives the CSV.

        Returns:
            None
        """
        # normpath drops a trailing slash, which would otherwise yield an
        # empty name and make the cleanup below target the output root.
        video_name = os.path.basename(os.path.normpath(directory))

        result_dict = {"n": [], "text": []}
        for file_name in sorted(os.listdir(directory)):
            image_path = os.path.join(directory, file_name)
            _, text_list = self.text_recognition(image_path)
            result_dict["n"].extend([file_name] * len(text_list))
            result_dict["text"].extend(text_list)

        # The scratch folder only exists if at least one crop was written.
        scratch_directory = os.path.join(self.df_directory, video_name)
        if video_name and os.path.isdir(scratch_directory):
            shutil.rmtree(scratch_directory)

        csv_directory = os.path.join(self.df_directory, key_num)
        os.makedirs(csv_directory, exist_ok=True)

        result_df = pd.DataFrame(result_dict)
        result_df.to_csv(os.path.join(csv_directory, f"{video_name}.csv"), index=False)

        return


In [None]:
# Build the OCR pipeline once and reuse it for every folder; scratch crops
# and result CSVs are written under /kaggle/working/.
ocr_toolkit = OCR(saved_directory="/kaggle/working/")

In [None]:
# Run OCR over every video folder of every keyframe group, then zip the
# group's CSVs and upload the archive to Weights & Biases.
direc = "/kaggle/input/hcmai-keyframe/keyframe/"
output = "/kaggle/working/"
for key_num in tqdm(sorted(os.listdir(direc))):
    for vid in tqdm(sorted(os.listdir(os.path.join(direc, key_num)))):
        directory = os.path.join(direc, key_num, vid)

        ocr_toolkit(directory, key_num)

    key_output = os.path.join(output, key_num)
    if not os.path.isdir(key_output):
        # The group had no video folders, so there is nothing to archive.
        continue
    # make_archive returns the path of the archive it created.
    archive_path = shutil.make_archive(key_output, 'zip', key_output)

    # Wandb
    artifact = wandb.Artifact(name = "ocr", type = "dataset")
    artifact.add_file(archive_path)
    run.log_artifact(artifact)

    # Free Python garbage and PyTorch's cached CUDA memory before the next group.
    gc.collect()
    torch.cuda.empty_cache()