# DeepSeek OCR Setup Guide

## Installation Steps

In [1]:
!git clone https://github.com/deepseek-ai/DeepSeek-OCR.git
%cd DeepSeek-OCR
!pip install -r requirements.txt
!pip install transformers pillow
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
!pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
!pip install -r requirements.txt
!pip install flash-attn==2.7.3 --no-build-isolation

Cloning into 'DeepSeek-OCR'...
remote: Enumerating objects: 46, done.[K
remote: Total 46 (delta 0), reused 0 (delta 0), pack-reused 46 (from 1)[K
Receiving objects: 100% (46/46), 7.79 MiB | 31.03 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/DeepSeek-OCR
Collecting transformers==4.46.3 (from -r requirements.txt (line 1))
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.20.3 (from -r requirements.txt (line 2))
  Downloading tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting PyMuPDF (from -r requirements.txt (line 3))
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting img2pdf (from -r requirements.txt (line 4))
  Downloading img2pdf-0.6.3-py3-none-any.whl.metadata (13 kB)
Collecting addict (from -r requireme

In [2]:
!unzip '/content/imagees.zip' -d '/content/test images'


Archive:  /content/imagees.zip
   creating: /content/test images/imagees/
  inflating: /content/test images/imagees/5949345152205785880_121.jpg  
  inflating: /content/test images/imagees/5949345152205785881_121.jpg  
  inflating: /content/test images/imagees/5949345152205785882_121.jpg  
  inflating: /content/test images/imagees/5949345152205785883_121.jpg  
  inflating: /content/test images/imagees/5949345152205785884_121.jpg  
  inflating: /content/test images/imagees/5949345152205785885_121.jpg  
  inflating: /content/test images/imagees/5949345152205785886_121.jpg  
  inflating: /content/test images/imagees/5949345152205785887_121.jpg  
  inflating: /content/test images/imagees/5949345152205785888_121.jpg  
  inflating: /content/test images/imagees/5949345152205785889_121.jpg  
  inflating: /content/test images/imagees/5949345152205785890_121.jpg  
  inflating: /content/test images/imagees/5949345152205785891_121.jpg  
  inflating: /content/test images/imagees/5949345152205785892_

# images enhancment

In [8]:
import os
from PIL import Image, ImageEnhance
from pathlib import Path

def preprocess_image(image_path, output_folder):
    try:
        img = Image.open(image_path)

        if img.mode != 'RGB':
            img = img.convert('RGB')

        enhancer = ImageEnhance.Sharpness(img)
        img = enhancer.enhance(2.0)

        enhancer = ImageEnhance.Contrast(img)
        img = enhancer.enhance(1.5)

        image_name = os.path.basename(image_path)
        enhanced_path = os.path.join(output_folder, image_name)
        img.save(enhanced_path, quality=95)

        print(f"Processed: {image_name}")
        return enhanced_path

    except Exception as e:
        print(f"Error: {image_path} - {e}")
        return None

input_folder = "/content/test images/imagees"
output_folder = "/content/test_images_enhanced"
Path(output_folder).mkdir(parents=True, exist_ok=True)

image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG']
all_images = []
for ext in image_extensions:
    all_images.extend(Path(input_folder).glob(ext))

print(f"Found {len(all_images)} images")
print("="*50)

for image_path in all_images:
    preprocess_image(str(image_path), output_folder)

print("="*50)
print(f"Done! Enhanced images saved in: {output_folder}")


Found 34 images
Processed: 5949345152205785900_121.jpg
Processed: 5949345152205785903_121.jpg
Processed: 5949345152205785887_121.jpg
Processed: 5949345152205785888_121.jpg
Processed: 5949345152205785882_121.jpg
Processed: 5949345152205785896_121.jpg
Processed: 5949345152205785904_121.jpg
Processed: 5949345152205785907_121.jpg
Processed: 5949345152205785891_121.jpg
Processed: 5949345152205785890_121.jpg
Processed: 5949345152205785884_121.jpg
Processed: 5949345152205785897_121.jpg
Processed: 5949345152205785880_121.jpg
Processed: 5949345152205785908_121.jpg
Processed: 5949345152205785895_121.jpg
Processed: 5949345152205785911_121.jpg
Processed: 5949345152205785883_121.jpg
Processed: 5949345152205785902_121.jpg
Processed: 5949345152205785889_121.jpg
Processed: 5949345152205785909_121.jpg
Processed: 5949345152205785906_121.jpg
Processed: 5949345152205785893_121.jpg
Processed: 5949345152205785894_121.jpg
Processed: 5949345152205785885_121.jpg
Processed: 5949345152205785881_121.jpg
Processed

# DeepSeek OCR - Model Loading

## Load Model and Tokenizer

In [2]:
from transformers import AutoModel, AutoTokenizer
import torch

# تحميل المودل
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-OCR', trust_remote_code=True)
model = AutoModel.from_pretrained(
    'deepseek-ai/DeepSeek-OCR',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).eval().cuda()

print("Model loaded! Running OCR...")

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.
Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at deepseek-ai/DeepSeek-OCR and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded! Running OCR...


# text extracting and cleaning

In [7]:
import pandas as pd
import re
import torch
import gc
from pathlib import Path
from PIL import Image
import sys
from io import StringIO
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)

df = pd.DataFrame(columns=['image_name', 'extracted_text', 'cleaned_text', 'status'])

enhanced_folder = "/content/test_images_enhanced"
image_files = list(Path(enhanced_folder).glob('*.jpg')) + list(Path(enhanced_folder).glob('*.png'))

print(f"Found {len(image_files)} images\n")

for idx, image_path in enumerate(image_files, 1):
    image_name = image_path.name
    print(f"[{idx}/{len(image_files)}] {image_name}")

    old_stdout = sys.stdout
    captured_output = None

    try:
        torch.cuda.empty_cache()
        gc.collect()

        with time_limit(30):
            captured_output = StringIO()
            sys.stdout = captured_output

            try:
                result = model.infer(
                    tokenizer,
                    prompt="<image>\nFree OCR.",
                    image_file=str(image_path),
                    output_path="/content/output",
                    base_size=512,
                    crop_mode=False,
                    save_results=False
                )
            finally:
                sys.stdout = old_stdout

            printed_text = captured_output.getvalue()

            extracted = printed_text
            if result:
                extracted += "\n" + str(result)

            cleaned = re.sub(r'directly resize|BASE:.*|NO PATCHES|PATCHES:.*|={5,}|torch\.Size.*|The attention.*|Setting `pad.*', '', extracted, flags=re.MULTILINE)
            cleaned = re.sub(r'[^\w\s\u0600-\u06FF.,!?-]', ' ', cleaned)
            cleaned = re.sub(r'\b[a-zA-Z\u0600-\u06FF]\b', ' ', cleaned)
            cleaned = re.sub(r'\s+', ' ', cleaned)
            cleaned = cleaned.strip()

            if len(cleaned) > 10:
                status = "SUCCESS"
                print(f"  SUCCESS: {cleaned[:50]}...")
            else:
                status = "EMPTY"
                print(f"  EMPTY")

            new_row = pd.DataFrame([{
                'image_name': image_name,
                'extracted_text': extracted,
                'cleaned_text': cleaned,
                'status': status
            }])
            df = pd.concat([df, new_row], ignore_index=True)

            del result, captured_output

    except TimeoutException:
        sys.stdout = old_stdout
        print(f"  TIMEOUT")
        new_row = pd.DataFrame([{
            'image_name': image_name,
            'extracted_text': 'TIMEOUT',
            'cleaned_text': 'TIMEOUT',
            'status': 'FAIL'
        }])
        df = pd.concat([df, new_row], ignore_index=True)

    except Exception as e:
        sys.stdout = old_stdout
        print(f"  ERROR: {e}")
        new_row = pd.DataFrame([{
            'image_name': image_name,
            'extracted_text': f'ERROR: {str(e)}',
            'cleaned_text': 'ERROR',
            'status': 'FAIL'
        }])
        df = pd.concat([df, new_row], ignore_index=True)

print("\n" + "="*50)
print(f"Total: {len(df)}")
print(f"Success: {len(df[df['status'] == 'SUCCESS'])}")
print(f"Empty: {len(df[df['status'] == 'EMPTY'])}")
print(f"Failed: {len(df[df['status'] == 'FAIL'])}")

df.to_csv('ocr_results_cleaned.csv', index=False, encoding='utf-8-sig')
print("\nSaved: ocr_results_cleaned.csv")


Found 34 images

[1/34] 5949345152205785900_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: خصوصات قويه من نابس ون علي المنتجات الأفضل مبيعا ب...
[2/34] 5949345152205785903_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: كاس مصر Top Latest People Media Lists 5 118 151 11...
[3/34] 5949345152205785887_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Music Chats News Music Love ThoN... 2d If you clos...
[4/34] 5949345152205785888_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: . . r0ktech 3d When you re coding with music and b...
[5/34] 5949345152205785882_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Plant Based News PlantB... 1d LOVE this sign. Plan...
[6/34] 5949345152205785896_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Iman 49cashmere 2s Replying to Snipergoat8675 Herb...
[7/34] 5949345152205785904_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: كاس مصر Top Latest People Media Lists Gamal Gamalx...
[8/34] 5949345152205785907_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: People View all قوائد الخس الصحية - مفيد لتنزيل ال...
[9/34] 5949345152205785891_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: For You Trending News Sports Enterta 257 17.4K 81....
[10/34] 5949345152205785890_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Putin and Russia From acorns to mighty oaks ... 17...
[11/34] 5949345152205785884_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: technology Top Latest People Media Lists technical...
[12/34] 5949345152205785897_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Karin Magana HAUSA_PRO... 2m ShareX Is Redefining ...
[13/34] 5949345152205785880_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Gully Foyle UKTrade Good morning nowak_paul - seem...
[14/34] 5949345152205785908_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Ezzat443 1h لیلی_عبداللطيف كاذبه ورب الكعبه قال ال...
[15/34] 5949345152205785895_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: DNG Comics dngcomics 3d Imagine the feeling of rea...
[16/34] 5949345152205785911_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: ليلي_عبداللطيف Samestery307 9h ليلي_عبداللطيف إن س...
[17/34] 5949345152205785883_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: technology Top - 410 - 12.2K - 7.3K - 1.1B - 587K ...
[18/34] 5949345152205785902_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  TIMEOUT
[19/34] 5949345152205785889_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Putin and Russia Mewtenant Spot sleepawa... 17m Re...
[20/34] 5949345152205785909_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: ليلي_عبداللطيف Top Latest People Media Lists almas...
[21/34] 5949345152205785906_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  TIMEOUT
[22/34] 5949345152205785893_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: For You Trending News Sports Enter Sports Fabrizio...
[23/34] 5949345152205785894_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Who to follow - Mohamed Alrefaee follows شبكة رصد ...
[24/34] 5949345152205785885_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Python Developer Py... 18 Dec Layers of AI Artific...
[25/34] 5949345152205785881_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Food Blogger foodietechl... 2d What will you pair ...
[26/34] 5949345152205785910_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: 7 أشياء كان النبي ﷺ يفعلها قبل النوم - كان النبي ﷺ...
[27/34] 5949345152205785901_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: هشتاقك_بسعر_مميز... Top Latest People Media Lists ...
[28/34] 5949345152205785912_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Top Latest People Media Lists میز rere2062004 6h ل...
[29/34] 5949345152205785892_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: For You Trending 1.3K 5.6K 7.7.6K 4.6M News Sports...
[30/34] 5949345152205785913_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  TIMEOUT
[31/34] 5949345152205785886_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Anissa SimplyAnnis 35m young woman at night music ...
[32/34] 5949345152205785898_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: breakingNews china technology Industry Economy She...
[33/34] 5949345152205785899_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: Ser Senseiy YF John Whick Not the AI saying exactl...
[34/34] 5949345152205785905_121.jpg


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  SUCCESS: كاس مصر Top Latest People Media Lists مربى درجة با...

Total: 34
Success: 31
Empty: 0
Failed: 3

Saved: ocr_results_cleaned.csv


# classyfaing using sentence transformers

In [None]:
!pip uninstall -y transformers peft accelerate
!pip install transformers==4.46.3 peft==0.13.2 accelerate==1.1.1

In [1]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer, util
import torch
print("Loading classification model...")
classifier_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')


Loading classification model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔍 Classifying texts...


NameError: name 'df' is not defined

In [6]:
import pandas as pd
df=pd.read_csv('/content/ocr_results_cleaned.csv')
df.head()


Unnamed: 0,image_name,extracted_text,cleaned_text,status
0,5949345152205785900_121.jpg,directly resize\n=====================\nBASE: ...,خصوصات قويه من نابس ون علي المنتجات الأفضل مبي...,SUCCESS
1,5949345152205785903_121.jpg,directly resize\n=====================\nBASE: ...,كاس مصر Top Latest People Media Lists 5 118 15...,SUCCESS
2,5949345152205785887_121.jpg,directly resize\n=====================\nBASE: ...,Music Chats News Music Love ThoN... 2d If you ...,SUCCESS
3,5949345152205785888_121.jpg,directly resize\n=====================\nBASE: ...,. . r0ktech 3d When you re coding with music a...,SUCCESS
4,5949345152205785882_121.jpg,directly resize\n=====================\nBASE: ...,Plant Based News PlantB... 1d LOVE this sign. ...,SUCCESS


In [7]:
def classify_with_embeddings(text, categories):
    text_embedding = classifier_model.encode(text, convert_to_tensor=True)
    category_embeddings = classifier_model.encode(categories, convert_to_tensor=True)

    similarities = util.cos_sim(text_embedding, category_embeddings)[0]

    scores = similarities.cpu().tolist()
    results = list(zip(categories, scores))
    results.sort(key=lambda x: x[1], reverse=True)

    return results[0][0], results[0][1]

categories = [
    "تكنولوجيا", "رياضة", "سياسة", "صحة", "طعام",
    "تعليم", "اقتصاد", "سياحة", "دين", "أدب",
    "فن", "علوم", "تاريخ", "قانون", "بيئة",
    "مجتمع", "سيارات", "أزياء", "تسويق", "عقارات", "إعلانات"
]

print("="*50)
print("Classifying texts...")
print("="*50)

df['category'] = None
df['confidence_score'] = None

for idx, row in df.iterrows():
    text = row['cleaned_text']

    if text and text.strip() and text != 'TIMEOUT' and text != 'ERROR':
        category, score = classify_with_embeddings(text, categories)
        df.at[idx, 'category'] = category
        df.at[idx, 'confidence_score'] = round(score, 4)

        print(f"{row['image_name']} -> {category} ({score*100:.1f}%)")
    else:
        df.at[idx, 'category'] = "unknown"
        df.at[idx, 'confidence_score'] = 0.0
        print(f"{row['image_name']} -> No text found")

print("\n" + "="*50)
print("Classification Complete!")
print("="*50)

print("\nFinal DataFrame:")
print(df[['image_name', 'category', 'confidence_score']])

print("\nSummary by Category:")
print(df['category'].value_counts())

df.to_csv('/content/classification_results.csv', index=False, encoding='utf-8-sig')
df.to_excel('/content/classification_results.xlsx', index=False)

print("\nResults saved!")


Classifying texts...
5949345152205785900_121.jpg -> صحة (41.7%)
5949345152205785903_121.jpg -> مجتمع (31.4%)
5949345152205785887_121.jpg -> أدب (27.5%)
5949345152205785888_121.jpg -> فن (32.6%)
5949345152205785882_121.jpg -> طعام (44.9%)
5949345152205785896_121.jpg -> مجتمع (34.0%)
5949345152205785904_121.jpg -> دين (31.7%)
5949345152205785907_121.jpg -> صحة (44.9%)
5949345152205785891_121.jpg -> رياضة (25.1%)
5949345152205785890_121.jpg -> سياسة (25.6%)
5949345152205785884_121.jpg -> تكنولوجيا (43.8%)
5949345152205785897_121.jpg -> تكنولوجيا (39.8%)
5949345152205785880_121.jpg -> طعام (36.4%)
5949345152205785908_121.jpg -> دين (53.7%)
5949345152205785895_121.jpg -> فن (38.5%)
5949345152205785911_121.jpg -> مجتمع (56.4%)
5949345152205785883_121.jpg -> تكنولوجيا (54.8%)
5949345152205785902_121.jpg -> No text found
5949345152205785889_121.jpg -> سياسة (19.9%)
5949345152205785909_121.jpg -> مجتمع (41.1%)
5949345152205785906_121.jpg -> No text found
5949345152205785893_121.jpg -> رياضة (37

# **Trying another method**
# model loading

In [9]:
!pip install easyocr

import easyocr
reader = easyocr.Reader(['ar', 'en'])

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting ninja (from easyocr)
  Using cached ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (978 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.2/978.2 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [11]:
import easyocr
import pandas as pd
import re
import gc
import torch
from pathlib import Path

reader = easyocr.Reader(['ar', 'en'], gpu=True)

def clean_text(text):
    if not text: return ""

    replacements = {
        '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
        '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9'
    }
    for arabic_num, english_num in replacements.items():
        text = text.replace(arabic_num, english_num)

    text = re.sub(r'directly resize|BASE:.*|NO PATCHES|PATCHES:.*|={5,}|torch\.Size.*|The attention.*|Setting `pad.*', '', text, flags=re.MULTILINE)

    text = re.sub(r'[^\w\s\u0600-\u06FF.,!?-]', ' ', text)

    text = re.sub(r'\b[a-zA-Z\u0600-\u06FF]\b', ' ', text)

    text = re.sub(r'\s+', ' ', text)

    return text.strip()

df = pd.DataFrame(columns=['image_name', 'extracted_text', 'cleaned_text', 'confidence'])

enhanced_folder = "/content/test_images_enhanced"
image_files = list(Path(enhanced_folder).glob('*.jpg')) + list(Path(enhanced_folder).glob('*.png'))

print(f"Found {len(image_files)} images\n")

for idx, image_path in enumerate(image_files, 1):
    image_name = image_path.name
    print(f"[{idx}/{len(image_files)}] {image_name}")

    try:
        results = reader.readtext(str(image_path), detail=1, paragraph=False)

        full_text = []
        confidences = []

        for (bbox, text, prob) in results:
            full_text.append(text)
            confidences.append(prob)

        extracted = " ".join(full_text)

        avg_conf = sum(confidences) / len(confidences) if confidences else 0.0

        cleaned = clean_text(extracted)

        if len(cleaned) > 10:
            print(f"  SUCCESS ({avg_conf:.2f}) {cleaned[:50]}...")
        else:
            print(f"  EMPTY")

        new_row = pd.DataFrame([{
            'image_name': image_name,
            'extracted_text': extracted,
            'cleaned_text': cleaned,
            'confidence': round(avg_conf, 4)
        }])
        df = pd.concat([df, new_row], ignore_index=True)

    except Exception as e:
        print(f"  Error: {e}")
        new_row = pd.DataFrame([{
            'image_name': image_name,
            'extracted_text': 'ERROR',
            'cleaned_text': 'ERROR',
            'confidence': 0.0
        }])
        df = pd.concat([df, new_row], ignore_index=True)

    gc.collect()
    torch.cuda.empty_cache()

print("\n" + "="*50)
print(f"Total: {len(df)}")
print(f"Success: {len(df[df['confidence'] > 0])}")

df.to_csv('easyocr_results.csv', index=False, encoding='utf-8-sig')
print("\nSaved: easyocr_results.csv")


Found 34 images

[1/34] 5949345152205785900_121.jpg
  SUCCESS (0.68) 7 49 0540547... مفيز بسعل هشتاقك 6 Top Latest Peop...
[2/34] 5949345152205785903_121.jpg


  df = pd.concat([df, new_row], ignore_index=True)


  SUCCESS (0.65) 7 49 مصر Top Latest People Media Lists 1218 151 11...
[3/34] 5949345152205785887_121.jpg
  SUCCESS (0.66) 7 29 5 655 music Top Latest People Media Lists Mus...
[4/34] 5949345152205785888_121.jpg
  SUCCESS (0.70) 7 29 5 655 music Top Latest People Media Lists . ....
[5/34] 5949345152205785882_121.jpg
  SUCCESS (0.66) 7 30 755 food 6- Top Latest People Media Lists Pla...
[6/34] 5949345152205785896_121.jpg
  SUCCESS (0.62) 7 33 holiday 6 Top Latest People Media Lists Iman ...
[7/34] 5949345152205785904_121.jpg
  SUCCESS (0.67) 7 49 دصر Top Latest People Media Lists Gamal Gamal...
[8/34] 5949345152205785907_121.jpg
  SUCCESS (0.76) 7 49 755 مصر Top Latest People Media Lists People ...
[9/34] 5949345152205785891_121.jpg
  SUCCESS (0.62) 7 27 5 655 Search For Vou Trending News Sports Ent...
[10/34] 5949345152205785890_121.jpg
  SUCCESS (0.61) 7 28 5 Putin and Russia Top Latest People Media Li...
[11/34] 5949345152205785884_121.jpg
  SUCCESS (0.35) 7 30 5 655 technology 6 Top

# **classyfaing using another encoder**
# loading the model

In [1]:
!pip install -U FlagEmbedding

from FlagEmbedding import FlagModel
import numpy as np

model = FlagModel('BAAI/bge-large-en-v1.5',
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cryptocurrency: 0.6445
science: 0.4614
politics: 0.4578
entertainment: 0.4333
sports: 0.4214


In [3]:
import pandas as pd
df=pd.read_csv('/content/easyocr_results.csv')
df.head()

Unnamed: 0,image_name,extracted_text,cleaned_text,confidence
0,5949345152205785900_121.jpg,7:49 0540547... مفيز بسعل # هشتاقك ٦ Top Lates...,7 49 0540547... مفيز بسعل هشتاقك 6 Top Latest ...,0.6824
1,5949345152205785903_121.jpg,7:49 مصر Top Latest People Media Lists 1218 15...,7 49 مصر Top Latest People Media Lists 1218 15...,0.6548
2,5949345152205785887_121.jpg,7:29 ٥ ٦٥٥ music Top Latest People Media Lists...,7 29 5 655 music Top Latest People Media Lists...,0.6563
3,5949345152205785888_121.jpg,7:29 ٥ ٦٥٥ music Top Latest People Media Lists...,7 29 5 655 music Top Latest People Media Lists...,0.7049
4,5949345152205785882_121.jpg,7:30 ٧٥٥ food ٦- Top Latest People Media Lists...,7 30 755 food 6- Top Latest People Media Lists...,0.6648


# classifying

In [4]:

def classify_bge(text, categories):
    text_emb = model.encode([text])
    cat_embs = model.encode(categories)

    similarities = text_emb @ cat_embs.T
    scores = similarities[0].tolist()

    results = list(zip(categories, scores))
    results.sort(key=lambda x: x[1], reverse=True)

    return results[0][0], results[0][1]

categories = [
    "تكنولوجيا", "رياضة", "سياسة", "صحة", "طعام",
    "تعليم", "اقتصاد", "سياحة", "دين", "أدب",
    "فن", "علوم", "تاريخ", "قانون", "بيئة",
    "مجتمع", "سيارات", "أزياء", "تسويق", "عقارات", "إعلانات"
]

print("Classifying texts...")

df['category'] = None
df['confidence_score'] = None

for idx, row in df.iterrows():
    text = row['cleaned_text']

    if text and text.strip() and text != 'TIMEOUT' and text != 'ERROR':
        category, score = classify_bge(text, categories)

        df.at[idx, 'category'] = category
        df.at[idx, 'confidence_score'] = round(score, 4)

        print(f"{row['image_name']} -> {category} ({score*100:.1f}%)")
    else:
        df.at[idx, 'category'] = "unknown"
        df.at[idx, 'confidence_score'] = 0.0
        print(f"{row['image_name']} -> No text found")

print("\nClassification Complete!")

print("\nFinal DataFrame:")
print(df[['image_name', 'category', 'confidence_score']])

print("\nSummary by Category:")
print(df['category'].value_counts())

df.to_csv('classification_results_bge.csv', index=False, encoding='utf-8-sig')
df.to_excel('classification_results_bge.xlsx', index=False)

print("\nResults saved!")

Classifying texts...
5949345152205785900_121.jpg -> مجتمع (65.4%)
5949345152205785903_121.jpg -> مجتمع (62.0%)
5949345152205785887_121.jpg -> طعام (52.8%)
5949345152205785888_121.jpg -> طعام (51.0%)
5949345152205785882_121.jpg -> فن (46.8%)
5949345152205785896_121.jpg -> سياحة (48.3%)
5949345152205785904_121.jpg -> دين (64.2%)
5949345152205785907_121.jpg -> صحة (64.3%)
5949345152205785891_121.jpg -> دين (45.6%)
5949345152205785890_121.jpg -> تعليم (48.7%)
5949345152205785884_121.jpg -> فن (52.5%)
5949345152205785897_121.jpg -> صحة (45.9%)
5949345152205785880_121.jpg -> فن (48.5%)
5949345152205785908_121.jpg -> اقتصاد (64.2%)
5949345152205785895_121.jpg -> دين (51.5%)
5949345152205785911_121.jpg -> تعليم (65.2%)
5949345152205785883_121.jpg -> فن (43.2%)
5949345152205785902_121.jpg -> مجتمع (57.4%)
5949345152205785889_121.jpg -> طعام (51.7%)
5949345152205785909_121.jpg -> مجتمع (66.7%)
5949345152205785906_121.jpg -> تعليم (62.5%)
5949345152205785893_121.jpg -> فن (58.5%)
5949345152205785