In [13]:
from transformers import AutoTokenizer, AutoModel, logging
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import re
import torch
import gc


# Report the installed PyTorch version and whether this build was compiled
# against CUDA (torch.version.cuda is None for CPU-only builds).
has_cuda_build = torch.version.cuda is not None
print(torch.__version__)
print("Built with CUDA support:", has_cuda_build)

2.5.1
Built with CUDA support: True


In [2]:

# Load the GOT-OCR2 tokenizer and model by their Hugging Face Hub identifier.
# trust_remote_code=True lets transformers run the code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
# CPU-only alternative: drop device_map='cuda' above and the .cuda() call below.
model.eval()           # switch to inference mode (returns the same module)
model = model.cuda()   # ensure the weights live on the GPU





In [None]:
# ─── SUPPRESS TRANSFORMERS WARNINGS ───────────────────────────────────────
# Reduce verbosity to hide pad_token_id and attention_mask warnings
logging.set_verbosity_error()

# ─── CONFIG ────────────────────────────────────────────────────────────────
local_dir  = "got_ocr2_0"            # path to your GOT‑OCR2 model folder
frames_dir = Path.cwd() / "frames"   # directory containing video_* folders
output_csv = Path.cwd() / "ocr_results.csv"  # where to save the CSV

# ─── RELEASE ANY PREVIOUSLY LOADED MODEL ───────────────────────────────────
# `model = AutoModel.from_pretrained(...)` rebinds the name only after the new
# weights are loaded, so a model left over from an earlier cell would still
# occupy GPU memory during the load. With device_map="auto" that can push part
# of the new model onto the CPU. Dropping the old references first avoids this
# (assuming nothing else in the session still references the old model).
model = None
tokenizer = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# ─── LOAD MODEL ────────────────────────────────────────────────────────────
# Load tokenizer and model from the local folder configured above
tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
model     = AutoModel.from_pretrained(
    local_dir,
    trust_remote_code=True,
    use_safetensors=True,
    device_map="auto",
    pad_token_id=tokenizer.eos_token_id  # set pad_token_id in config
)
# Set it on the live config as well, in case the kwarg above was not applied
model.config.pad_token_id = tokenizer.eos_token_id
model.eval()  # inference mode

def clear_cache():
    """Run Python garbage collection, then empty the CUDA cache when CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def frame_index(path: Path):
    """Return the integer formed by the digits directly before a trailing ``.jpg``.

    Only ``path.name`` is inspected. Returns ``None`` when the file name has no
    digits immediately in front of a lower-case ``.jpg`` suffix.
    """
    match = re.search(r"(\d+)(?=\.jpg$)", path.name)
    if match is None:
        return None
    return int(match.group(1))

# ─── MAIN OCR LOOP ─────────────────────────────────────────────────────────
def _frame_sort_key(path):
    """Sort key for a frame file: its parsed frame number, or -1 when it has none."""
    idx = frame_index(path)
    # Explicit None test: `frame_index(p) or -1` would also turn a genuine
    # frame index of 0 into -1, making it tie with files that have no index.
    return -1 if idx is None else idx

records = []  # one dict per processed frame: video, frame, text
video_dirs = sorted(frames_dir.glob("video_*"))
print(f"Found {len(video_dirs)} video directories: {[d.name for d in video_dirs]}")

# Inference only: disable gradient tracking for the whole run
with torch.no_grad():
    for video_dir in tqdm(video_dirs, desc="Videos", unit="video"):
        clear_cache()
        # Process frames in numeric order of the index embedded in the file name
        frames = sorted(video_dir.glob("*.jpg"), key=_frame_sort_key)
        for frame_path in tqdm(frames, desc=f" Frames in {video_dir.name}", unit="frame"):
            clear_cache()
            try:
                result = model.chat(
                    tokenizer,
                    str(frame_path),
                    ocr_type="ocr"
                )
            except Exception as e:
                # Best effort: report the failing frame and keep going, marking the row
                print(f"[WARN] {video_dir.name}/{frame_path.name} failed: {e}")
                result = "<ERROR>"
            records.append({"video": video_dir.name, "frame": frame_path.name, "text": result})

# ─── SAVE RESULTS ──────────────────────────────────────────────────────────
df = pd.DataFrame(records, columns=["video", "frame", "text"])
df.to_csv(output_csv, index=False)
print(f"\n Finished! Wrote {len(df)} rows to {output_csv}")


Some parameters are on the meta device because they were offloaded to the cpu.


Found 3 video directories: ['video_1', 'video_2', 'video_3']


 Frames in video_1: 100%|██████████| 95/95 [03:02<00:00,  1.92s/frame]
 Frames in video_2: 100%|██████████| 108/108 [01:13<00:00,  1.48frame/s]
 Frames in video_3: 100%|██████████| 193/193 [02:24<00:00,  1.34frame/s]
Videos: 100%|██████████| 3/3 [06:40<00:00, 133.37s/video]


✔ Finished! Wrote 396 rows to c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\OCR_impl\ocr_results.csv





In [3]:
from transformers import AutoTokenizer, AutoModel, logging
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import re
import torch
import gc

# ─── CONFIG ────────────────────────────────────────────────────────────────
# Silence transformers warnings; only errors are logged
logging.set_verbosity_error()

cwd        = Path.cwd()
local_dir  = cwd  / "got_ocr2_0"   # path to your GOT‑OCR2 model folder
frames_dir = cwd  / "frames"       # directory containing the per-video frame folders
output_csv = cwd  / "ocr_results.csv"  # where to save the CSV

# ─── RELEASE ANY PREVIOUSLY LOADED MODEL ───────────────────────────────────
# `model = AutoModel.from_pretrained(...)` rebinds the name only after the new
# weights are loaded, so a model left over from an earlier cell would still
# occupy GPU memory during the load. With device_map="auto" that can push part
# of the new model onto the CPU. Dropping the old references first avoids this
# (assuming nothing else in the session still references the old model).
model = None
tokenizer = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# ─── LOAD MODEL ────────────────────────────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
model     = AutoModel.from_pretrained(
    local_dir,
    trust_remote_code=True,
    use_safetensors=True,
    device_map="auto",
    pad_token_id=tokenizer.eos_token_id
)
# Set it on the live config as well, in case the kwarg above was not applied
model.config.pad_token_id = tokenizer.eos_token_id
model.eval()  # inference mode

def clear_cache():
    """Collect Python garbage and, if CUDA is available, empty the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def frame_index(path: Path):
    """Return the integer formed by the digits directly before a trailing ``.jpg``.

    Only ``path.name`` is inspected. Returns ``-1`` when the file name has no
    digits immediately in front of a lower-case ``.jpg`` suffix.
    """
    found = re.search(r"(\d+)(?=\.jpg$)", path.name)
    if not found:
        return -1
    return int(found.group(1))

# ─── MAIN OCR LOOP ─────────────────────────────────────────────────────────
records = []  # one dict per processed frame: video, frame, text

# Stop right away when the frames folder is missing
if not frames_dir.exists():
    raise FileNotFoundError(f"Frames directory not found: {frames_dir}")

# Keep only sub-folders named hate_video_<n> or non_hate_video_<n>,
# ordered by their trailing number <n>
video_dirs = [
    d
    for d in frames_dir.iterdir()
    if d.is_dir() and re.match(r"(?:non_)?hate_video_\d+$", d.name)
]
video_dirs.sort(key=lambda d: int(re.search(r"(\d+)$", d.name).group(1)))
print(f"Found {len(video_dirs)} video directories: {[d.name for d in video_dirs]}")

# Inference only: no gradient tracking for the whole run
with torch.no_grad():
    for video_dir in tqdm(video_dirs, desc="Videos", unit="video"):
        clear_cache()
        # Collect this video's .jpg frames and order them by parsed frame number
        frames = list(video_dir.glob("*.jpg"))
        frames.sort(key=frame_index)
        frame_bar = tqdm(frames, desc=f"Frames in {video_dir.name}", unit="frame", leave=False)
        for frame_path in frame_bar:
            clear_cache()
            try:
                # Run OCR on the frame image
                result = model.chat(tokenizer, str(frame_path), ocr_type="ocr")
            except Exception as e:
                # Best effort: report the failing frame and store empty text for it
                print(f"[WARN] {video_dir.name}/{frame_path.name} failed: {e}")
                result = ""
            # One row per frame, with surrounding whitespace trimmed from the text
            row = {
                "video": video_dir.name,
                "frame": frame_path.name,
                "text":  result.strip(),
            }
            records.append(row)

# ─── SAVE RESULTS ──────────────────────────────────────────────────────────
df = pd.DataFrame(records, columns=["video", "frame", "text"])
df.to_csv(output_csv, index=False)
print(f"\n✓ Finished! Wrote {len(df)} rows to {output_csv}")


Found 2 video directories: ['hate_video_1', 'non_hate_video_425']


Videos: 100%|██████████| 2/2 [02:28<00:00, 74.20s/video] 


✓ Finished! Wrote 183 rows to c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\OCR_impl\ocr_results.csv





In [4]:
# Display the OCR results table assembled by the OCR cell (columns: video, frame, text).
df

Unnamed: 0,video,frame,text
0,hate_video_1,video_1_frame0.jpg,20935021/36129/20150530 11:12
1,hate_video_1,video_1_frame30.jpg,"Along time ago in a galaxy far, faraway. . ."
2,hate_video_1,video_1_frame60.jpg,"Along time ago in a galaxy far, faraway. . ."
3,hate_video_1,video_1_frame90.jpg,"Along time ago in a galaxy far, faraway. . ."
4,hate_video_1,video_1_frame120.jpg,"Along time ago in a galaxy far, faraway. . ."
...,...,...,...
178,non_hate_video_425,video_425_frame2490.jpg,12
179,non_hate_video_425,video_425_frame2520.jpg,5
180,non_hate_video_425,video_425_frame2550.jpg,COPA
181,non_hate_video_425,video_425_frame2580.jpg,5
