In [41]:
import zipfile
import pandas as pd
import numpy as np
from PIL import Image
import io
import cv2  # Optional (if you prefer OpenCV)
import json
import os

#### **MultiOFF**

In [42]:
zip_path = "../external_data/MultiOFF/annotations.zip"

# One parsed DataFrame per CSV member of the archive, keyed by member name.
dataframes = {}

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Keep only the archive members that are CSV files.
        csv_files = list(filter(lambda member: member.endswith('.csv'), zip_ref.namelist()))

        if csv_files:
            for csv_file in csv_files:
                # pandas parses straight from the member's file-like stream.
                with zip_ref.open(csv_file) as member_stream:
                    df = dataframes[csv_file] = pd.read_csv(member_stream)
                    print(f"Successfully read: {csv_file}")
        else:
            print("No CSV files found in the ZIP archive.")

except FileNotFoundError:
    # The archive path is wrong or the data has not been downloaded.
    print(f"Error: The file {zip_path} does not exist.")
except zipfile.BadZipFile:
    # The file exists but is not a readable ZIP archive.
    print("Error: The ZIP file is corrupted or invalid.")

Successfully read: Training_meme_dataset.csv
Successfully read: Testing_meme_dataset.csv
Successfully read: Validation_meme_dataset.csv


In [43]:
# Per-split MultiOFF annotation tables with the "label" column removed
# (the printed heads below show image_name and sentence remaining).
training_multioff = dataframes["Training_meme_dataset.csv"].drop(columns=["label"])
validation_multioff = dataframes["Validation_meme_dataset.csv"].drop(columns=["label"])
test_multioff = dataframes["Testing_meme_dataset.csv"].drop(columns=["label"])


print(training_multioff.head())
print(validation_multioff.head())
print(test_multioff.head())


# Concatenate the three splits into one frame. ignore_index=True already
# gives the result a fresh 0..n-1 index, so no separate reset_index is needed.
all_data = pd.concat([training_multioff, validation_multioff, test_multioff], ignore_index=True)

# Total length of the combined datasets, reported explicitly.

print("\n\n Total length of the datasets combined together:",len(all_data))


        image_name                                           sentence
0  LJ3r8Gy.jpg.png  OFFICIAL BERNIE SANDERS DRINKING GAME ! Every ...
1      qDnIIHA.png  2:28 PM THIS IS A WALL INSIDE A NAZI GAS CHAMB...
2      1JQk5NF.png                o shit waddup ! BERNIE SANDERS COM 
3        iMMNq.png  `` MITT ROMNEY IS THE WORST REPUBLICAN IN THE ...
4      jAi3iI1.png  Anonymous ( ID : duqdA1io a 08/05/16 ( Fri ) 1...
    image_name                                           sentence
0  Xxc4mjq.png  WE LIKE IKE I LIKE IKE FRANK CULOTTA REPUBLICA...
1  ZffTHk2.png                                   Glory to Bern . 
2  TyYDiSx.png  My mom got kicked out of her emotionally abusi...
3  FsAbNtn.png          J. TRUMP DONALD MA DE N MEXIC i RN 47333 
4  ERe2yR0.png  score hidden ] 5 hours ago My friend committed...
    image_name                                           sentence
0  jyxHhiB.png  3 hrs Black nurse in Connecticut asked me if T...
1  we4hhWi.png  I do n't believe that women have any

In [44]:
zip_path = '../external_data/MultiOFF/labelled_images.zip'
subfolder = 'Labelled Images/'  # Must match the exact casing in the ZIP

# Decoded MultiOFF images (PIL, converted to RGB), in archive listing order.
# NOTE(review): file names are not stored alongside the images; pairing an
# image with its annotation row will need the name kept as well -- confirm
# how downstream cells align the two.
image_list_mff = []

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_file:
        # Visit every archive member under the subfolder that has a common
        # image extension (case-insensitive).
        for file in zip_file.namelist():
            if file.startswith(subfolder) and file.lower().endswith(('.png', '.jpg', '.jpeg')):
                try:
                    # Opening the member is inside the try so that a single
                    # unreadable entry is skipped instead of aborting the load.
                    with zip_file.open(file) as img_file:
                        img = Image.open(io.BytesIO(img_file.read())).convert('RGB')
                    image_list_mff.append(img)
                    print(f"Loaded: {file.split('/')[-1]}")  # Show filename without path
                except Exception as e:
                    print(f"Skipped {file}: {str(e)}")
except FileNotFoundError:
    # Same reporting as the annotations archive cell: message, not traceback.
    print(f"Error: The file {zip_path} does not exist.")
except zipfile.BadZipFile:
    print("Error: The ZIP file is corrupted or invalid.")

print(f"\nTotal images loaded: {len(image_list_mff)}")

Loaded: z5sSuk3.png
Loaded: zQPb3nD.png
Loaded: ykGRBWn.png
Loaded: ZMaAKc9.png
Loaded: Zh4CZSe.png
Loaded: zOImoh5.png
Loaded: ZOSUjXw.png
Loaded: zRgnrbT.png
Loaded: z1gozo1.png
Loaded: ZEjNdo0.png
Loaded: yZLLqn4.png
Loaded: zKbUmLB.png
Loaded: yWd3NEB.png
Loaded: ZbqRtuL.png
Loaded: ZwFfyVJ.png
Loaded: zq7bXGu.png
Loaded: ysAk127.png
Loaded: zoIImWJ.png
Loaded: ZWajHQq.png
Loaded: zZIOwqg.png
Loaded: YJ1QbDX.png
Loaded: zFnJ1x3.png
Loaded: zPT9dFf.png
Loaded: zr5VeQM.png
Loaded: yLF1GSc.png
Loaded: zl617iP.png
Loaded: YW7mx62.png
Loaded: Z1VeVXi.png
Loaded: zAUFD7y.png
Loaded: zs6maXI.png
Loaded: y898T2X.png
Loaded: ztdf8dZ.png
Loaded: ZAloCy9.png
Loaded: xGBX7zN.png
Loaded: XMReWCt.png
Loaded: ykaOeQZ.png
Loaded: xYTrJJp.png
Loaded: xtvEjIx.png
Loaded: ZffTHk2.png
Loaded: xrh5RQD.png
Loaded: yiS7xVy.png
Loaded: yGHKtzg.png
Loaded: xLXvr9t.png
Loaded: YIi9Bga.png
Loaded: XYERAZg.png
Loaded: yDO396v.png
Loaded: XwccSUy.png
Loaded: xJiXDnY.png
Loaded: xyi7yMK.png
Loaded: xQkfJse.png


#### **HarMeme**

In [45]:

annotations_dir = '../external_data/HarMeme/mmf/data/datasets/memes/defaults/annotations'

# Parsed JSON records, grouped under the name of the .jsonl file they came from.
data = {}

if os.path.isdir(annotations_dir):
    # Collect every .jsonl file that sits directly in the annotations directory.
    jsonl_files = [entry for entry in os.listdir(annotations_dir) if entry.endswith('.jsonl')]

    if jsonl_files:
        for file_name in jsonl_files:
            file_path = os.path.join(annotations_dir, file_name)
            records = data[file_name] = []

            # Each line of a .jsonl file is parsed as one JSON document;
            # lines that fail to parse are reported and skipped.
            with open(file_path, 'r', encoding='utf-8') as handle:
                for raw_line in handle:
                    try:
                        records.append(json.loads(raw_line))
                    except json.JSONDecodeError as parse_error:
                        print(f"Error parsing line in {file_name}: {str(parse_error)}")

            print(f"Loaded {len(records)} entries from {file_name}")
    else:
        print(f"No .jsonl files found in {annotations_dir}")
else:
    print(f"Directory not found: {annotations_dir}")

Loaded 354 entries from test.jsonl
Loaded 3013 entries from train.jsonl
Loaded 177 entries from val.jsonl


In [46]:
image_dir = '../external_data/HarMeme/mmf/data/datasets/memes/defaults/images'

# Decoded HarMeme images (PIL, converted to RGB), in sorted file-name order.
# NOTE(review): file names are not stored alongside the images -- confirm how
# downstream cells match an image to its annotation record.
images_har = []
valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')

if not os.path.isdir(image_dir):
    # Same reporting as the annotations cell instead of an unhandled exception.
    print(f"Directory not found: {image_dir}")
else:
    # os.listdir returns entries in arbitrary order; sorting makes the list
    # order reproducible across runs and machines.
    for filename in sorted(os.listdir(image_dir)):
        if filename.lower().endswith(valid_extensions):
            img_path = os.path.join(image_dir, filename)
            try:
                # Image.open is lazy and keeps the file handle open. Converting
                # inside the context manager decodes the pixels into a new RGB
                # image and then releases the handle, so thousands of
                # descriptors are not left open. This costs more memory than a
                # lazy open, and matches how image_list_mff is built.
                with Image.open(img_path) as src:
                    img = src.convert('RGB')
                images_har.append(img)
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")

print(f"Loaded {len(images_har)} images.")

Loaded 3544 images.


#### **Fine-Tuning TrOCR with the HarMeme and MultiOFF datasets combined**

In [47]:
# View the combined MultiOFF annotations (training + validation + test splits).
# Note: all_data is built from the MultiOFF CSVs only; it holds no HarMeme rows.
all_data

Unnamed: 0,image_name,sentence
0,LJ3r8Gy.jpg.png,OFFICIAL BERNIE SANDERS DRINKING GAME ! Every ...
1,qDnIIHA.png,2:28 PM THIS IS A WALL INSIDE A NAZI GAS CHAMB...
2,1JQk5NF.png,o shit waddup ! BERNIE SANDERS COM
3,iMMNq.png,`` MITT ROMNEY IS THE WORST REPUBLICAN IN THE ...
4,jAi3iI1.png,Anonymous ( ID : duqdA1io a 08/05/16 ( Fri ) 1...
...,...,...
738,fyAh3I0.png,WHAT IF POKEMON GO WAS RELEASED TO D US FRO TH...
739,CMQqhImUkAAKeno.jpg,Donald Trump 's hair looks like someone tried ...
740,tINjUCc.png,MAMAS Who am I supposed to vote for ? Am l sup...
741,eI2N5iQ.png,EcakpnBeHn rnacoBnl 18 minutes ago We will hav...


In [48]:
# Inspect the MultiOFF images loaded from labelled_images.zip; the repr of each
# entry shows its mode and pixel size.
# NOTE(review): this echoes every element of the list -- consider a slice such
# as image_list_mff[:5] if the output becomes unwieldy.
image_list_mff

[<PIL.Image.Image image mode=RGB size=617x529>,
 <PIL.Image.Image image mode=RGB size=2048x1653>,
 <PIL.Image.Image image mode=RGB size=500x493>,
 <PIL.Image.Image image mode=RGB size=822x551>,
 <PIL.Image.Image image mode=RGB size=960x809>,
 <PIL.Image.Image image mode=RGB size=1439x1393>,
 <PIL.Image.Image image mode=RGB size=497x727>,
 <PIL.Image.Image image mode=RGB size=540x960>,
 <PIL.Image.Image image mode=RGB size=1426x1502>,
 <PIL.Image.Image image mode=RGB size=1440x2392>,
 <PIL.Image.Image image mode=RGB size=750x669>,
 <PIL.Image.Image image mode=RGB size=736x414>,
 <PIL.Image.Image image mode=RGB size=1142x782>,
 <PIL.Image.Image image mode=RGB size=1024x769>,
 <PIL.Image.Image image mode=RGB size=480x446>,
 <PIL.Image.Image image mode=RGB size=500x263>,
 <PIL.Image.Image image mode=RGB size=400x400>,
 <PIL.Image.Image image mode=RGB size=1344x960>,
 <PIL.Image.Image image mode=RGB size=1562x520>,
 <PIL.Image.Image image mode=RGB size=750x804>,
 <PIL.Image.Image image mode