# Symbols

Run large language models on memes with symbols

Documentation: 
* API: https://platform.openai.com/docs/api-reference/streaming
* image processing: https://platform.openai.com/docs/guides/vision

In [127]:
import base64
import json
import openai
import os
import pickle
import regex
import time
from IPython.display import HTML

## 1. Set the API key location

In [2]:
# File from which get_openai_key() reads the OpenAI API key.
KEYFILE = ".openaikey_20241031"

In [3]:
def get_openai_key(keyfile):
    """Read an OpenAI API key from a text file.

    The key is taken from the last non-blank line of the file: everything
    after the final "=" on that line, with surrounding whitespace removed.
    A line without "=" is returned whole (stripped).

    Args:
        keyfile: path of the file holding the key.

    Returns:
        The API key as a string.

    Raises:
        ValueError: if the file contains no non-blank line.
    """
    # The context manager closes the file even when reading fails.
    with open(keyfile, "r") as infile:
        # Ignore blank lines so a trailing empty line does not yield an empty key.
        lines = [line for line in infile if line.strip()]
    if not lines:
        raise ValueError(f"no API key found in {keyfile}")
    return lines[-1].split("=")[-1].strip()

In [4]:
# Export the key through the environment; the openai.OpenAI() client further
# down is created without an explicit key, so it presumably picks it up here.
os.environ["OPENAI_API_KEY"] = get_openai_key(KEYFILE)

## 2. Image analysis with ChatGPT

In [9]:
# Directory holding the meme images that are sent to the model.
image_dir = "../../data/symbols/final_dataset"
# Directory in which the pickled ChatGPT responses are stored.
chatgpt_dir = "chatgpt_output"
# Question sent along with every image.
prompt = "What is in this image?"
# Created without arguments, so no API key is passed explicitly here.
client = openai.OpenAI()

In [7]:
def sort_file_names_numerically(list_of_files):
    """Return the file names ordered by the integer value of their stem.

    The stem is the name without its last extension ("12.jpg" -> 12), so
    "2.jpg" sorts before "10.jpg". A name whose stem is not an integer
    makes int() raise ValueError.
    """
    def numeric_stem(file_name):
        stem, _extension = os.path.splitext(file_name)
        return int(stem)

    return sorted(list_of_files, key=numeric_stem)

### 2.1 Perform image analysis 

In [53]:
def encode_image(image_path):
    """Return the contents of a file as a base64 string.

    Args:
        image_path: path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to str.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

In [54]:
def analyze_image(client, image_path, prompt):
    """Send one image together with a text prompt to the "gpt-4o" chat model.

    The image is embedded in the request as a base64 data URL that is always
    labelled "image/jpeg".

    Args:
        client: object providing chat.completions.create().
        image_path: path of the image file to send.
        prompt: text question that accompanies the image.

    Returns:
        Whatever client.chat.completions.create() returns.
    """
    image_data = encode_image(image_path)
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}
    return client.chat.completions.create(model="gpt-4o", messages=[user_message])

In [74]:
def store_response(file_name, response):
    """Pickle one model response into chatgpt_dir.

    The response is written to "<stem>.pkl", where <stem> is file_name
    without its extension ("12.jpg" -> "12.pkl").

    Args:
        file_name: name of the image file the response belongs to.
        response: the object to pickle.
    """
    # Create the output directory up front: the response has already been
    # fetched at this point and would be lost if open() failed on a missing
    # directory.
    os.makedirs(chatgpt_dir, exist_ok=True)
    # Swap only the extension; a substring replace of "jpg" would also touch
    # a "jpg" occurring elsewhere in the name.
    logfile_name = os.path.join(chatgpt_dir, os.path.splitext(file_name)[0] + ".pkl")
    with open(logfile_name, "wb") as logfile:
        pickle.dump(response, logfile)

In [75]:
# Analyze every ".jpg" file in image_dir in numeric file-name order and
# pickle each response.
# Filter before sorting: the numeric sort key calls int() on the file stem, so
# any other directory entry (for example a hidden file) would abort the run.
jpg_file_names = [name for name in os.listdir(image_dir) if name.endswith(".jpg")]
for file_name in sort_file_names_numerically(jpg_file_names):
    image_path = os.path.join(image_dir, file_name)
    response = analyze_image(client, image_path, prompt)
    print(file_name, end=" ")
    store_response(file_name, response)
    # Pause between requests (presumably to stay below the API rate limit).
    time.sleep(1)

2.jpg 3.jpg 4.jpg 5.jpg 6.jpg 7.jpg 8.jpg 9.jpg 10.jpg 11.jpg 12.jpg 13.jpg 14.jpg 15.jpg 16.jpg 17.jpg 18.jpg 19.jpg 20.jpg 21.jpg 22.jpg 23.jpg 24.jpg 25.jpg 26.jpg 27.jpg 28.jpg 29.jpg 30.jpg 31.jpg 32.jpg 33.jpg 34.jpg 35.jpg 36.jpg 37.jpg 38.jpg 39.jpg 40.jpg 41.jpg 42.jpg 43.jpg 44.jpg 46.jpg 47.jpg 48.jpg 49.jpg 50.jpg 51.jpg 52.jpg 53.jpg 54.jpg 55.jpg 56.jpg 57.jpg 58.jpg 59.jpg 61.jpg 62.jpg 63.jpg 64.jpg 65.jpg 66.jpg 67.jpg 68.jpg 69.jpg 70.jpg 71.jpg 72.jpg 73.jpg 74.jpg 75.jpg 76.jpg 77.jpg 78.jpg 79.jpg 80.jpg 81.jpg 82.jpg 83.jpg 84.jpg 85.jpg 86.jpg 87.jpg 88.jpg 89.jpg 90.jpg 91.jpg 92.jpg 93.jpg 94.jpg 95.jpg 96.jpg 97.jpg 98.jpg 99.jpg 100.jpg 101.jpg 102.jpg 103.jpg 104.jpg 106.jpg 107.jpg 108.jpg 109.jpg 111.jpg 112.jpg 113.jpg 114.jpg 115.jpg 116.jpg 117.jpg 118.jpg 119.jpg 120.jpg 121.jpg 122.jpg 123.jpg 124.jpg 125.jpg 126.jpg 127.jpg 128.jpg 129.jpg 130.jpg 131.jpg 132.jpg 133.jpg 134.jpg 135.jpg 136.jpg 137.jpg 138.jpg 139.jpg 140.jpg 141.jpg 142.jpg 143.jpg 

### 2.2 Analyze responses

In [5]:
def get_response_texts():
    """Load the answer text of every pickled response in chatgpt_dir.

    Returns:
        dict mapping each ".pkl" file name, in numeric file-name order, to
        choices[0].message.content of the unpickled response.
    """
    # Filter before sorting: the numeric sort key calls int() on the file
    # stem, so an unrelated directory entry would otherwise raise ValueError.
    pkl_file_names = [name for name in os.listdir(chatgpt_dir)
                      if os.path.splitext(name)[-1] == ".pkl"]
    response_texts = {}
    for file_name in sort_file_names_numerically(pkl_file_names):
        with open(os.path.join(chatgpt_dir, file_name), "rb") as infile:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load files that were written by store_response().
            stored_response = pickle.load(infile)
        response_texts[file_name] = stored_response.choices[0].message.content
    return response_texts

In [16]:
def select_refused_analyses(response_texts, max_length=100):
    """Select the analyses that look like a refusal by the model.

    An analysis counts as refused when its text is shorter than max_length
    characters and contains the word "sorry" in any letter case.

    Args:
        response_texts: dict mapping file names to response texts.
        max_length: exclusive upper bound on the length of a refusal text.

    Returns:
        dict with the (file name, text) pairs that match, in input order.
    """
    refused = {}
    for file_name, text in response_texts.items():
        # casefold() gives a caseless comparison without needing a regex for
        # this fixed word.
        if len(text) < max_length and "sorry" in text.casefold():
            refused[file_name] = text
    return refused

In [41]:
def count_images_with_errors():
    """Count the entries of the "images_with_errors" directory beside image_dir."""
    error_dir = os.path.join(image_dir, "../images_with_errors")
    return len(os.listdir(error_dir))

In [138]:
# Load all stored ChatGPT answers and pick out the ones that look like refusals.
# refused_analyses is also read as a global by check_analyses_for_symbols()
# and show_texts_with_symbols().
response_texts = get_response_texts()
refused_analyses = select_refused_analyses(response_texts)

In [43]:
# Summary: answers that were not refused, refused answers, and the number of
# entries in the images_with_errors directory.
print("number of successful analyses:", len(response_texts) - len(refused_analyses))
print("number of refused analyses:   ", len(refused_analyses))
print("broken image files:           ", count_images_with_errors())

number of successful analyses: 431
number of refused analyses:    96
broken image files:            15


In [19]:
# Name of the JSON symbol database; read_symbol_file() looks for it in the
# parent directory of image_dir.
symbol_file_name = "ontox_dict.json"

def read_symbol_file(symbol_file_name):
    """Load the symbol database from a JSON file.

    Args:
        symbol_file_name: name of the JSON file, located in the parent
            directory of image_dir.

    Returns:
        The parsed JSON content.
    """
    symbol_path = os.path.join(image_dir, "..", symbol_file_name)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(symbol_path, "r", encoding="utf-8") as infile:
        return json.load(infile)

In [22]:
def get_symbols_per_meme(symbol_df):
    """Invert the symbol database into a meme-file -> symbol-title mapping.

    Every entry of symbol_df that has a "Referenced_in_meme" list contributes
    one item per listed meme file, with the entry's "Title" as the value.
    Entries without that key are skipped. When several entries reference the
    same meme file, the title of the last one wins.
    """
    symbols_per_meme = {}
    for symbol in symbol_df:
        entry = symbol_df[symbol]
        if "Referenced_in_meme" not in entry:
            continue
        for meme_file in entry["Referenced_in_meme"]:
            symbols_per_meme[meme_file] = entry["Title"]
    return symbols_per_meme

In [176]:
def check_analyses_for_symbols(response_texts, symbols_per_meme, refused=None):
    """Count, per symbol title, the analyses that mention the meme's symbol.

    For every analysis that is not refused, the symbol title registered for
    "<stem>.png" is looked up in symbols_per_meme and searched for in the
    analysis text as literal text, ignoring case. Memes without a title are
    collected and reported with a single printed message.

    Args:
        response_texts: dict mapping response file names to analysis texts.
        symbols_per_meme: dict mapping "<stem>.png" to a symbol title.
        refused: container of response file names to skip; when omitted the
            notebook-level refused_analyses is used.

    Returns:
        dict mapping each symbol title to the number of analyses mentioning it.
    """
    if refused is None:
        refused = refused_analyses
    matched_memes = {}
    missing_images = []
    for pkl_file_name, text in response_texts.items():
        if pkl_file_name in refused:
            continue
        base_file_name = os.path.splitext(pkl_file_name)[0]
        # Look the title up in the argument that was passed in rather than in
        # an unrelated global.
        symbol_title = symbols_per_meme.get(base_file_name + ".png")
        if not symbol_title:
            missing_images.append(base_file_name)
        # Escape the title: it is plain text, and characters such as "(" or
        # "." must not be interpreted as regular-expression syntax.
        elif regex.search(regex.escape(symbol_title), text, regex.IGNORECASE):
            matched_memes[symbol_title] = matched_memes.get(symbol_title, 0) + 1
    if missing_images:
        print(f"{len(missing_images)} memes not found in symbol database! {missing_images}")
    return matched_memes

In [178]:
def _highlight_matches(text, pattern, color):
    """Wrap every case-insensitive match of pattern in a colored <span>."""
    return regex.sub(
        pattern,
        lambda match: f'<span style="color:{color}">{match.group(0)}</span>',
        text,
        flags=regex.IGNORECASE,
    )


def show_texts_with_symbols(response_texts, symbols_per_meme, refused=None):
    """Display the analyses that mention their meme's symbol title.

    For every analysis that is not refused and whose text contains the symbol
    title registered for "<stem>.png" (literal text, ignoring case), the text
    is rendered as HTML with the title in blue and every occurrence of the
    word "text" in red.

    Args:
        response_texts: dict mapping response file names to analysis texts.
        symbols_per_meme: dict mapping "<stem>.png" to a symbol title.
        refused: container of response file names to skip; when omitted the
            notebook-level refused_analyses is used.
    """
    if refused is None:
        refused = refused_analyses
    for pkl_file_name, text in response_texts.items():
        if pkl_file_name in refused:
            continue
        base_file_name = os.path.splitext(pkl_file_name)[0]
        # Look the title up in the argument that was passed in rather than in
        # an unrelated global.
        symbol_title = symbols_per_meme.get(base_file_name + ".png")
        if not symbol_title:
            continue
        # The title is plain text, so escape regular-expression characters.
        title_pattern = regex.escape(symbol_title)
        if not regex.search(title_pattern, text, regex.IGNORECASE):
            continue
        text = _highlight_matches(text, title_pattern, "blue")
        text = _highlight_matches(text, "text", "red")
        # NOTE(review): the model output is inserted into the HTML unescaped.
        display(HTML(pkl_file_name + ": " + text))

In [177]:
# Read the symbol database, map each meme file to its symbol title and count
# the ChatGPT analyses that mention that title.
symbol_df = read_symbol_file(symbol_file_name)
symbols_per_meme = get_symbols_per_meme(symbol_df)
matched_memes = check_analyses_for_symbols(response_texts, symbols_per_meme)

18 memes not found in symbol database! ['524', '525', '526', '527', '528', '529', '531', '532', '533', '534', '535', '536', '537', '538', '539', '540', '541', '542']


In [181]:
# matched_memes maps a symbol title to the number of memes whose analysis
# mentioned it, so the meme total is the sum of the values.
print("number of memes with a matched symbol:", sum(matched_memes.values()))
print("number of symbols with a matched meme:", len(matched_memes))

number of memes with a matched symbol: 145
number of symbols with a matched meme: 30


Result of manual analysis of output of `show_texts_with_symbols(response_texts, symbols_per_meme)`:
* only 9 of the 145 found symbols are solely based on visual clues
* the other 136 symbol matches are based on text in the memes
* the 9 visually matched memes are numbers 62, 63, 65, 68, 70, 73 (Pepe the Frog), 146, 149 (Kekistan flag) and 391 (LGBT flag)

In [186]:
# show_texts_with_symbols(response_texts, symbols_per_meme)

## 3. Image analysis with Llava:34b

In [149]:
# Directory with one text file per meme holding the Llava output.
llava_dir = "llava_output"
# Not referenced by any visible cell; presumably the prompt that was given to
# Llava when the files in llava_dir were produced — TODO confirm.
llava_prompt = "What is in this image? In particular mention all the text you can find in the image"

In [169]:
def get_llava_response_texts():
    """Load the Llava output of every file in llava_dir.

    Files are read in numeric file-name order. The lines of each file are
    stripped and joined with single spaces.

    Returns:
        dict mapping file names to texts. A ".txt" extension is replaced by
        ".pkl" in the key (presumably so the keys line up with those of the
        ChatGPT responses); other names are kept unchanged.
    """
    llava_response_texts = {}
    for file_name in sort_file_names_numerically(os.listdir(llava_dir)):
        # The with-statement closes the file; no explicit close() is needed.
        with open(os.path.join(llava_dir, file_name), "r") as infile:
            text = " ".join(line.strip() for line in infile)
        # Swap only the extension, not every "txt" substring of the name.
        stem, extension = os.path.splitext(file_name)
        key = stem + ".pkl" if extension == ".txt" else file_name
        llava_response_texts[key] = text
    return llava_response_texts

In [170]:
# Load the Llava answers and pick out the ones that look like refusals.
llava_response_texts = get_llava_response_texts()
llava_refused_analyses = select_refused_analyses(llava_response_texts)

In [171]:
# Summary: answers that were not refused, refused answers, and the number of
# entries in the images_with_errors directory.
print("number of successful analyses:", len(llava_response_texts) - len(llava_refused_analyses))
print("number of refused analyses:   ", len(llava_refused_analyses))
print("broken image files:           ", count_images_with_errors())

number of successful analyses: 527
number of refused analyses:    0
broken image files:            15


In [173]:
# NOTE(review): check_analyses_for_symbols() skips the files listed in the
# global refused_analyses (the ChatGPT refusals), not llava_refused_analyses —
# confirm that comparing on this subset is intended.
llava_matched_memes = check_analyses_for_symbols(llava_response_texts, symbols_per_meme)

18 memes not found in symbol database! ['524.png', '525.png', '526.png', '527.png', '528.png', '529.png', '531.png', '532.png', '533.png', '534.png', '535.png', '536.png', '537.png', '538.png', '539.png', '540.png', '541.png', '542.png']


In [183]:
# llava_matched_memes maps a symbol title to the number of memes whose
# analysis mentioned it, so the meme total is the sum of the values.
print("number of memes with a matched symbol:", sum(llava_matched_memes.values()))
print("number of symbols with a matched meme:", len(llava_matched_memes))

number of memes with a matched symbol: 108
number of symbols with a matched meme: 27


In [187]:
# show_texts_with_symbols(llava_response_texts, symbols_per_meme)