## Import

In [1]:
import anthropic
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted
import glob
import os
from tqdm import tqdm
import base64
from time import sleep
import json
import PIL
import matplotlib.pyplot as plt

In [2]:
# Root folder of the dataset: it holds one numerically-named sub-directory per slideshow
# (the sub-directory names are sorted with int() by the cells below).
DATA_PATH = "data"

In [3]:
# Instruction sent with every slide image (default `prompt` of generate_qa): it asks for a single
# French question/answer pair as a JSON object, or the empty object {} when no suitable pair exists.
prompt="Generate a question/answer pair about this slide.\n* The question must be phrased such that a user could find the slide relevant to the question from a large collection of slides.\n* The question should not be trivial and must need the visual information from the slide to be answered correctly.\n* The question and answer must never refer directly to the slide or assume that only one slide is provided. The slide and the question will be provided to the end user along other slides that can be from other presentations.\n* The question must be phrased without refering to this slide. Instead of saying \"[question] d'après la diapositive ?\" or \"[question] d'après la présentation ?\" the question will be asked directly \"[question] ?\".\n* Both the question and the answer must be in French, in a json format {\"question\": ..., \"answer\": ...}\nIf there is no question/answer pair for the slide that has all the characteristics needed, please return the empty dictionnary {}. Do not overthink it.\nExamples:\n{\"question\": \"Quel est le pourcentage de baisse des achats de dessous en 2013 ?\", \"answer\": \"1,7%\"}\n{\"question\": \"Quels parcs d'attractions se situent dans la catégorie \"Sportif / Sensation\" avec un univers de marque faible ?\", \"answer\": \"Aqualand et Bassin Aventure sont les parcs d'attractions de la catégorie \"Sportif / Sensation\" avec un univers de marque faible.\"}\n{\"question\": \"En programmation, quelles sont les deux structures de traitement conditionnel ?\", \"answer\": \"Les deux structures de traitement conditionnel sont la structure de sélection simple et la structure de sélection multiple.\"}\n{\"question\": \"Quelle est la capitalisation boursière mondiale du Bitcoin en milliards de dollars en avril 2021 ?\", \"answer\": \"La capitalisation boursière mondiale du Bitcoin était de 1 179 milliards de dollars en avril 2021.\"}"

## Preprocess

In [4]:
def get_data(dir, data_path=None) :
    """
    List the slide images of one slideshow, ordered by slide number.

    Slides are expected to be named ``slide_<n>.jpg``; they are sorted on the
    integer ``n`` so that ``slide_10.jpg`` comes after ``slide_2.jpg``.

    Input:
    dir: str-like, name of the slideshow (sub-directory of the data folder)
    data_path: str-like or None, folder containing the slideshows; when None
        the module-level DATA_PATH is used
    Output :
    fnames: list[str], slide paths in ascending slide order (empty list when
        the slideshow contains no .jpg file)
    Raises:
    ValueError: if a .jpg file name does not follow the slide_<n>.jpg pattern
    """
    root = DATA_PATH if data_path is None else data_path

    def slide_number(path) :
        # Work on the bare file name so the key does not depend on which path
        # separator glob returns ("/" on POSIX, "\" on Windows).
        stem, _ = os.path.splitext(os.path.basename(path))
        return int(stem.replace("slide_", ""))

    return sorted(glob.glob(f"{root}/{dir}/*.jpg"), key=slide_number)

In [5]:
def encode_image64(image_path):
    """
    Read a file in binary mode and return its content as a base64 string.

    Input:
    image_path: path of the file to encode
    Output :
    str, base64 text of the file bytes (decoded as UTF-8)
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [6]:
def encode_imgpil(image_path) :
    """
    Open a slide image with Pillow.

    Input:
    image_path: path of the image file
    Output :
    the image object returned by PIL.Image.open; it can be used as a context
    manager so the underlying file is closed once the caller is done with it
    Raises:
    PIL.UnidentifiedImageError: if the file cannot be identified as an image
    """
    # `import PIL` alone does not load the PIL.Image submodule; import it
    # explicitly so this does not depend on another library having loaded it.
    import PIL.Image
    return PIL.Image.open(image_path)

In [7]:
def send_prompt_to_gemini(model, image, prompt) :
    """
    Send the prompt followed by the image to the model and return the reply text.

    Input:
    model: object exposing generate_content(list)
    image: image passed as second element of the request
    prompt: instruction passed as first element of the request
    Output :
    the `text` attribute of the object returned by generate_content
    """
    request_parts = [prompt, image]
    return model.generate_content(request_parts).text

In [8]:
def prompt_to_json(model, img, prompt) :
    """
    Ask the model about one slide and parse its reply as JSON.

    The reply is first parsed as-is. If that fails, a surrounding Markdown code
    fence (three backticks, optionally followed by "json") is stripped and
    parsing is attempted once more.

    Input:
    model: object handed to send_prompt_to_gemini
    img: slide image sent along with the prompt
    prompt: str, instruction sent to the model
    Output :
    the parsed JSON value
    Raises:
    json.JSONDecodeError (a ValueError): if the reply is still not valid JSON
    once the fence is removed. Errors raised by send_prompt_to_gemini propagate
    unchanged.
    """
    text = send_prompt_to_gemini(model, img, prompt)
    try :
        return json.loads(text)
    except json.JSONDecodeError as e :
        # Only the parsing step can raise this; report it and fall back below.
        print(e)

    # Fallback: drop a leading ``` / ```json marker and a trailing ``` marker.
    # json.loads ignores the remaining surrounding whitespace.
    unfenced = text.strip()
    if unfenced.startswith("```") :
        unfenced = unfenced[3:]
        if unfenced[:4].lower() == "json" :
            unfenced = unfenced[4:]
    if unfenced.endswith("```") :
        unfenced = unfenced[:-3]
    return json.loads(unfenced)

In [9]:
def generate_qa(model, prompt=prompt) :
    """
    Generate one question/answer entry per slide for every slideshow not yet processed.

    Slideshows are the sub-directories of DATA_PATH, visited in numeric order
    starting right after index `last_dir` (a global). For each slideshow the
    entries are written to <DATA_PATH>/<dir>/qa_<dir>.json, then `last_dir` is
    incremented and saved to last_dir.txt so an interrupted run can be resumed.

    Input:
    model: object handed to prompt_to_json to query the model
    prompt: str, instruction sent with each slide (defaults to the module-level prompt)
    Output :
    None when every remaining slideshow was processed, or the string
    "Resource Exhausted raised" as soon as ResourceExhausted is raised (the
    slideshow in progress is not saved, so it is redone on the next call).

    A slide whose image cannot be identified, or whose request fails with a
    ValueError twice in a row, is recorded as the string "{}".
    """
    global last_dir
    listdir = sorted(os.listdir(DATA_PATH), key=lambda dir: int(dir))
    for i in tqdm(range(last_dir+1, len(listdir))) :
        dir = listdir[i]
        fnames = get_data(dir)
        qas = []

        for fname in fnames :
            try :
                # The context manager closes the image file once the request
                # is done, instead of keeping one open handle per slide.
                with encode_imgpil(fname) as img :
                    qa = "{}" # placeholder kept when both attempts fail
                    for _ in range(2) : # one try plus one retry on ValueError
                        try :
                            qa = prompt_to_json(model, img, prompt)
                            break
                        except ValueError :
                            continue
                    qas.append(qa)
                sleep(5) # pause between slides before the next request
            except PIL.UnidentifiedImageError : qas.append("{}")
            except ResourceExhausted : return "Resource Exhausted raised"

        # Mode "w" truncates an existing file, so no prior removal is needed.
        json_path = f"{DATA_PATH}/{dir}/qa_{dir}.json"
        with open(json_path, "w") as f :
            json.dump(qas, f)
        last_dir += 1

        # Checkpoint: index of the last slideshow fully written to disk.
        with open("last_dir.txt", "w") as f :
            f.write(str(last_dir))
        

## Execution Q/A

1/ Init model :

In [10]:
# No argument is passed here, so credentials are not set in the notebook itself
# (presumably read from the environment, e.g. GOOGLE_API_KEY — confirm against the library docs).
genai.configure()
# Model handle passed to generate_qa in step 3.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

2-A/ If you are generating question/answer pairs from the beginning, execute this cell:

In [11]:
# generate_qa iterates from index last_dir + 1, so -1 makes it start at the first slideshow.
last_dir = -1

2-B/ If the generation loop was interrupted, execute this cell instead to resume from the last saved checkpoint:

In [13]:
# Restore the resume point: last_dir.txt holds the index written by generate_qa
# after each slideshow it finished.
with open("last_dir.txt", "r") as checkpoint_file :
    saved_index = checkpoint_file.read()
last_dir = int(saved_index)

3/ Generate Q/A cell :

In [14]:
# Processes every slideshow after index `last_dir`. If it returns the string
# "Resource Exhausted raised", the run stopped early and this cell can be run again to resume.
generate_qa(model)

 89%|████████▉ | 75/84 [2:29:26<17:56, 119.56s/it]  


'Resource Exhausted raised'

## Post-processing

Delete slides with empty question/answer :

In [None]:
# For every slideshow: delete the slide images whose question/answer entry is
# empty, keep only the non-empty entries in qa_<dir>.json and update the slide
# count stored under "len" in <dir>.json. `deleted` counts the removed slides.
deleted = 0
for dir in sorted(os.listdir(DATA_PATH), key=lambda dir: int(dir)) :
    with open(f"{DATA_PATH}/{dir}/qa_{dir}.json", "r") as f :
        qas = json.load(f)
    with open(f"{DATA_PATH}/{dir}/{dir}.json", "r") as f:
        meta_data = json.load(f)

    # Entries are matched to slides by position, so both lists must be aligned.
    # Stop before deleting anything if they are not, rather than removing the wrong files.
    sorted_fnames = get_data(dir)
    if len(qas) != len(sorted_fnames) :
        raise ValueError(f"{dir} : {len(qas)} question/answer entries for {len(sorted_fnames)} slides")

    # An entry is empty when it is falsy ({} or "") or the placeholder string "{}".
    idx_empty_qa = [i for i,qa in enumerate(qas) if not qa or qa =="{}"]
    not_empty_qa = [qa for qa in qas if qa and qa != "{}"]
    meta_data["len"] = int(meta_data["len"])-len(idx_empty_qa)
    deleted += len(idx_empty_qa)

    for idx_slide in idx_empty_qa :
        os.remove(sorted_fnames[idx_slide])

    with open(f"{DATA_PATH}/{dir}/{dir}.json", "w") as f :
        json.dump(meta_data, f)
    with open(f"{DATA_PATH}/{dir}/qa_{dir}.json", "w") as f:
        json.dump(not_empty_qa, f)

In [17]:
# Total number of slides removed across all slideshows by the clean-up cell.
deleted

351

Verify that every question/answer pair has the structure {"question", "answer"}:

In [18]:
# Print every entry that is not a JSON object with exactly the keys
# "question" and "answer" (key order is irrelevant for a JSON object).
for dir in sorted(os.listdir(DATA_PATH), key=lambda dir: int(dir)) :
    with open(f"{DATA_PATH}/{dir}/qa_{dir}.json", "r") as f :
        qas = json.load(f)
    for qa in qas :
        # Non-dict entries (list, string, ...) are reported too instead of
        # failing on the missing .keys() method.
        if not isinstance(qa, dict) or set(qa) != {"question", "answer"} :
            print(f"{dir} : {qa}")