In [None]:
import os

# Run the notebook from the project root so that the relative paths used further
# down ("data/en/...") resolve. The location can be overridden with the
# QUIZ_GENERATION_ROOT environment variable; when it is unset, the original
# hard-coded checkout path is used.
PROJECT_ROOT = os.environ.get(
    "QUIZ_GENERATION_ROOT",
    "/Users/antonioloison/Projects/illuin/quiz-generation",
)
os.chdir(PROJECT_ROOT)

In [None]:
import json
import re
import warnings
from typing import List, Optional, Union

from nltk import sent_tokenize
from tiktoken import Encoding, encoding_for_model
from transformers import AutoTokenizer, PreTrainedTokenizerBase

In [None]:
def get_token_nb(text: str, tokenizer: AutoTokenizer) -> int:
    """Count the tokens that ``tokenizer`` produces for ``text``.

    Args:
        text: String to measure.
        tokenizer: Object exposing ``encode(text)`` whose result supports ``len``.

    Returns:
        The length of ``tokenizer.encode(text)``.
    """
    return len(tokenizer.encode(text))

In [None]:
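# NOTE: earlier draft of `get_splits`, kept commented out for reference only.
# The active implementation is defined in a later cell.
# def get_splits(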
#     transcripts: List[str],
#     tokenizer: AutoTokenizer,
#     max_length: int = 1000,
# ):
#     process_pile = [(transcript, "paragraph") for transcript in transcripts]
#     current_transcript = ""
#     splits = []
#     counter = 0
#     current_length = 0
#     while len(process_pile) > 0:
#         current_length = get_token_nb(current_transcript, tokenizer)

#         if current_length > max_length:
#             sentences = sent_tokenize(current_transcript)
#             process_pile = [(sent, "sentence") for sent in sentences] + process_pile
#             current_transcript = ""

#         else:
#             new_transcript = process_pile[0][0]
#             future_transcript = " ".join([current_transcript, new_transcript])
#             future_length = get_token_nb(future_transcript, tokenizer)
#             if future_length > max_length:
#                 if process_pile[0][1] == "paragraph":
#                     splits.append(current_transcript.strip())
#                     current_transcript = new_transcript
#                     process_pile.pop(0)
#                 elif process_pile[0][1] == "sentence":
#                     words = new_transcript.split()
#                     process_pile = [(word, "word") for word in words] + process_pile[1:]
#                 else:
#                     splits.append(current_transcript.strip())
#                     current_transcript = new_transcript
#                     process_pile.pop(0)

#             else:
#                 current_transcript = future_transcript
#                 process_pile.pop(0)
#         counter += 1
#         if counter > 10000:
#             raise ValueError("Infinite loop")

#     if current_length > 0:
#         splits.append(current_transcript.strip())

#     return splits

In [None]:
# Tokenizer of the "mistralai/Mistral-7B-v0.1" checkpoint; the cells below pass it
# to get_splits / get_token_nb to count tokens.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

In [None]:
def _split_oversized(
    transcripts: List[str],
    tokenizer: "PreTrainedTokenizerBase",
    token_budget: int,
) -> List[str]:
    """Break transcripts longer than ``token_budget`` tokens into smaller pieces.

    A transcript within the budget is kept as is. An oversized transcript is cut
    into sentences with ``sent_tokenize``; a sentence that is itself over the
    budget is further cut into whitespace-separated words.
    """
    pieces: List[str] = []
    for transcript in transcripts:
        if get_token_nb(transcript, tokenizer) <= token_budget:
            pieces.append(transcript)
            continue
        for sentence in sent_tokenize(transcript):
            if get_token_nb(sentence, tokenizer) > token_budget:
                pieces.extend(sentence.split())
            else:
                pieces.append(sentence)
    return pieces


def get_splits(
    transcripts: List[str],
    tokenizer: "PreTrainedTokenizerBase",
    max_length: int = 1000,
) -> List[str]:
    """Greedily pack transcript pieces into space-joined chunks of bounded token length.

    Pieces are accumulated until adding the next one would push the chunk above
    ``max_length - max_length // 10`` tokens (as measured by ``get_token_nb``);
    the chunk is then emitted and a new one is started. The remaining tenth of
    ``max_length`` leaves room for the last step: a trailing chunk shorter than
    ``max_length // 10`` tokens is appended to the previous split instead of
    being returned on its own.

    Args:
        transcripts: Text segments, in order. Segments above the working budget
            are first broken into sentences, then into words if needed.
        tokenizer: Tokenizer handed to ``get_token_nb`` for every length check.
        max_length: Target upper bound, in tokens, for a split.

    Returns:
        The list of splits, in input order. Empty when there is nothing to
        split. When everything fits in a single chunk, that chunk is returned
        as the only split even if it is very short.
    """
    min_tail_length = max_length // 10
    token_budget = max_length - min_tail_length

    pieces = _split_oversized(transcripts, tokenizer, token_budget)
    if not pieces:
        # Nothing to pack; previously this raised IndexError on pieces[0].
        return []

    splits: List[str] = []
    current = pieces[0]
    for piece in pieces[1:]:
        candidate = current + " " + piece
        if get_token_nb(candidate, tokenizer) > token_budget:
            splits.append(current)
            current = piece
        else:
            current = candidate

    # Fold a too-short tail into the previous split, but only when such a split
    # exists; with a single short chunk there is nothing to merge into
    # (previously an IndexError on splits[-1]).
    if splits and get_token_nb(current, tokenizer) < min_tail_length:
        splits[-1] += " " + current
    else:
        splits.append(current)
    return splits

In [None]:
# Smoke test on a few caption-style lines with a small token limit (30), so the
# packing behaviour is visible in the cell output.
get_splits(
    transcripts=[
        "[Music]",
        "okay today we are on the last section of",
        "chapter 9 section 9.3 we're still",
        "talking about energy but today we're not",
        "gonna talk about energy in plants we're",
        "gonna talk about cellular respiration",
        "we're gonna talk about how that process",
        "works and I talked about photosynthesis",
        "we're talking about cellular respiration",
    ],
    tokenizer=tokenizer,
    max_length=30,
)

In [None]:
from quiz_generation.preprocessing.preprocessing import load_txt

In [None]:
# Load every sample transcript (paths are relative to the working directory).
all_transcripts = [
    load_txt(transcript_path)
    for transcript_path in (
        "data/en/biology_high_school_class/transcript.txt",
        "data/en/harvard_transcript/philosophy_lecture.txt",
        "data/en/literature_class/transcript.txt",
        "data/en/mit_videos_transcripts/clustering_transcript.txt",
    )
]

In [None]:
# For each loaded transcript, print the token count of every split produced with
# a 1000-token limit.
# NOTE(review): get_splits iterates over its first argument, so this assumes
# load_txt returns a list of text segments rather than one string — confirm.
for transcript in all_transcripts:
    splits = get_splits(transcript, tokenizer, max_length=1000)
    print([get_token_nb(chunk, tokenizer) for chunk in splits])