# Read & Write Questions

## Read Data

In [71]:
"""Read Data

Structure:
    1. Imports, Variables, Functions
    2. Load Data

"""

# 1. Imports, Variables, Functions
# imports
import sys, os, numpy as np, pandas as pd
import logging
import re

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# variables

specialization = "mathematics-for-ml"
course = "machine-learning-calculus"

data_path = os.path.join("..", "data", specialization, course)


# functions


# 2. Load Data

In [2]:
all_text = list()

for folder in os.listdir(data_path):
    # week folders
    folder_path = os.path.join(data_path, folder)
    print(folder_path)
    lecture = folder.split("/")[-1]
    lecture = re.sub(r"week-\d+-", "", lecture)
    tag = f"{specialization.replace('-', ' ')}::{course.replace('-', ' ')}::{lecture.replace('-', ' ').replace('_', ' ')}"

    if os.path.isdir(folder_path):
        # lesson folder
        for lesson in os.listdir(folder_path):
            lesson_path = os.path.join(folder_path, lesson)
            for file in os.listdir(lesson_path):

                if file.endswith(".txt"):
                    file_path = os.path.join(lesson_path, file)
                    with open(file_path, "r") as f:
                        text = f.read()
                        all_text.append((text, tag))
logging.info(f"Retrieving data for course {data_path.split('/')[-1]} ")
logging.info(f"Number of files: {len(all_text)}")

2024-07-15 14:31:48,741 - INFO - Retrieving data for course machine-learning-calculus 
2024-07-15 14:31:48,741 - INFO - Number of files: 60


../data/machine-learning/machine-learning-calculus/01_week-1-derivatives-and-optimization
../data/machine-learning/machine-learning-calculus/machine-learning-calculus-syllabus-parsed.json
../data/machine-learning/machine-learning-calculus/03_week-3-optimization-in-neural-networks-and-newtons-method
../data/machine-learning/machine-learning-calculus/02_week-2-gradients-and-gradient-descent


In [3]:
all_text

[("By now you have lots of tools\nto work with derivatives. But the question is, what\nare they useful for, other than calculating rates of change and more specifically, why are they useful\nin machine learning? Well, the main application\nhere in machine learning of derivatives is that they\nare used for optimization. Optimization is when\nyou want to find the maximum or the minimum\nvalue of a function. This is very important\nin machine learning because in machine learning, you want to find the model\nthat best fits your dataset, and in order to find this model what you do is you calculate an error function\nthat tells you how far are you from an ideal model. When you are able to minimize this error function then\nyou have the best model. Let me show you how. Consider\nthe following example. Imagine that you're\nsitting in a bench in a sauna and you\nstart feeling too hot so you're going\nto try to switch places to find the coldest\nspot on the bench. With you, you have\na thermomet

In [4]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# model_id = "../Meta-Llama-3-8B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )

In [5]:
# import re

# prompt_1 = """Review Text: "%s"

# Task: Your task is to convert this text into Basic Note Type (front/back) Anki flashcards. Prioritize information regarding the imaging features of diseases, unique imaging findings, and methods of differentiating similar disease entities. Ensure that each flashcard is clearly written, and adheres to the specified formatting and reference criteria.

# Formatting Criteria:

# - Construct a table with three columns: "Front", “Back”, "Number".
# - Each row of the "Front" column should contain a single question testing the imaging features of disease, unique imaging findings, and methods of differentiating similar disease entities.
# - The “Back” column should contain the succinct answer to the question in the corresponding row of the “Front” column.
# - The "Number" column will serve to number each row, facilitating feedback.

# Reference Criteria for each "Statement":

# - Each flashcard should test a single concept.
# - Each flashcard MUST be able to stand alone. Include the subject of the flashcard somewhere in the text.
# - Keep ONLY simple, direct questions in the "Front" column.
# - Clear concise language but if required give plenty of context.
# - Output csv format like the example below.
# - Output at least %d rows of question/answers.

# Example:

# Front;Back;Number
# "How is necrotic tissue identified in acute pancreatitis on a CT scan?";"Lack of contrast enhancement.";1
# "Why should people create their own examples?";"Because Jake is too tired to think of good examples.";2

# """

# text, tag = all_text[0]

# # Split the text based on space and newline
# split_text = re.split(r"[\s\n]+", text)

# n_flashcards = int(len(split_text) / 65)
# n_flashcards = max(3, n_flashcards)


# messages = [
#     {
#         "role": "system",
#         "content": "You are a teacher who is writing anki cards for his students.",
#     },
#     {
#         "role": "user",
#         # "content": f"Please write an anki card on the following text:\n{text}",
#         "content": prompt_1 % (text, 20),
#     },
# ]


# input_ids = tokenizer.apply_chat_template(
#     messages, add_generation_prompt=True, return_tensors="pt"
# ).to(model.device)

# print(model.device)

# terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# outputs = model.generate(
#     input_ids,
#     max_new_tokens=1024,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.6,
#     top_p=0.9,
# )
# response = outputs[0][input_ids.shape[-1] :]
# print(tokenizer.decode(response, skip_special_tokens=True))

### Write Questions

In [6]:
"""Write Questions

Structure: 
    1. Imports, Variables, Functions
    2. Load Model
    3. Parse Lecture Material 
    4. Generate Questions
    5. Save to DataFrame
    6. Save to File
"""

# 1. Imports, Variables, Functions
# imports
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os, pandas as pd
from io import StringIO

# variables


prompt_1 = """Review Text: "%s"

Task: Your task is to convert this text into Basic Note Type (front/back) Anki flashcards. Prioritize information regarding the imaging features of diseases, unique imaging findings, and methods of differentiating similar disease entities. Ensure that each flashcard is clearly written, and adheres to the specified formatting and reference criteria.

Formatting Criteria:

- Construct a table with three columns: "Front", “Back”, "Number".
- Each row of the "Front" column should contain a single question testing the imaging features of disease, unique imaging findings, and methods of differentiating similar disease entities.
- The “Back” column should contain the succinct answer to the question in the corresponding row of the “Front” column.
- The "Number" column will serve to number each row, facilitating feedback.

Reference Criteria for each "Statement":

- Each flashcard should test a single concept.
- Each flashcard MUST be able to stand alone. Include the subject of the flashcard somewhere in the text.
- Keep ONLY simple, direct questions in the "Front" column.
- Clear concise language but if required give plenty of context. 
- Output csv format like the example below. 
- Output at least %d rows of question/answers.

Example:

Front;Back;Number
"How is necrotic tissue identified in acute pancreatitis on a CT scan?";"Lack of contrast enhancement.";1
"Why should people create their own examples?";"Because Jake is too tired to think of good examples.";2

"""


# functions
def clean_response(response_text):
    """Clean Response Text
    Args:
        response_text (str): response text from model
    Returns:
        response_df (df): cleaned response df
    """

    # clear response text
    # delete everything before Front;Back;Number
    response_text = response_text[response_text.find("Front;Back;Number") :]

    # Use StringIO to create a file-like object from the string
    _data = StringIO(response_text)

    # Read the data into a pandas dataframe
    df = pd.read_csv(_data, delimiter=";", quotechar='"', on_bad_lines="skip")

    return df


# 2. Load Model
model_id = os.path.join("..", "Meta-Llama-3-8B-Instruct")

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-07-15 14:31:50,238 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]


In [27]:
from tqdm import tqdm

# 3. Parse Lecture Material
errors = list()
first_iteration = True
for i, (text, tag) in tqdm(enumerate(all_text), total=len(all_text)):

    # Split the text based on space and newline
    split_text = re.split(r"[\s\n]+", text)

    n_flashcards = int(len(split_text) / 180)
    n_flashcards = max(2, n_flashcards)

    messages = [
        {
            "role": "system",
            "content": "You are a teacher who is writing anki cards for his students.",
        },
        {
            "role": "user",
            # "content": f"Please write an anki card on the following text:\n{text}",
            "content": prompt_1 % (text, n_flashcards),
        },
    ]

    # 4. Generate Questions

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1] :]
    response = tokenizer.decode(response, skip_special_tokens=True)
    # print(tokenizer.decode(response, skip_special_tokens=True))

    # 5. Save to DataFrame
    if first_iteration:
        df_responses = clean_response(response_text=response)
        df_responses["tag"] = tag
        first_iteration = False
    else:
        try:
            df_response = clean_response(response_text=response)
            df_response["Deck_ID"] = tag
            df_responses = pd.concat([df_responses, df_response])
            del df_response
        except:
            print(f"Error in response {i}")
            errors.append(response)
    # 6. Save to File

  0%|          | 0/60 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  2%|▏         | 1/60 [00:05<05:15,  5.35s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 2/60 [00:11<05:28,  5.66s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  5%|▌         | 3/60 [00:16<05:15,  5.53s/it]The attention mask and the pad token id were not set. As a consequen

In [30]:
df_responses.drop(["Number"], inplace=True, axis=1)
df_responses.to_csv("questions.csv", index=False)

In [35]:
df_responses["tag"].unique()

array(['machine learning::machine learning calculus::01 derivatives and optimization',
       'machine learning::machine learning calculus::03 optimization in neural networks and newtons method',
       'machine learning::machine learning calculus::02 gradients and gradient descent'],
      dtype=object)

In [41]:
df_responses["Deck_ID"].unique()

array(['mathematics for ml::machine learning calculus::01 derivatives and optimization',
       'mathematics for ml::machine learning calculus::03 optimization in neural networks and newtons method',
       'mathematics for ml::machine learning calculus::02 gradients and gradient descent'],
      dtype=object)

In [42]:
for id in df_responses["Deck_ID"].unique():
    print(f"Nº of levels {len(id.split('::'))}")

Nº of levels 3
Nº of levels 3
Nº of levels 3


In [84]:
# clean df from nans
df_responses.dropna(inplace=True)

In [88]:
import pandas as pd
import genanki
import random
import os

# Example DataFrame with questions and answers
output_path_decks = os.path.join("..", "generated_decks")

# Ensure the output directory exists
os.makedirs(output_path_decks, exist_ok=True)


# Function to generate unique model and deck IDs
def generate_id():
    return random.randrange(1 << 30, 1 << 31)


# Define the model for the notes
model_id = generate_id()
my_model = genanki.Model(
    model_id,
    "Simple Model",
    fields=[
        {"name": "Question"},
        {"name": "Answer"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Answer}}',
        },
    ],
    css="""
    .card {
      font-family: arial;
      font-size: 20px;
      text-align: center;
    }
    """,
)

# Create decks
generated_decks = list()
d_decks = dict()
for deck_id in df_responses["Deck_ID"].unique():
    n_levels = len(deck_id.split("::"))
    for level in range(0, n_levels, 1):
        deck_name = "::".join(deck_id.split("::")[: level + 1])
        if deck_name not in generated_decks:
            created_deck = genanki.Deck(generate_id(), deck_name)
            generated_decks.append(deck_name)
            d_decks[deck_name] = created_deck

# Add flashcards to decks
for deck_id in df_responses["Deck_ID"].unique():
    # Query df
    query = f"Deck_ID == '{deck_id}'"
    df_query = df_responses.query(query)

    # Add flashcards to deck
    for index, row in df_query.iterrows():
        front = str(row["Front"]) if row["Front"] is not None else ""
        back = str(row["Back"]) if row["Back"] is not None else ""
        note = genanki.Note(model=my_model, fields=[front, back])
        d_decks[deck_id].add_note(note)

# Create a package and write it to an .apkg file
all_decks = list(d_decks.values())
package = genanki.Package(all_decks)

# Write to file

if specialization is not None:
    package.write_to_file(
        os.path.join(output_path_decks, f"{specialization}.{course}.apkg")
    )
else:
    package.write_to_file(os.path.join(output_path_decks, f"{course}.apkg"))

print("Anki package created successfully.")

Anki package created successfully.
