# Read & Write Questions

## Read Data

In [1]:
"""Read Data

Structure:
    1. Imports, Variables, Functions
    2. Load Data

"""

# 1. Imports, Variables, Functions
# imports
import sys, os, numpy as np, pandas as pd
import logging
import re

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# variables

specialization = "mathematics_for_ml"
course = "machine-learning-linear-algebra"

data_path = os.path.join("..", "data", specialization, course)


# functions


# 2. Load Data

In [2]:
all_text = list()

for folder in os.listdir(data_path):
    # week folders
    folder_path = os.path.join(data_path, folder)
    print(folder_path)
    lecture = folder.split("/")[-1]
    lecture = re.sub(r"week-\d+-", "", lecture)
    tag = f"{specialization.replace('_', ' ')}::{course.replace('-', ' ')}::{lecture.replace('-', ' ').replace('_', ' ')}"

    if os.path.isdir(folder_path):
        # lesson folder
        for lesson in os.listdir(folder_path):
            lesson_path = os.path.join(folder_path, lesson)
            for file in os.listdir(lesson_path):

                if file.endswith(".txt"):
                    file_path = os.path.join(lesson_path, file)
                    with open(file_path, "r") as f:
                        text = f.read()
                        all_text.append((text, tag))
logging.info(f"Retrieving data for course {data_path.split('/')[-1]} ")
logging.info(f"Number of files: {len(all_text)}")

2024-08-12 15:39:02,114 - INFO - Retrieving data for course machine-learning-linear-algebra 
2024-08-12 15:39:02,115 - INFO - Number of files: 60


../data/mathematics_for_ml/machine-learning-linear-algebra/01_week-1-systems-of-linear-equations
../data/mathematics_for_ml/machine-learning-linear-algebra/03_week-3-vectors-and-linear-transformations
../data/mathematics_for_ml/machine-learning-linear-algebra/02_week-2-solving-systems-of-linear-equations
../data/mathematics_for_ml/machine-learning-linear-algebra/04_week-4-determinants-and-eigenvectors


In [3]:
all_text

[(">> As I mentioned before, equations\nbehave a lot like sentences as they are statements that give you information. In this video, you will learn\nwhat a linear equation is and what a system of linear equations is. As a matter of fact, you will be solving\nyour first system of linear equations, which is extracting all the possible\ninformation from that system. Just like with systems of sentences,\nsystems of linear equations can also be singular or non singular based on\nhow much information they carry. And as you already learned these\nconcepts with real life sentences, you are more than ready to\ntackle them with equations. In the previous video, you saw sentences\nsuch as between the dog and the cat, one is black. For the rest of the course, youll focus on sentences that carry\nnumerical information, such as this one. The price of an apple and a banana is $10. This sentence can easily be\nturned into equations as follows, if a is the price of an apple and\nb is the price of a ban

In [4]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# model_id = "../Meta-Llama-3-8B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )

In [5]:
# import re

# prompt_1 = """Review Text: "%s"

# Task: Your task is to convert this text into Basic Note Type (front/back) Anki flashcards. Prioritize information regarding the imaging features of diseases, unique imaging findings, and methods of differentiating similar disease entities. Ensure that each flashcard is clearly written, and adheres to the specified formatting and reference criteria.

# Formatting Criteria:

# - Construct a table with three columns: "Front", “Back”, "Number".
# - Each row of the "Front" column should contain a single question testing the imaging features of disease, unique imaging findings, and methods of differentiating similar disease entities.
# - The “Back” column should contain the succinct answer to the question in the corresponding row of the “Front” column.
# - The "Number" column will serve to number each row, facilitating feedback.

# Reference Criteria for each "Statement":

# - Each flashcard should test a single concept.
# - Each flashcard MUST be able to stand alone. Include the subject of the flashcard somewhere in the text.
# - Keep ONLY simple, direct questions in the "Front" column.
# - Clear concise language but if required give plenty of context.
# - Output csv format like the example below.
# - Output at least %d rows of question/answers.

# Example:

# Front;Back;Number
# "How is necrotic tissue identified in acute pancreatitis on a CT scan?";"Lack of contrast enhancement.";1
# "Why should people create their own examples?";"Because Jake is too tired to think of good examples.";2

# """

# text, tag = all_text[0]

# # Split the text based on space and newline
# split_text = re.split(r"[\s\n]+", text)

# n_flashcards = int(len(split_text) / 65)
# n_flashcards = max(3, n_flashcards)


# messages = [
#     {
#         "role": "system",
#         "content": "You are a teacher who is writing anki cards for his students.",
#     },
#     {
#         "role": "user",
#         # "content": f"Please write an anki card on the following text:\n{text}",
#         "content": prompt_1 % (text, 20),
#     },
# ]


# input_ids = tokenizer.apply_chat_template(
#     messages, add_generation_prompt=True, return_tensors="pt"
# ).to(model.device)

# print(model.device)

# terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# outputs = model.generate(
#     input_ids,
#     max_new_tokens=1024,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.6,
#     top_p=0.9,
# )
# response = outputs[0][input_ids.shape[-1] :]
# print(tokenizer.decode(response, skip_special_tokens=True))

### Write Questions

In [12]:
"""Write Questions

Structure: 
    1. Imports, Variables, Functions
    2. Load Model
    3. Parse Lecture Material 
    4. Generate Questions
    5. Save to DataFrame
    6. Save to File
"""

# 1. Imports, Variables, Functions
# imports
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os, pandas as pd
from io import StringIO

# variables

prompt_0 = """Review Text: "%s"
Task: You are tasked with summarizing the core theoretical information from a course lecture. Please identify and extract the key concepts, definitions, principles, and rules presented in the lecture. Focus on the general theory that can be applied universally, and exclude any specific examples, case studies, or situational applications unless they directly illustrate a fundamental concept. The summary should provide a clear and concise overview of the theoretical framework discussed in the lecture, ensuring that it is applicable in a wide range of contexts.
"""


prompt_1 = """Review Text: "%s"

Task: Your task is to convert this text into Basic Note Type (front/back) Anki flashcards. Prioritize information regarding the imaging features of diseases, unique imaging findings, and methods of differentiating similar disease entities. Ensure that each flashcard is clearly written, and adheres to the specified formatting and reference criteria.

Formatting Criteria:

- Construct a table with three columns: "Front", “Back”, "Number".
- Each row of the "Front" column should contain a single question testing the imaging features of disease, unique imaging findings, and methods of differentiating similar disease entities.
- The “Back” column should contain the succinct answer to the question in the corresponding row of the “Front” column.
- The "Number" column will serve to number each row, facilitating feedback.

Reference Criteria for each "Statement":

- Each flashcard should test a single concept.
- Each flashcard MUST be able to stand alone. Include the subject of the flashcard somewhere in the text.
- Keep ONLY simple, direct questions in the "Front" column.
- Clear concise language but if required give plenty of context. 
- Output csv format like the example below. Make sure text is properly formatted between "".
- Important: output no more than %d rows of question/answers.

Example:

Front;Back;Number
"How is necrotic tissue identified in acute pancreatitis on a CT scan?";"Lack of contrast enhancement.";1
"Why should people create their own examples?";"Because Jake is too tired to think of good examples.";2

"""

In [None]:
# functions
def clean_response(response_text):
    """Clean Response Text
    Args:
        response_text (str): response text from model
    Returns:
        response_df (df): cleaned response df
    """

    # clear response text
    # delete everything before Front;Back;Number
    response_text = response_text[response_text.find("Front;Back;Number") :]

    # Use StringIO to create a file-like object from the string
    _data = StringIO(response_text)

    # reset the file pointer to the beginning
    _data.seek(0)

    # read the lines in the data file
    lines = _data.readlines()

    # filter out the lines with no semicolon
    lines = [line for line in lines if ";" in line]

    # combine filtered lines back
    filtered_data = StringIO("".join(lines))

    # Read the data into a pandas dataframe
    df = pd.read_csv(filtered_data, delimiter=";", quotechar='"', on_bad_lines="skip")

    return df


# 2. Load Model
model_id = os.path.join("..", "Meta-Llama-3-8B-Instruct")

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [15]:
from tqdm import tqdm
import re
import pandas as pd

# Assuming prompt_0 and prompt_1 are defined elsewhere in your script
# Example:
# prompt_0 = "Summarize the theoretical information from the following text:\n%s"
# prompt_1 = "Please write %d anki cards based on the following theoretical summary:\n%s"

errors = list()
first_iteration = True

for i, (text, tag) in tqdm(enumerate(all_text), total=len(all_text)):
    # Step 1: Use prompt_0 to extract theoretical information
    messages = [
        {
            "role": "system",
            "content": "You are a teacher summarizing theoretical content from a lecture.",
        },
        {
            "role": "user",
            "content": prompt_0 % text,  # Apply prompt_0 to extract theory
        },
    ]

    # Generate theoretical summary
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    theory_summary = tokenizer.decode(
        outputs[0][input_ids.shape[-1] :], skip_special_tokens=True
    )

    # Step 2: Use prompt_1 to generate Anki cards based on the theoretical summary
    split_text = re.split(r"[\s\n]+", theory_summary)
    n_flashcards = int(len(split_text) / 180)
    n_flashcards = max(2, n_flashcards)

    messages = [
        {
            "role": "system",
            "content": "You are a teacher who is writing Anki cards for his students.",
        },
        {
            "role": "user",
            "content": prompt_1 % (theory_summary, n_flashcards + 1),
        },
    ]

    # Generate Questions (Anki cards)
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1] :]
    response_text = tokenizer.decode(response, skip_special_tokens=True)

    # Step 3: Save to DataFrame
    if first_iteration:
        df_responses = clean_response(response_text=response_text)
        df_responses["Deck_ID"] = tag
        first_iteration = False
    else:
        try:
            df_response = clean_response(response_text=response_text)
            df_response["Deck_ID"] = tag
            df_responses = pd.concat([df_responses, df_response])
            del df_response
        except Exception as e:
            print(f"Error in response {i}: {e}")
            errors.append(response_text)

    # Step 4: Save to File (if necessary)
    # df_responses.to_csv("anki_cards.csv", index=False)  # Save the dataframe to a CSV file

# End of loop

  0%|          | 0/60 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  2%|▏         | 1/60 [00:12<12:19, 12.53s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain

In [16]:
df_responses

Unnamed: 0,Front,Back,Number,Deck_ID
0,What is a linear equation?,"A statement that gives information, written in...",1,mathematics for ml::machine learning linear al...
1,What are the three types of systems of linear ...,"Singular, Non-Singular, and Redundant",2,mathematics for ml::machine learning linear al...
2,What is the importance of understanding the th...,It provides the foundation for solving problem...,3,mathematics for ml::machine learning linear al...
0,What is a system of linear equations?,A set of equations in which each equation is a...,1,mathematics for ml::machine learning linear al...
1,What is the difference between a singular and ...,"A singular system has no unique solution, whil...",2,mathematics for ml::machine learning linear al...
...,...,...,...,...
1,What happens to the fundamental square when a ...,"It is sent to a set of measure zero, such as a...",2,mathematics for ml::machine learning linear al...
2,What is the relationship between the determina...,A matrix is singular if its determinant is zer...,3,mathematics for ml::machine learning linear al...
0,What are the key concepts covered in a linear ...,"Linear Systems of Equations, Matrices and Vect...",1,mathematics for ml::machine learning linear al...
1,What is the definition of a linear system of e...,A set of equations in which the coefficients o...,2,mathematics for ml::machine learning linear al...


In [7]:
# from tqdm import tqdm

# # 3. Parse Lecture Material
# errors = list()
# first_iteration = True
# for i, (text, tag) in tqdm(enumerate(all_text), total=len(all_text)):

#     # Split the text based on space and newline
#     split_text = re.split(r"[\s\n]+", text)

#     n_flashcards = int(len(split_text) / 180)
#     n_flashcards = max(2, n_flashcards)

#     messages = [
#         {
#             "role": "system",
#             "content": "You are a teacher who is writing anki cards for his students.",
#         },
#         {
#             "role": "user",
#             # "content": f"Please write an anki card on the following text:\n{text}",
#             "content": prompt_1 % (text, n_flashcards),
#         },
#     ]

#     # 4. Generate Questions

#     input_ids = tokenizer.apply_chat_template(
#         messages, add_generation_prompt=True, return_tensors="pt"
#     ).to(model.device)

#     terminators = [
#         tokenizer.eos_token_id,
#         tokenizer.convert_tokens_to_ids("<|eot_id|>"),
#     ]

#     outputs = model.generate(
#         input_ids,
#         max_new_tokens=2048,
#         eos_token_id=terminators,
#         do_sample=True,
#         temperature=0.6,
#         top_p=0.9,
#     )
#     response = outputs[0][input_ids.shape[-1] :]
#     response = tokenizer.decode(response, skip_special_tokens=True)
#     # print(tokenizer.decode(response, skip_special_tokens=True))

#     # 5. Save to DataFrame
#     if first_iteration:
#         df_responses = clean_response(response_text=response)
#         df_responses["Deck_ID"] = tag
#         first_iteration = False
#     else:
#         try:
#             df_response = clean_response(response_text=response)
#             df_response["Deck_ID"] = tag
#             df_responses = pd.concat([df_responses, df_response])
#             del df_response
#         except:
#             print(f"Error in response {i}")
#             errors.append(response)
#     # 6. Save to File

#     break

  0%|          | 0/60 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  0%|          | 0/60 [00:23<?, ?it/s]


In [18]:
df_responses.drop(["Number"], inplace=True, axis=1)

logging.info(f"Number of unique tags: {df_responses['Deck_ID'].unique()}")

logging.info(f"Number of flashcards: {len(df_responses)}")

df_responses.dropna(inplace=True)
logging.info(f"Number of flashcards after nan cleaning: {len(df_responses)}")

2024-08-12 15:56:55,050 - INFO - Number of unique tags: ['mathematics for ml::machine learning linear algebra::01 systems of linear equations'
 'mathematics for ml::machine learning linear algebra::03 vectors and linear transformations'
 'mathematics for ml::machine learning linear algebra::02 solving systems of linear equations'
 'mathematics for ml::machine learning linear algebra::04 determinants and eigenvectors']
2024-08-12 15:56:55,051 - INFO - Number of flashcards: 180
2024-08-12 15:56:55,052 - INFO - Number of flashcards after nan cleaning: 180


In [19]:
import pandas as pd
import genanki
import random
import os

# Example DataFrame with questions and answers
output_path_decks = os.path.join("..", "generated_decks")

# Ensure the output directory exists
os.makedirs(output_path_decks, exist_ok=True)


# Function to generate unique model and deck IDs
def generate_id():
    return random.randrange(1 << 30, 1 << 31)


# Define the model for the notes
model_id = generate_id()
my_model = genanki.Model(
    model_id,
    "Simple Model",
    fields=[
        {"name": "Question"},
        {"name": "Answer"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Answer}}',
        },
    ],
    css="""
    .card {
      font-family: arial;
      font-size: 20px;
      text-align: center;
    }
    """,
)

# Create decks
generated_decks = list()
d_decks = dict()
for deck_id in df_responses["Deck_ID"].unique():
    n_levels = len(deck_id.split("::"))
    for level in range(0, n_levels, 1):
        deck_name = "::".join(deck_id.split("::")[: level + 1])
        if deck_name not in generated_decks:
            created_deck = genanki.Deck(generate_id(), deck_name)
            generated_decks.append(deck_name)
            d_decks[deck_name] = created_deck

# Add flashcards to decks
for deck_id in df_responses["Deck_ID"].unique():
    # Query df
    query = f"Deck_ID == '{deck_id}'"
    df_query = df_responses.query(query)

    # Add flashcards to deck
    for index, row in df_query.iterrows():
        front = str(row["Front"]) if row["Front"] is not None else ""
        back = str(row["Back"]) if row["Back"] is not None else ""
        note = genanki.Note(model=my_model, fields=[front, back])
        d_decks[deck_id].add_note(note)

# Create a package and write it to an .apkg file
all_decks = list(d_decks.values())
package = genanki.Package(all_decks)

# Write to file

if specialization is not None:
    package.write_to_file(
        os.path.join(output_path_decks, f"{specialization}.{course}.apkg")
    )
else:
    package.write_to_file(os.path.join(output_path_decks, f"{course}.apkg"))

print("Anki package created successfully.")

Anki package created successfully.


In [20]:
df_responses

Unnamed: 0,Front,Back,Deck_ID
0,What is a linear equation?,"A statement that gives information, written in...",mathematics for ml::machine learning linear al...
1,What are the three types of systems of linear ...,"Singular, Non-Singular, and Redundant",mathematics for ml::machine learning linear al...
2,What is the importance of understanding the th...,It provides the foundation for solving problem...,mathematics for ml::machine learning linear al...
0,What is a system of linear equations?,A set of equations in which each equation is a...,mathematics for ml::machine learning linear al...
1,What is the difference between a singular and ...,"A singular system has no unique solution, whil...",mathematics for ml::machine learning linear al...
...,...,...,...
1,What happens to the fundamental square when a ...,"It is sent to a set of measure zero, such as a...",mathematics for ml::machine learning linear al...
2,What is the relationship between the determina...,A matrix is singular if its determinant is zer...,mathematics for ml::machine learning linear al...
0,What are the key concepts covered in a linear ...,"Linear Systems of Equations, Matrices and Vect...",mathematics for ml::machine learning linear al...
1,What is the definition of a linear system of e...,A set of equations in which the coefficients o...,mathematics for ml::machine learning linear al...
