#### Setup

In [None]:
# Install the third-party packages imported by the next cell. The two installs
# are kept as separate %pip invocations, exactly as originally written.
# NOTE(review): versions are unpinned. The imports below rely on
# `langchain.chains.LLMChain`, `langchain.llms.OpenAI` and
# `langchain.chat_models.ChatOpenAI`; presumably these paths only exist in
# older langchain releases -- TODO confirm and pin (e.g. `langchain==<version>`).
%pip install openai
%pip install langchain

In [None]:
import pandas as pd
import numpy
import os
import re
from tqdm import tqdm

import langchain
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

In [None]:
# Colab-only: mount the user's Google Drive under /content/drive so the raw
# notes and the generated CSVs can be read from / written to Drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted -- confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Ask for the OpenAI API key interactively (getpass does not echo the input, so
# the key never appears in the saved notebook output) and publish it through
# the OPENAI_API_KEY environment variable.
# NOTE(review): presumably the langchain OpenAI wrappers pick the key up from
# this environment variable -- confirm against the langchain documentation.
from getpass import getpass
OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

··········


In [None]:
%cd drive/MyDrive/CS\ 182\ Final\ Project/Data\ Processing

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project/Data Processing


#### Processing class-specific notes

In [None]:
### loading the raw course notes and splitting them into sections ###

def _load_and_split(path: str, pattern: str):
  """
  Read the text file at PATH and split it on the regex PATTERN.

  The chunk before the first match (everything that precedes the first
  section header) is dropped, so only section bodies are returned.
  The file is opened in a `with` block so the handle is always closed.

  return: raw file content (str), section texts (list of str)
  """
  with open(path, "r") as handle:
    raw = handle.read()
  return raw, re.split(pattern, raw)[1:]

### processing 126 notes ###
# 126 notes are split on every starred \section header.
notes_126, texts_126 = _load_and_split(
    "Raw Txt Files/126notes.txt", r"\\section\*{")
print(f"Number of 126 texts: {len(texts_126)}")

### processing 127 notes ###
# The 127 reader is split one level deeper, on starred \subsection headers.
reader_127, texts_127 = _load_and_split(
    "Raw Txt Files/127reader.txt", r"\\subsection\*{")
print(f"Number of 127 texts: {len(texts_127)}")

### processing 189 (spring) notes ###
# Only starred \section headers whose title starts with a number and a space
# act as split points here (same pattern for both 189 files and for 182).
notes_189_spring, texts_189_spring = _load_and_split(
    "Raw Txt Files/189notes-spring.txt", r"\\section\*{\d+ ")
print(f"Number of 189 (spring) texts: {len(texts_189_spring)}")

### processing 189 (fall) notes ###
notes_189_fall, texts_189_fall = _load_and_split(
    "Raw Txt Files/189notes-fall.txt", r"\\section\*{\d+ ")
print(f"Number of 189 (fall) texts: {len(texts_189_fall)}")

### processing 182 notes ###
notes_182, texts_182 = _load_and_split(
    "Raw Txt Files/182notes.txt", r"\\section\*{\d+ ")
print(f"Number of 182 texts: {len(texts_182)}")

Number of 126 texts: 104
Number of 127 texts: 71
Number of 189 (spring) texts: 26
Number of 189 (fall) texts: 35
Number of 182 texts: 63


#### Generating question-answer pairs

In [None]:
### defining the prompt structure and the chain ###

# Prompt sent to the model for every section of notes. It has one placeholder,
# {text}, which receives the section body. The "Question: ... / Answer: ..."
# layout requested here is exactly what generate_qa later splits on, so the
# two must be kept in sync.
template = """

Given a paragraph from an upper-division EECS course notes, generate question-answer pairs in the following format

Question: ...
Answer: ...
Question: ...
Answer: ...

Do not ask simple and out-of-context questions. Do not ask questions about figures. Only ask questions about the technical concepts and math derivations and proofs.
The question should consist of two parts. In the first part define the theorems that are applicable to the question and are referenced in the question. The second part should be the actual question.
If there are mathematical derivations, make sure you include questions on derivations as well.
When citing theorem from a note, write out the full theorem rather than just using the number of the theorem in the note. Do not ask questions about specific examples.

Here is the text: {text}

"""

# Wrap the template so that "text" is its only input variable.
prompt = PromptTemplate(template=template, input_variables=["text"])
# Chat model used for generation; max_tokens=1500 is passed as the completion
# limit. NOTE(review): a long section could presumably have its last QA pair
# cut off at that limit -- confirm whether truncated pairs need handling.
llm = ChatOpenAI(model="gpt-4-1106-preview", max_tokens=1500)
# Chain that formats the prompt with a section text and sends it to `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
def generate_qa(texts: list, llm_chain: "langchain.chains.llm.LLMChain", num: int,
                checkpoint_path: str = None):
  """
  Given a list of paragraphs/texts, generate a list of questions
  and answers by running through the chain.
  We prompt the LLM NUM times with the same input text.

  texts: section texts to turn into question-answer pairs.
  llm_chain: object whose `run(text)` returns the raw model output, expected
    to look like "Question: ...\\nAnswer: ...\\nQuestion: ...".
  num: number of times each text is sent to the chain.
  checkpoint_path: optional CSV path. When given, every pair collected so far
    is written there after each text, so a crash late in a long run does not
    lose the earlier results. When omitted, the function falls back to
    "Training data/{course}.csv" if a notebook-level variable `course`
    exists (the behaviour older callers rely on) and otherwise writes nothing.

  Pairs that contain no "Answer: " marker are skipped instead of raising, and
  any further "Answer: " occurrences stay inside the answer text.

  return: questions (list), answers (list)
  """
  if checkpoint_path is None:
    # Backward compatibility: the original implementation silently read the
    # global `course` set by the driver loop. Look it up explicitly so that a
    # call made outside that loop no longer fails with a NameError.
    legacy_course = globals().get("course")
    if legacy_course is not None:
      checkpoint_path = f"Training data/{legacy_course}.csv"

  questions, answers = [], []
  for text in tqdm(texts):
    for _ in range(num):
      qa_pairs = llm_chain.run(text)
      if not qa_pairs:
        # Empty model output: nothing to parse for this attempt.
        continue
      # The chunk before the first "Question: " is preamble and is dropped.
      for pair in qa_pairs.split("Question: ")[1:]:
        # partition splits on the FIRST marker only, so malformed output can
        # never break tuple unpacking the way a bare split() could.
        question, marker, answer = pair.partition("Answer: ")
        if not marker:
          continue
        questions.append(question)
        answers.append(answer)
    if checkpoint_path is not None:
      # Checkpoint after every text; the file is overwritten each time.
      checkpoint = pd.DataFrame({"questions": questions,
                                 "answers": answers})
      checkpoint.to_csv(checkpoint_path, index=False)
  return questions, answers

In [None]:
# Course id -> list of section texts. Only 127 is generated in this run; the
# full mapping used for a complete regeneration is:
# courses = {"126": texts_126, "127": texts_127, "189_fa": texts_189_fall,
#            "189_sp": texts_189_spring, "182": texts_182}
courses = {"127": texts_127}

# NOTE: the loop variable has to stay named `course`, because generate_qa
# looks that notebook-level name up when it writes its per-text checkpoint CSV.
for course, course_texts in courses.items():
  questions, answers = generate_qa(course_texts, llm_chain, 1)
  columns = {"questions": questions, "answers": answers}
  dataset = pd.DataFrame(columns)
  print(f"\nCourse: {course}")
  print(f"Number of generated QA pairs: {len(dataset)}")
  # Final write of the complete result for this course.
  dataset.to_csv(f"Training data/{course}.csv", index=False)

100%|██████████| 71/71 [48:52<00:00, 41.30s/it]


Course: 127
Number of generated QA pairs: 296



