In [None]:
!pip install --upgrade langchain
!pip install transformers
!pip install pdfminer.six
!pip install tqdm

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain
from langchain.chains import SequentialChain
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
from pdfminer.high_level import extract_text
from glob import glob

# Pipeline with a Hugging Face model + LangChain

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_id = 'google/flan-t5-base'
# Load the tokenizer and the seq2seq model for `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id) # , load_in_8bit=True not working

# Bundle model + tokenizer into a "text2text-generation" pipeline.
# NOTE(review): max_length=300 presumably caps the generated output length
# (in tokens) — confirm against the transformers pipeline documentation.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300
)

# Wrap the pipeline in LangChain's HuggingFacePipeline; `local_llm` is the
# object passed as `llm=` to the LLMChain instances built in `my_chain`.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
def my_chain(text, llm=None):
  """Generate one question/answer flashcard pair from a passage of text.

  Three prompts are run one after another against the same LLM:
  summarize `text`, formulate a question from that summary, then answer the
  question using the summary as the only source of information.

  Args:
    text: Passage the flashcard is built from.
    llm: LangChain LLM used for all three steps. When omitted, the
      notebook-level `local_llm` is used.

  Returns:
    A `(question, answer)` tuple holding the outputs of the second and
    third chain.
  """
  if llm is None:
    llm = local_llm

  def run_step(template, output_key, inputs):
    # Build a one-off prompt + chain for this step and return its output.
    # NOTE(review): ChatPromptTemplate is used with a plain text-generation
    # LLM; the rendered prompt may carry a chat role prefix — confirm this
    # is intended, otherwise a plain PromptTemplate would fit better.
    prompt = ChatPromptTemplate.from_template(template)
    chain = LLMChain(prompt=prompt, llm=llm, output_key=output_key)
    return chain.run(inputs)

  # define the correct style the AI should assume for our task
  style = """American English \
  you are a very helpful assistant and help a professor of statistics and machine learning in creating material to help pupils in learning
  """

  # step 1: summarize the passage
  summary_template = """Summarize the following text using this style: {style}\
  Text: {text}"""
  summary = run_step(summary_template, "summary", {"text": text, "style": style})

  # step 2: formulate a question from the summary
  question_template = """From the text formulate a proper question using this style: {style}\
  Text: {summary}"""
  question = run_step(question_template, "question", {"summary": summary, "style": style})

  # step 3: answer the question using only the summary; this template has no
  # {style} placeholder, so only the two variables it uses are passed
  answer_template = """Try to answer this question using the information provided in the following text\
  Text: {summary}\
  Question: {question}"""
  answer = run_step(answer_template, "answer", {"question": question, "summary": summary})

  return (question, answer)

In [None]:
# divide text into smaller chunks:
def divide_text(text, section_size):
    """Split `text` into consecutive chunks of at most `section_size` characters.

    Chunks are cut purely by character count, so a boundary may fall in the
    middle of a word or sentence. The last chunk may be shorter than
    `section_size`.

    Args:
        text: String (or any sliceable sequence) to split.
        section_size: Maximum length of each chunk; must be a positive integer.

    Returns:
        List of chunks in their original order; empty when `text` is empty.

    Raises:
        ValueError: If `section_size` is zero or negative (the original loop
            would never terminate in that case).
    """
    if section_size <= 0:
        raise ValueError(f"section_size must be a positive integer, got {section_size!r}")
    return [
        text[start:start + section_size]
        for start in range(0, len(text), section_size)
    ]

# build the flash cards
def build_flashcards(pdf_text, llm, section_size=4000, output_path="flashcards.txt"):
    """Turn a document's text into flashcards and save them to a text file.

    The text is cut into chunks with `divide_text`, each chunk is passed to
    `my_chain` to obtain a (question, answer) pair, and every pair is written
    as one "question -> answer" line.

    Args:
        pdf_text: Full text of the source document.
        llm: Currently unused; kept so existing calls keep working.
            `my_chain` is called with the chunk text only.
        section_size: Maximum number of characters per chunk.
        output_path: File the flashcards are written to (overwritten if it
            already exists).

    Returns:
        None. The result is the file written at `output_path`.
    """
    divided_sections = divide_text(pdf_text, section_size)
    generated_flashcards = []
    # Iterate over the list directly (not enumerate) so tqdm can read its
    # length and show real progress.
    for section in tqdm(divided_sections, desc="Building flashcards"):
        qa_pair = my_chain(section)
        generated_flashcards.append(" -> ".join(qa_pair))

    # Save the cards to a text file; explicit UTF-8 so non-ASCII characters
    # from the PDF do not depend on the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(generated_flashcards))

In [None]:
# Find every PDF below /content (searched recursively).
pdf_file = glob("/content/**/*.pdf", recursive=True)
# Fail with a clear message instead of an IndexError when nothing was found.
if not pdf_file:
    raise FileNotFoundError(
        "No PDF file found under /content; add one before running this cell."
    )
# Extract the text of the first match. glob does not guarantee any ordering,
# so with several PDFs present the file picked here is not deterministic.
pdf_text = extract_text(pdf_file[0])

In [None]:
# Build the flashcards from the extracted PDF text; build_flashcards saves
# them to "flashcards.txt".
build_flashcards(pdf_text, local_llm)