In [1]:
from langchain_openai import ChatOpenAI
from langchain.llms import OpenAI
from PyPDF2 import PdfReader



In [4]:
# function to get llm response
def get_openai_response(question: str):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo",
                 temperature = 0.6
                 )
    response = llm.invoke(question)
    return response

res = get_openai_response("what is the capital of India")

print(res)

content='The capital of India is New Delhi.' response_metadata={'finish_reason': 'stop', 'logprobs': None}


In [5]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.prompts import PromptTemplate

In [9]:
# Use 1: when people want to share a webpage and ask you to generate question from here

import requests
from bs4 import BeautifulSoup

def get_article_text(url):
    # Fetch the webpage content
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main article body
    article_body = soup.find('body')

    # Extract text from the body
    article_text = article_body.get_text()

    return article_text

# lets try now
url = 'https://www.computerworld.com/article/3697649/what-are-large-language-models-and-how-are-they-used-in-generative-ai.html'
# article_text = get_article_text(url)
# if article_text:
#     main_text = article_text
#     print(article_text)


In [34]:
# Use 2. When people want to upload a pdf and ask you to generate question from it
import os

pdf_directory = os.getcwd()

from PyPDF2 import PdfReader
# Opening the pdf
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_file_name = filename
        pdf_file = open(os.path.join(pdf_directory, filename), "rb")

# Reading the pdf
pdf_reader = PdfReader(pdf_file)
all_text = ""
# make it limited. min(5, len(pages))
for idx, page in enumerate(pdf_reader.pages):
    all_text += page.extract_text()
    if idx > 4:
        break

# print(all_text)

# Use 3. let the user input an story to generate questions from
    
# Use 4. let the use share a video link to generate questions from

/Users/atifhsn/Desktop/projects/UC llm 2.0


In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)


In [19]:
texts = text_splitter.create_documents([all_text])

In [23]:
texts[0].page_content

'Risk Management Examination Manual of Credit Card Activities                                      Chapter II \n \n \n                                   \nII. CREDIT CARDS – GENERAL OVERVIEW                                                      \n \n \nWHAT IS A CREDIT CARD \n \nIn its non-physical form, a credit card repres ents a payment mechanism which facilitates both \nconsumer and commercial business transactions, including purchases and cash advances .  A \ncredit card generally operates as a substitute for cash or a check and most often provides an \nunsecured revolving line of credit.  The borrower is required to pay at least part of the card’s \noutstanding balance each billing cycle , depending on the terms as set forth in the cardholder \nagreement .  As the debt reduces, the available credit  increases for accounts in good standing.  \nThese complex financial arrangements have ever-shifting terms and prices.  A charge card'

In [40]:
# Defining which embeddings to use
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [37]:
# store in vector db
from langchain.vectorstores import FAISS
db = FAISS.from_documents(texts, embeddings)


In [29]:
print(db.index.ntotal)

22


In [36]:
db.save_local(f'faiss_{pdf_file_name}_index')

In [38]:
from langchain.chains.question_answering import load_qa_chain
def get_conversational_chain():
    prompt_template = """Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is present in the document
    feel free to say, "try ansking something else, this information is not available", don't provide the wrong answer no matter what is present in the question\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    model = ChatOpenAI(temperature=0.7)
    prompt = PromptTemplate(template=prompt_template, 
                            input_variables=["context", "Question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt = prompt)

    return chain

In [48]:

# get the question, find top k  similar documents
user_input = "What is a credit card?"

loaded_db = FAISS.load_local(f'faiss_{pdf_file_name}_index', embeddings, allow_dangerous_deserialization=True)
docs = loaded_db.similarity_search(user_input)

chain = get_conversational_chain()
# give the answer

response = chain({"input_documents" : docs, 
                  "question": user_input,
                    })


In [52]:
print(response['output_text'])

A credit card represents a payment mechanism that facilitates both consumer and commercial business transactions, including purchases and cash advances. It generally operates as a substitute for cash or a check and most often provides an unsecured revolving line of credit. The borrower is required to pay at least part of the card's outstanding balance each billing cycle, depending on the terms as set forth in the cardholder agreement. As the debt reduces, the available credit increases for accounts in good standing. The physical form of a credit card is traditionally a thin, rectangular plastic card with a series of numbers on the front that represent various items such as the applicable network, bank, and account. The back of the card typically contains a magnetic stripe that electronically stores some of the account's information, as well as a cardholder signature box. Additionally, credit cards are often associated with Visa and MasterCard, which operate sophisticated payment networ

In [65]:
from langchain.chains import LLMChain
def get_topics_from_chunk(context: str):
    prompt_template = """
            I will give a context, and you have to tell me what top 3 topics the text might belong to.
            if you unable to find any, you can respond with <no_topic>, but dont output any rubbish topics.
            do not write anything other than the topics names. also, give the topics in a comma separted way.\n\n
            context:\n{context}\n
            Answer:
            """
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    prompt = PromptTemplate(template=prompt_template, 
                            input_variables=['context'])
    response = LLMChain(llm=model, prompt=prompt)
    return response(context)
    

In [81]:
all_topics = []
from tqdm import tqdm
for t in tqdm(texts):
    response = get_topics_from_chunk(t.page_content)
    response = response['text'].split(", ")
    all_topics.extend([x.strip() for x in response])

100%|██████████| 22/22 [00:21<00:00,  1.01it/s]


In [87]:
from collections import Counter
most_common_words = Counter([x.lower() for  x in all_topics]).most_common(5)
most_common_words_without_count = [word for word, _ in most_common_words]


In [91]:
# finding the 
import random
toughness_input = random.choice(['Easy', 'Moderate', "Tough"])
print(toughness_input)

Easy


In [93]:
selected_topic = random.choice(most_common_words_without_count[:3])
selected_topic

'banking'

In [209]:
# from langchain.chains.qa_generation.base import QAGenerationChain
from langchain.chains import RetrievalQA
from typing import List

from langchain_core.output_parsers import JsonOutputParser

def generate_qa_pairs(context, topic):
    prompt_template = """
        Given a context, i want you to generate 2 questions of toughness level: Tough out of these three levels
        Easy, Moderate and Tough. The question must belong to topic: {topic}. 
        Make sure the answer to the question you generate belong to the context provided.
        give me the question answer pair in json format.\n\n
        context:\n{context}\n
        """
    model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.5)
    parser = JsonOutputParser()
    prompt = PromptTemplate(template = prompt_template,
                            input_variables=['context', 'topic'],
                            partial_variables={'format_instructions': parser.get_format_instructions()})
    # chat = RetrievalQA.from_chain_type(llm = llm, 
    #                                    chain_type="stuff",
    #                                    retriever = db.as_retriever(),
    #                                    return_source_documents=True,
    #                                    chain_type_kwargs={"prompt": prompt},
    #                                    )
    # response = load_qa_chain(llm=model, chain_type="stuff", prompt=prompt)
    # return response({"input_documents" : context, 
    #                  "topic": topic})


    chain = prompt | model | parser

    return chain.invoke({"context": context,
                  "topic": topic})
    


In [204]:
docs_for_questions = loaded_db.similarity_search(selected_topic, k=5)

In [210]:

response = generate_qa_pairs(docs_for_questions, selected_topic)

In [211]:
response


[{'question': 'What are some of the factors that have forced credit card issuers to be innovative with the credit card products offered?',
  'answer': 'Intense competition, market saturation, and changing consumer postures have forced issuers to be innovative with the credit card products offered.'},
 {'question': 'How has risk-based pricing allowed banks to issue cards to less-qualified applicants?',
  'answer': 'Risk-based pricing has allowed banks to issue cards to less-qualified applicants in exchange for a higher interest rate or other fees and to essentially offer customized card products.'}]

In [228]:
# get answer from user and give correctness score out of 100

question_1 = response[0]['question']
answer_1 = response[0]['answer']

In [254]:
# to find correctness we cant rely on cosine similarity. but need to build a chain/agent that evaluate the answer
user_answer_1 = "Fierce competition and market saturation are key factors."

In [255]:
user_answer_1_embedding = OpenAIEmbeddings().embed_query(user_answer_1)
answer_1_embedding = OpenAIEmbeddings().embed_query(answer_1)

In [256]:
from langchain.evaluation import load_evaluator


from langchain.evaluation import EmbeddingDistance
evaluator2 = load_evaluator(
    "pairwise_embedding_distance", distance_metric=EmbeddingDistance.COSINE
)

score_1 = evaluator2.evaluate_string_pairs(
    prediction=user_answer_1, prediction_b= answer_1
)
print(1-score_1['score'])

0.8662348409772439


In [259]:
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("labeled_pairwise_string")

evaluator.evaluate_string_pairs(
    prediction=user_answer_1,
    prediction_b = user_answer_1,
    input=question_1,
    reference=answer_1,
)

{'reasoning': 'Both Assistant A and Assistant B provided the same response to the user\'s question. They both correctly identified "fierce competition" and "market saturation" as factors that have forced credit card issuers to be innovative with their products. However, neither of them mentioned the third factor, "changing consumer postures", which was included in the reference answer. Therefore, both responses lack depth and completeness. Given these considerations, my evaluation results in a tie. \n\nFinal Verdict: [[C]]',
 'value': None,
 'score': 0.5}