In [184]:
from langchain.document_loaders import CSVLoader,PyPDFLoader
from langchain.llms import openai
import os 
from dotenv import load_dotenv
from langchain.evaluation.qa import QAEvalChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings


In [185]:
load_dotenv()
os.environ["OPENAI_API_KEY"]=os.getenv("openai_api_key")
os.environ["LANGCHAIN_TRACING_V2"]=os.getenv("LANGCHAIN_TRACING_V2")
os.environ["LANGCHAIN_ENDPOINT"]=os.getenv("LANGCHAIN_ENDPOINT")
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")

In [186]:
# Initializing PyPDFLoader to load the "Statistic.pdf" file
loader = PyPDFLoader("Statistic.pdf")
load = loader.load()
# Adding a commit related to the loaded content

In [190]:
from langchain.evaluation.qa import QAGenerateChain
llm=openai.OpenAI()
# Importing a module to generate question-answer chains
# using a language model (llm) from OpenAI
question = QAGenerateChain.from_llm(
    llm=openai.OpenAI()
)

In [191]:
question # this a prompt behind QAGenerateChain 

QAGenerateChain(prompt=PromptTemplate(input_variables=['doc'], template='You are a teacher coming up with questions to ask on a quiz. \nGiven the following document, please generate a question and answer based on that document.\n\nExample Format:\n<Begin Document>\n...\n<End Document>\nQUESTION: question here\nANSWER: answer here\n\nThese questions should be detailed and be based explicitly on information in the document. Begin!\n\n<Begin Document>\n{doc}\n<End Document>'), llm=OpenAI(client=<class 'openai.api_resources.completion.Completion'>, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy=''))

In [192]:
# Creating a list of dictionaries
# Each dictionary contains a key "doc" with a value from the subset of elements in the "load" list,
# specifically elements indexed from 50 to 59 (inclusive).
item=[{"doc":t} for t in load[50:60]]
# we did that because there is "doc" input_variable in QAGenerateChain 

In [193]:
item[0]

{'doc': Document(page_content=' 1.7 Big Data and Data Mining 19data collected can be substantial. For large retail companies, the sheer volume of data col-lected is hard to conceptualize, and figuring out how to effectively use these data to improve profitability is a challenge. Mass retailers such as Walmart capture data on 20 to 30 million transactions every day, telecommunication companies such as France Telecom and AT&T generate over 300 million call records per day, and Visa processes 6800 payment transac-tions per second or approximately 600 million transactions per day. In addition to the sheer volume and speed with which companies now collect data, more complicated types of data are now available and are proving to be of great value to businesses. Text data are collected by monitoring what is being said about a com-pany’s products or services on social media such as Twitter. Audio data are collected from service calls (on a service call, you will often hear “this call may be mo

In [194]:
examples=question.apply_and_parse(item)
# Applying the question-answer generation and parsing function to the list of dictionaries ('item')
# The 'question' object generated earlier is being used here to apply and parse questions on the documents.



In [195]:
# Generating a 'vector' by extracting chromatic features from the initial three documents in the 'load' list
# Using the 'Chroma' method and incorporating 'OpenAIEmbeddings' for embedding purposes
vector=Chroma.from_documents(
    documents=load[:3],
    embedding=OpenAIEmbeddings()
)
# we are doing this we want to see if the predicted answer from embedding is same as we get from original document 

In [196]:
examples[0]


{'qa_pairs': {'query': 'What is the definition of big data according to many data analysts?',
  'answer': 'Many data analysts define big data by referring to the three v’s of data: volume, velocity, and variety. Volume refers to the amount of available data, velocity refers to the speed at which data is collected and processed, and variety refers to the different data types.'}}

When we attempt to use this example with the RetrievalQa chain to get an answer, an error might occur. This is because the function required by RetrievalQa only accepts a "query" as input. To address this, we need to transform the above example variable into a suitable query format that RetrievalQa can process

In [197]:
modified_data = [{"query": item['qa_pairs']['query'], "answer": item['qa_pairs']['answer']} for item in examples]


In [198]:
modified_data[:3]

[{'query': 'What is the definition of big data according to many data analysts?',
  'answer': 'Many data analysts define big data by referring to the three v’s of data: volume, velocity, and variety. Volume refers to the amount of available data, velocity refers to the speed at which data is collected and processed, and variety refers to the different data types.'},
 {'query': 'What is one advantage that data mining has over classical statistics?',
  'answer': 'Data mining has the advantage of having the ability to partition the data set so that a model developed for the training data set may be tested for reliability on other data.'},
 {'query': 'What is an example of unethical statistical behavior according to the American Statistical Association\'s "Ethical Guidelines for Statistical Practice"?',
  'answer': 'An example of unethical statistical behavior according to the American Statistical Association\'s "Ethical Guidelines for Statistical Practice" is running multiple tests until 

In [199]:
# Creating a RetrievalQA chain using specific parameters:
# 'chain_type' specifies the type of chain ('stuff' in this case)
# 'retriever' utilizes the vector (extracted features) converted as a retriever
# 'llm' denotes the language model (llm) used within the RetrievalQA chain
qachain=RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=vector.as_retriever(),
    llm=llm
)

In [200]:
 #Applying the RetrievalQA chain ('qachain') to generate predictions on modified data
prediction=qachain.apply(modified_data)

AuthenticationError: Incorrect API key provided: sk-8y8iE***************************************MZdG. You can find your API key at https://platform.openai.com/account/api-keys.

In [None]:
prediction[:3]

[{'query': 'How many transactions does Visa process per second?',
  'answer': 'Approximately 6800 transactions per second.',
  'result': " I don't know."},
 {'query': 'What is one advantage that data mining has over classical statistics?',
  'answer': 'Data mining has the advantage of being able to partition the data set so that a model developed for the training data set can be tested for reliability on other data.',
  'result': ' Data mining can turn raw medical records into medical knowledge, which can be used to detect trends in medical practice and even alter medical practice. Classical statistics is not as well-suited to this task.'},
 {'query': 'According to the American Statistical Association\'s "Ethical Guidelines for Statistical Practice," what is considered unethical statistical behavior?',
  'answer': 'Unethical statistical behavior includes running multiple tests until a desired result is obtained, discarding data to provide an average that is higher than the original res

In [None]:
# Creating an evaluation chain for question-answering using an LLM (llm)
evalutions=QAEvalChain.from_llm(
    llm=llm,
)

In [None]:
# Using the initialized evaluations chain ('evaluations') to evaluate the generated predictions ('prediction')
evaluate=evalutions.evaluate(modified_data,prediction)

In [None]:
evaluate[:3] 

[{'results': ' INCORRECT'}, {'results': ' CORRECT'}, {'results': ' CORRECT'}]

In [None]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + prediction[i]['query'])
    print("Real Answer: " + prediction[i]['answer'])
    print("Predicted Answer: " + prediction[i]['result'])
    print("Predicted Grade: " + evaluate[i]["results"])
    print()

Example 0:
Question: How many transactions does Visa process per second?
Real Answer: Approximately 6800 transactions per second.
Predicted Answer:  I don't know.
Predicted Grade:  INCORRECT

Example 1:
Question: What is one advantage that data mining has over classical statistics?
Real Answer: Data mining has the advantage of being able to partition the data set so that a model developed for the training data set can be tested for reliability on other data.
Predicted Answer:  Data mining can turn raw medical records into medical knowledge, which can be used to detect trends in medical practice and even alter medical practice. Classical statistics is not as well-suited to this task.
Predicted Grade:  CORRECT

Example 2:
Question: According to the American Statistical Association's "Ethical Guidelines for Statistical Practice," what is considered unethical statistical behavior?
Real Answer: Unethical statistical behavior includes running multiple tests until a desired result is obtained

{'doc': Document(page_content='MachineLearning-Lecture01  \nInstructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine \nlearning class. So what I wanna do today is ju st spend a little time going over the logistics \nof the class, and then we\'ll start to  talk a bit about machine learning.  \nBy way of introduction, my name\'s  Andrew Ng and I\'ll be instru ctor for this class. And so \nI personally work in machine learning, and I\' ve worked on it for about 15 years now, and \nI actually think that machine learning is th e most exciting field of all the computer \nsciences. So I\'m actually always excited about  teaching this class. Sometimes I actually \nthink that machine learning is not only the most exciting thin g in computer science, but \nthe most exciting thing in all of human e ndeavor, so maybe a little bias there.  \nI also want to introduce the TAs, who are all graduate students doing research in or \nrelated to the machine learni ng and all aspects of ma

AttributeError: type object 'LLMChain' has no attribute 'output_parser'

AIMessage(content='Hello there! How can I assist you today?')

langchain.evaluation.qa.eval_chain.QAEvalChain