In [55]:
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate

In [56]:
file_path = 'data/stats.pdf'
loader = PyPDFLoader(file_path)
data = loader.load()

In [57]:
question_gen = ""
for page in data:
    question_gen += page.page_content

In [58]:
from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [98]:
llm = Ollama(
    model="llama3.2", 
    base_url="http://127.0.0.1:11434",
    temperature=0.3,
    num_predict=2000,
    top_p=0.9,
    repeat_penalty=1.1
)

In [99]:
splitter_ques_gen = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)

In [100]:
chunk_ques_gen = splitter_ques_gen.split_text(question_gen)

In [101]:
from langchain.docstore.document import Document

In [102]:
document_ques_gen = [Document(page_content = t) for t in chunk_ques_gen]

In [103]:
document_ques_gen

[Document(metadata={}, page_content='ImportanceandtheuseofcorrelationinStatistics\nIntroduction\nCorrelationisastatistical measurethat expressestheextent towhichtwovariablesarelinearlyrelated. It isacommontool fordescribingsimplerelationshipswithout makingastatement about causeandeffect.Correlationcoefficientsrangefrom-1to1, withavalueof 0indicatingnolinearrelationshipbetweenthetwovariables, avalueof 1indicatingaperfect positivelinearrelationship, andavalueof -1indicatingaperfectnegativelinearrelationship.\nCorrelationisimportantinstatisticsbecauseitcanbeusedto\n1. Identifyrelationshipsbetweenvariables:Correlationcanbeusedtoidentifywhetherthereisarelationshipbetweentwovariables, andif so, whethertherelationshipispositiveornegative. Thisinformationcanbeuseful forunderstandingtherelationshipsbetweendifferent factorsinacomplexsystem.\n2. Makepredictions:If thereisastrongcorrelationbetweentwovariables, it ispossibletousethevalueof onevariabletopredictthevalueof theothervariable. Thiscanbeu

In [104]:
type(document_ques_gen[0])

langchain_core.documents.base.Document

In [105]:
prompt_template = """
You are an expert at creating questions based on statistics materials and documentation.
Your goal is to prepare a coder or programmer for their exam and statistics tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

In [106]:
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=['text'])

In [107]:
refine_template = ("""
You are an expert at creating practice questions based on statistics material and documentation.
Your goal is to help a coder or programmer prepare for a statistics test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

In [108]:
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

In [109]:
from langchain.chains.summarize import load_summarize_chain

In [110]:
ques_gen_chain = load_summarize_chain(llm = llm, 
                                          chain_type = "refine", 
                                          verbose = True, 
                                          question_prompt=PROMPT_QUESTIONS, 
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

In [111]:
ques = ques_gen_chain.run(document_ques_gen)

print(ques)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on statistics materials and documentation.
Your goal is to prepare a coder or programmer for their exam and statistics tests.
You do this by asking questions about the text below:

------------
ImportanceandtheuseofcorrelationinStatistics
Introduction
Correlationisastatistical measurethat expressestheextent towhichtwovariablesarelinearlyrelated. It isacommontool fordescribingsimplerelationshipswithout makingastatement about causeandeffect.Correlationcoefficientsrangefrom-1to1, withavalueof 0indicatingnolinearrelationshipbetweenthetwovariables, avalueof 1indicatingaperfect positivelinearrelationship, andavalueof -1indicatingaperfectnegativelinearrelationship.
Correlationisimportantinstatisticsbecauseitcanbeusedto
1. Identifyrelationshipsbetweenvariables:Correlationcanbeusedtoidentifywhetherthereisarelations

In [112]:
print(ques)

Here are some questions based on the provided text:

**Section 1: Understanding Correlation**

1. What is correlation in statistics, and what does it measure?
2. How do you interpret a correlation coefficient value of -1, 0, or 1?
3. Can you give an example of how correlation can be used to identify a relationship between variables?

**Section 2: Importance of Correlation**

4. Why is correlation important in statistics? Provide at least two reasons.
5. How does correlation help in making predictions in different fields (e.g., business, finance, medicine)?
6. Can you think of an example where correlation can be used to develop causal models?

**Section 3: Real-World Applications**

7. A company wants to allocate marketing resources based on the relationship between advertising spending and sales. How would they use correlation in this scenario?
8. A financial analyst wants to build a diversified portfolio of stocks with lower overall risk. How would they use correlation in this scenari

In [114]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434"
)

In [115]:
from langchain.vectorstores import FAISS

In [116]:
vector_store = FAISS.from_documents(document_ques_gen, embeddings)


In [117]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x125c6f750>

In [118]:
ques

'Here are some questions based on the provided text:\n\n**Section 1: Understanding Correlation**\n\n1. What is correlation in statistics, and what does it measure?\n2. How do you interpret a correlation coefficient value of -1, 0, or 1?\n3. Can you give an example of how correlation can be used to identify a relationship between variables?\n\n**Section 2: Importance of Correlation**\n\n4. Why is correlation important in statistics? Provide at least two reasons.\n5. How does correlation help in making predictions in different fields (e.g., business, finance, medicine)?\n6. Can you think of an example where correlation can be used to develop causal models?\n\n**Section 3: Real-World Applications**\n\n7. A company wants to allocate marketing resources based on the relationship between advertising spending and sales. How would they use correlation in this scenario?\n8. A financial analyst wants to build a diversified portfolio of stocks with lower overall risk. How would they use correlati

In [119]:
ques_list = ques.split("\n")

In [121]:
def filter_questions(text_list):
    questions = []
    for line in text_list:
        line = line.strip()
        if line.endswith('?'):
            questions.append(line)
    return questions

In [123]:
filtered_questions = filter_questions(ques_list)
print(filtered_questions)

['1. What is correlation in statistics, and what does it measure?', '2. How do you interpret a correlation coefficient value of -1, 0, or 1?', '3. Can you give an example of how correlation can be used to identify a relationship between variables?', '5. How does correlation help in making predictions in different fields (e.g., business, finance, medicine)?', '6. Can you think of an example where correlation can be used to develop causal models?', '7. A company wants to allocate marketing resources based on the relationship between advertising spending and sales. How would they use correlation in this scenario?', '8. A financial analyst wants to build a diversified portfolio of stocks with lower overall risk. How would they use correlation in this scenario?', '9. A medical researcher wants to develop public health campaigns to discourage smoking. How would they use correlation in this scenario?', '11. Can you explain why a strong correlation between ice cream sales and shark attacks doe

In [124]:
from langchain.chains import RetrievalQA

In [125]:
answer_generation_chain = RetrievalQA.from_chain_type(llm=llm, 
                                               chain_type="stuff", 
                                               retriever=vector_store.as_retriever())

In [127]:
# Answer each question and save to a file
for question in filtered_questions:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\\n\\n")
    # Save answer to file
    with open("answers.txt", "a") as f:
        f.write("Question: " + question + "\\n")
        f.write("Answer: " + answer + "\\n")
        f.write("--------------------------------------------------\\n\\n")

Question:  1. What is correlation in statistics, and what does it measure?
Answer:  Correlation in statistics is a statistical measure that expresses the extent to which two variables are linearly related. It measures the strength and direction of the relationship between two variables, with values ranging from -1 to 1. A value of 0 indicates no linear relationship, a value of 1 indicates a perfect positive linear relationship, and a value of -1 indicates a perfect negative linear relationship.
--------------------------------------------------\n\n
Question:  2. How do you interpret a correlation coefficient value of -1, 0, or 1?
Answer:  Based on the provided context, here are the interpretations for each correlation coefficient value:

* A correlation coefficient value of -1 indicates a perfect negative linear relationship between two variables.
* A correlation coefficient value of 0 indicates no linear relationship between two variables.
* A correlation coefficient value of 1 indica