In [4]:
# Let's install the Kaggle library in order to import the dataset to train our model
!pip install -q kaggle

In [5]:
!kaggle datasets download -d praneshmukhopadhyay/amazon-questionanswer-dataset

Dataset URL: https://www.kaggle.com/datasets/praneshmukhopadhyay/amazon-questionanswer-dataset
License(s): CC0-1.0
Downloading amazon-questionanswer-dataset.zip to /content
 97% 415M/426M [00:09<00:00, 18.6MB/s]
100% 426M/426M [00:09<00:00, 48.8MB/s]


In [9]:
!unzip amazon-questionanswer-dataset.zip

Archive:  amazon-questionanswer-dataset.zip
  inflating: multi_answers.csv       
  inflating: multi_questions.csv     
  inflating: single_qna.csv          


In [None]:
# Let's install Kaggle CLI
!pip install --quiet kaggle

# Let's create a ~/.kaggle folder and move kaggle.json there
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!cp /content/kaggle.json /root/.kaggle/

# Let's permissions
!chmod 600 /root/.kaggle/kaggle.json

# Lets download the dataset from Kaggle



In [10]:
!ls

amazon-questionanswer-dataset.zip  multi_questions.csv	single_qna.csv
multi_answers.csv		   sample_data


In [12]:
import pandas as pd

df_multi_answers = pd.read_csv("multi_answers.csv")
df_multi_questions = pd.read_csv("multi_questions.csv")
df_single_qna = pd.read_csv("single_qna.csv")

print("=== multi_answers.csv ===")
display(df_multi_answers.head(3))

print("=== multi_questions.csv ===")
display(df_multi_questions.head(3))

print("=== single_qna.csv ===")
display(df_single_qna.head(3))


=== multi_answers.csv ===


Unnamed: 0,QuestionID,AnswerText,AnswererID,AnswerTime,AnswerType,AnswerScore
0,C1Q1,"It's all custom mounting, where there's a will...",AQZ8QLPPYA359,"December 29, 2014",,
1,C1Q1,You will need to drill another hole in Mud fla...,A246IDL7UXVCQO,"December 29, 2014",,
2,C1Q1,"It's been a while since I installed them, but ...",A3BWPG98KF0TAV,"July 19, 2013",?,0.5428


=== multi_questions.csv ===


Unnamed: 0,QuestionID,QuestionType,Category,AskerID,QuestionTime,QuestionText
0,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually
1,C1Q2,yes/no,Automotive,AXOOEUYEJ87ZB,"May 7, 2014",will they fit 2014 mazda 3 sport?
2,C1Q3,open-ended,Automotive,AN2AMELSNPN99,"June 20, 2014",Do they fit a 1998 GMC Sierra 3 door?


=== single_qna.csv ===


Unnamed: 0,QuestionType,Asin,AnswerTime,UnixTime,Question,AnswerType,Answer,Category
0,yes/no,B00004U9JP,"Jun 27, 2014",1403852000.0,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.,Appliances
1,open-ended,B00004U9JP,"Apr 28, 2014",1398668000.0,model number,,This may help InSinkErator Model BADGER-1: Bad...,Appliances
2,yes/no,B00004U9JP,"Aug 25, 2014",1408950000.0,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...,Appliances


In [13]:
# Here's an example join on "QuestionID"
df_joine


Joined DataFrame shape: (4019744, 11)


Unnamed: 0,QuestionID,QuestionType,Category,AskerID,QuestionTime,QuestionText,AnswerText,AnswererID,AnswerTime,AnswerType,AnswerScore
0,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually,"It's all custom mounting, where there's a will...",AQZ8QLPPYA359,"December 29, 2014",,
1,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually,You will need to drill another hole in Mud fla...,A246IDL7UXVCQO,"December 29, 2014",,
2,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually,"It's been a while since I installed them, but ...",A3BWPG98KF0TAV,"July 19, 2013",?,0.5428
3,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually,1 pair rear flaps and mounting hardware.,1,"July 19, 2013",?,0.5565
4,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually,I didn't buy these for myself I bought them fo...,A1MGZTOLD2C0VS,"July 19, 2013",?,0.4623


In [16]:
def create_qa_chunk(row):
    question = str(row["QuestionText"])
    answers_list = row["AnswerText"]

    # Casting each answer to string
    answers_list = [str(ans) for ans in answers_list]

    answers_str = "\n- " + "\n- ".join(answers_list)
    combined_text = f"Q: {question}\nAnswers:{answers_str}"
    return combined_text


In [17]:
def create_qa_chunk(row):
    question = str(row["QuestionText"])
    answers_list = row["AnswerText"]

    # Replace NaN floats with a placeholder as we walk through data processing step
    answers_list = [str(ans) if pd.notna(ans) else "No Answer" for ans in answers_list]

    answers_str = "\n- " + "\n- ".join(answers_list)
    combined_text = f"Q: {question}\nAnswers:{answers_str}"
    return combined_text


In [18]:
# Convert AnswerText to string, replacing NaN
df_joined["AnswerText"] = df_joined["AnswerText"].fillna("No Answer").astype(str)


In [19]:
# 1. Group all answers for each question
df_grouped = (
    df_joined.groupby(["QuestionID", "QuestionText"])["AnswerText"]
    .apply(list)
    .reset_index()
)

# 2. Let's combine them into one chunk of text
def create_qa_chunk(row):
    question = str(row["QuestionText"])
    answers_list = row["AnswerText"]
    answers_str = "\n- " + "\n- ".join(answers_list)
    combined_text = f"Q: {question}\nAnswers:{answers_str}"
    return combined_text

df_grouped["combined_text"] = df_grouped.apply(create_qa_chunk, axis=1)
df_grouped.head()


Unnamed: 0,QuestionID,QuestionText,AnswerText,combined_text
0,C10Q1,Would a sennheiser HMD280-13 and this adapter ...,[I took a photo: <http://imgur.com/G48f1C4>I b...,Q: Would a sennheiser HMD280-13 and this adapt...
1,C10Q10,I just received my first tube/stick of Fast Fr...,[I've never actually used guitar honey but I w...,Q: I just received my first tube/stick of Fast...
2,C10Q100,Are these in a pair?,[You need to use speaker cables....NOT guitar ...,Q: Are these in a pair?\nAnswers:\n- You need ...
3,C10Q1000,Can I use such a LED strip (to replace old inc...,[The specifications for the 5Meter LED light s...,Q: Can I use such a LED strip (to replace old ...
4,C10Q1001,What is the width of the string spacing at the...,"[Black pegs. Thank you,AdamGTRSTORE646-460-847...",Q: What is the width of the string spacing at ...


In [20]:
!pip install --quiet langchain==0.0.137 faiss-cpu sentence-transformers


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/153.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.3/518.3 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following depe

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# 1. Now let's pick an embedding model from Hugging Face
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# 2. Converting combined_text into a list
docs = df_grouped["combined_text"].tolist()

# 3. Building FAISS index
vectorstore = FAISS.from_texts(docs, hf_embeddings)
print("FAISS index created with", len(docs), "entries.")


FAISS index created with 172617 entries.


In [23]:
query = "Will these mud flaps fit a Ford F150?"
results = vectorstore.similarity_search(query, k=2)

for i, doc in enumerate(results):
    print(f"--- Result {i+1} ---")
    print(doc.page_content)
    print()

--- Result 1 ---
Q: will these work with mud flaps?
Answers:
- Mine were OEM Ford liners. Included all new hardware too.
- Ford FactoryVery good fit
- These are OEM not aftermarket
- Ford OEM accessory.
- I think so, but they couldn't fit more perfectly. I would definately recommend them.
- They are factory wheel liners from ford.  They fit and work perfectly.
- Original Ford factory liners
- They are Ford OEM.
- they are original ford.  You can order a truck with them installed or the dealer sells and installs the same part.
- Yes they're aftermarket.
- Yes, I have a 2013 F150 Platinum model.  They fit perfectly, and they are easy to install without removing the wheels.
- Yes they do ...
- Yes, just put a set on my 2013 f150...Matt
- Yes, they are on mine.
- Yes, they will fit very well. Had to jack up the back of the truck from under the trailer hitch for acsess. Do it on a warm day, you don't want hard plastic.
- YES THEY WILL!THANKS
- yes
- Yes they will fit... 2014 is the last yea

In [29]:
# Ignoring this snippet as it is a debugging step
# from huggingface_hub import login
# login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
!pip install huggingface_hub
from huggingface_hub import login

login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

MODEL_ID = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto"
)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Device set to use cuda:0


In [36]:


def ask_model(question: str) -> str:
    """
    A placeholder function that sends 'question' to the model/pipeline
    and returns the response as a string.
    """
    # If using Transformers pipeline directly:
    # result = falcon_pipeline(question)[0]["generated_text"]
    # return result

    # OR if using a chain:
    # result = chain.run(question)
    # return result

    # Demo stub: just echo the question
    return "Demo response for: " + question

# A list of queries to test various capabilities of the model
test_queries = [
    # General Knowledge
    "Who was the 16th President of the United States, and what were his main accomplishments?",
    # Reasoning & Problem-Solving
    "If a car travels at 60 mph, how long does it take to go 180 miles?",
    # Summarization
    "Summarize the following text: 'Large language models are transforming how we build AI applications.'",
    # Creative Prompt
    "Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words.",
    # Technical Explanation
    "Explain what a binary search algorithm is and how it works in simple terms.",
    # Multi-step Instructions
    "Give me a step-by-step guide to baking chocolate chip cookies.",
    # Opinion/Argumentation
    "Should governments prioritize spending on space exploration or healthcare? Provide arguments for both sides.",
    # Code Generation
    "Write a Python function to check if a string is a palindrome.",
    # Domain Knowledge
    "What is photosynthesis, and why is it important for life on Earth?",
    # Additional
    "Explain the concept of quantum mechanics to a 10-year-old."
]

# Now let's loop through each query, ask the model, and print the answer
for idx, query in enumerate(test_queries, 1):
    print(f"--- Query {idx}: {query}")
    response = ask_model(query)
    print("Model Response:", response, "\n")


--- Query 1: Who was the 16th President of the United States, and what were his main accomplishments?
Model Response: Demo response for: Who was the 16th President of the United States, and what were his main accomplishments? 

--- Query 2: If a car travels at 60 mph, how long does it take to go 180 miles?
Model Response: Demo response for: If a car travels at 60 mph, how long does it take to go 180 miles? 

--- Query 3: Summarize the following text: 'Large language models are transforming how we build AI applications.'
Model Response: Demo response for: Summarize the following text: 'Large language models are transforming how we build AI applications.' 

--- Query 4: Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words.
Model Response: Demo response for: Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words. 

--- Query 5: Explain what a binary search algorithm is and how it works in s

In [37]:
from transformers import pipeline

falcon_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.7
)


Device set to use cuda:0


In [38]:
# Let's pick some general questions as example queries
test_queries = [
    "Who was the 16th President of the United States, and what were his main accomplishments?",
    "If a car travels at 60 mph, how long does it take to go 180 miles?",
    "Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words.",
    "Explain what a binary search algorithm is and how it works.",
    "Should governments prioritize spending on space exploration or healthcare? Provide arguments for both sides."
]

# Here's the unction to query the Falcon pipeline
def ask_falcon(question: str) -> str:
    # The pipeline returns a list of dicts
    output = falcon_pipeline(question)
    # Extract the generated text from the first dict
    return output[0]["generated_text"]

# Loop through queries, get model-generated answers
for idx, q in enumerate(test_queries, 1):
    print(f"--- Question {idx} ---")
    print("Q:", q)
    answer = ask_falcon(q)
    print("A:", answer)
    print()

    #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_009#


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


--- Question 1 ---
Q: Who was the 16th President of the United States, and what were his main accomplishments?


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


A: Who was the 16th President of the United States, and what were his main accomplishments?
The 16th President of the United States was Abraham Lincoln. His main accomplishments include abolishing slavery in the United States, preserving the Union during the Civil War, and implementing policies that led to economic recovery after the war.

--- Question 2 ---
Q: If a car travels at 60 mph, how long does it take to go 180 miles?


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


A: If a car travels at 60 mph, how long does it take to go 180 miles?
To calculate the time it takes to travel 180 miles at 60 mph, you can use the formula: Distance = Speed x Time. In this case, the distance is 180 miles and the speed is 60 mph. So, the time it takes is 180 miles ÷ 60 mph = 3 hours.

--- Question 3 ---
Q: Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


A: Write a short story about a time-traveling inventor who lands in ancient Egypt. Keep it under 200 words.
Dr. Aiden awoke to the sound of his time-traveling device, the Arcane Anomaly, emitting a faint humming. He rubbed his eyes, adjusting to the dim light filtering in through the window. Outside, the ancient Egyptian city of Thebes stretched before him, its temples and pyramids casting long shadows. With a curious mind and a pocket full of modern tools, Aiden ventured out to explore this ancient world. He met wise priests and learned the secrets of the afterlife, all while marveling at the marvels of this bygone era.

--- Question 4 ---
Q: Explain what a binary search algorithm is and how it works.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


A: Explain what a binary search algorithm is and how it works.
A binary search algorithm is a type of search algorithm that searches a sorted list or array for a specific value or element. It works by repeatedly checking the middle element of the list or array until the target value is found or the search is complete. The algorithm takes advantage of the fact that the list or array is sorted, as it eliminates the need to compare each element with the target value. The middle element is compared with the target value, and if they match, the algorithm returns the index of the target value. If the middle element is less than the target value, the algorithm proceeds to the left half of the list or array. If the middle element is greater than the target value, the algorithm proceeds to the right half of the list or array. This process is repeated until the target value is found or the search is complete.

--- Question 5 ---
Q: Should governments prioritize spending on space exploration or h

In [40]:
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Wrapping our pipeline in a LangChain LLM
llm = HuggingFacePipeline(pipeline=falcon_pipeline)

# Let's define how we want to prompt the model
prompt = PromptTemplate(
    input_variables=["question"],
    template="You are a helpful AI assistant. Answer the question: {question}"
)

chain = LLMChain(
    llm=llm,
    prompt=prompt
)

test_queries = [
    "What are the most cost-effective ways to set up a new small business office, including furniture, equipment, and supplies.",
    "How do I choose the right printer for a small or home office, balancing upfront cost vs. ongoing ink expenses."
]

for idx, q in enumerate(test_queries, 1):
    print(f"--- Question {idx} ---")
    print("Q:", q)
    # chain.run(...) passes the 'question' to our prompt template
    answer = chain.run({"question": q})
    print("A:", answer)
    print()


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


--- Question 1 ---
Q: What are the most cost-effective ways to set up a new small business office, including furniture, equipment, and supplies.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


A: 
There are several cost-effective ways to set up a new small business office. First, consider using used or refurbished furniture and equipment, which can often be found at a lower cost than new items. Second, look for deals on office supplies, such as discount stores or online retailers. Third, consider renting a shared office space instead of purchasing or leasing expensive commercial property. Finally, think creatively about repurposing existing items for office use, such as using a storage shed for file cabinets or using a bookshelf as a partition for a work area.

--- Question 2 ---
Q: How do I choose the right printer for a small or home office, balancing upfront cost vs. ongoing ink expenses.
A: 
When choosing a printer for a small or home office, consider the following factors: 1) Your printing needs - determine if you need a printer that can handle large volumes or if a single-use printer is sufficient. 2) Upfront cost - compare the cost of purchasing a printer versus the o