In [1]:
!pip install transformers datasets faiss-cpu torch


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-macosx_11_0_arm64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [9]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import faiss
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
    print("Using MPS (Apple GPU)")
else:
    device = torch.device("cpu")
    print("Using CPU")


Using MPS (Apple GPU)


In [11]:
# Load the Q&A dataset from the CSV file
df = pd.read_csv("qa_dataset.csv")

# Show first few rows to verify the dataset structure
df.head()


Unnamed: 0,Question,Answer
0,What is concrete made of?,"Concrete is made of cement, water, sand, and a..."
1,What is the role of a civil engineer on a cons...,"A civil engineer designs, supervises, and ensu..."
2,What does PPE stand for in construction?,"PPE stands for Personal Protective Equipment, ..."
3,What is the purpose of a foundation in constru...,A foundation distributes the weight of the str...
4,What is curing in concrete construction?,Curing is the process of maintaining moisture ...


In [12]:
# Initialize BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


In [15]:
# Function to encode text with BERT
def encode_text(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).numpy()
    return embeddings


In [16]:
# Encode the questions to obtain their embeddings
questions = df["Question"].tolist()  # Updated column name to 'Question'
question_embeddings = encode_text(questions, tokenizer, bert_model)

# Create FAISS index
dimension = question_embeddings.shape[1]  # BERT output dimension
index = faiss.IndexFlatL2(dimension)
index.add(np.array(question_embeddings))  # Add embeddings to FAISS index

# Verify index size
print(f"Number of vectors in the FAISS index: {index.ntotal}")


Number of vectors in the FAISS index: 123


In [17]:
# Function to retrieve the most relevant answer from the dataset
def retrieve_answer(query, tokenizer, model, index, df, top_k=1):
    query_embedding = encode_text([query], tokenizer, model)
    D, I = index.search(query_embedding, top_k)  # D: distances, I: indices of the top_k closest questions
    answer = df.iloc[I[0][0]]['Answer']  # Updated column name to 'Answer'
    return answer


In [18]:
# Example query
query = "How can I build a home?"

# Retrieve the answer
answer = retrieve_answer(query, tokenizer, bert_model, index, df)
print(f"Answer: {answer}")


Answer: To build a house, you need to secure land, get necessary permits, hire architects and contractors, lay the foundation, construct walls and roof, install utilities (plumbing, electrical), and finish with interior and exterior finishes.


In [19]:
# Save FAISS index and BERT model for later use
faiss.write_index(index, "qa_faiss.index")
bert_model.save_pretrained("bert_qa_model")
tokenizer.save_pretrained("bert_qa_tokenizer")


('bert_qa_tokenizer/tokenizer_config.json',
 'bert_qa_tokenizer/special_tokens_map.json',
 'bert_qa_tokenizer/vocab.txt',
 'bert_qa_tokenizer/added_tokens.json')

In [20]:
# Load the saved FAISS index and BERT model
index = faiss.read_index("qa_faiss.index")
bert_model = BertModel.from_pretrained("bert_qa_model")
tokenizer = BertTokenizer.from_pretrained("bert_qa_tokenizer")


In [25]:
# Example query to test the loaded model
query = "how to build a home What should I do get injured on a construction site?and how to build a home"

# Retrieve the answer
answer = retrieve_answer(query, tokenizer, bert_model, index, df)
print(f"Answer: {answer}")


Answer: In case of a fire, activate the alarm, evacuate immediately, call emergency services, and do not attempt to fight the fire unless it is small and manageable.


In [26]:
from transformers import BartForConditionalGeneration

# Load a pre-trained BART model for generation
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Function to generate answers with BART
def generate_answer(query, tokenizer, model):
    inputs = tokenizer(query, return_tensors="pt")
    summary_ids = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)
    generated_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return generated_text

# Test the generation model
query = "What is a gable roof?"
answer = generate_answer(query, tokenizer, model)
print(f"Generated Answer: {answer}")


Generated Answer: [unused1] ottawa is a gable roof?? [unused735]bby? [unused939] roof? [unused134]? 12? m? is a what is a is a worsened roof is a roof? g? [unused3] what is [unused1]
