<a href="https://colab.research.google.com/github/Annrosejojue/DataScience/blob/main/NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sentence-transformers faiss-cpu openai nltk


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


In [3]:
import nltk
# Fetch the NLTK tokenizer resources ('punkt' and 'punkt_tab') before
# nltk.sent_tokenize is called further down in the notebook.
nltk.download('punkt')
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
# Toy knowledge base: ten short factual passages that serve as the corpus
# the retrieval step searches over.
documents = [
    "The Eiffel Tower is located in Paris, France. It was built in 1889.",
    "The Great Wall of China stretches over 13,000 miles.",
    "Python is a popular programming language created by Guido van Rossum.",
    "Mount Everest is the tallest mountain in the world, located in the Himalayas.",
    "The Amazon rainforest is the largest rainforest on Earth.",
    "Shakespeare wrote many famous plays including Hamlet and Macbeth.",
    "The Moon orbits the Earth every 27 days.",
    "The Pacific Ocean is the largest ocean on Earth.",
    "Albert Einstein developed the theory of relativity.",
    "The Statue of Liberty is located in New York Harbor."
]

# Split every document into sentences; each sentence becomes one retrieval
# chunk. A comprehension keeps the loop variables out of the notebook's
# global namespace and makes the cell trivially re-runnable.
chunks = [
    sentence
    for document in documents
    for sentence in nltk.sent_tokenize(document)
]

print("Total chunks:", len(chunks))


Total chunks: 11


In [5]:
# Load the sentence-embedding model, then encode every chunk and store the
# vectors as a float32 NumPy array (the array later added to the FAISS index).
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = np.asarray(model.encode(chunks), dtype='float32')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Create a FAISS IndexFlatL2 sized to the embedding dimension
# (embeddings.shape[1]) and add one vector per chunk, so position i in the
# index corresponds to chunks[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
# Sanity check: the index should hold as many vectors as there are chunks.
print("Index size:", index.ntotal)


Index size: 11


In [7]:
# Embed a sample question with the same model used for the chunks.
query = "Where is the Eiffel Tower located?"
query_embedding = np.asarray(model.encode([query]), dtype='float32')

# Look up the three nearest chunks; row 0 of `indices` belongs to the single
# query and holds positions into `chunks`.
distances, indices = index.search(query_embedding, 3)
retrieved_chunks = [chunks[position] for position in indices[0]]
print("Retrieved:", retrieved_chunks)


Retrieved: ['The Eiffel Tower is located in Paris, France.', 'It was built in 1889.', 'The Statue of Liberty is located in New York Harbor.']


In [8]:
from transformers import pipeline

# Hugging Face question-answering pipeline backed by the
# distilbert-base-cased-distilled-squad checkpoint.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Concatenate the retrieved chunks into a single context string and ask the
# pipeline the same query that was used for retrieval.
context = " ".join(retrieved_chunks)
result = qa_pipeline(question=query, context=context)
print("Answer:", result["answer"])


Device set to use cpu


Answer: Paris, France


In [9]:
def heuristic_score(answer, context):
    """Rate how well `answer` is supported by `context` on a 1-5 scale.

    Both strings are lower-cased and split into word tokens (regex ``\\w+``).
    The score is driven by the fraction of distinct answer tokens that also
    occur in the context:

        > 0.8 -> 5,  > 0.5 -> 4,  > 0.3 -> 3,  > 0.1 -> 2,  otherwise 1

    All comparisons are strict, so a fraction of exactly 0.8 yields 4.
    An answer with no word tokens (e.g. the empty string) scores 1, and an
    answer whose words all appear in the context scores 5.

    Args:
        answer: Answer text to evaluate.
        context: Text the answer is expected to be grounded in.

    Returns:
        int: A score from 1 (little or no overlap) to 5 (near-complete overlap).
    """
    import re

    word_pattern = re.compile(r"\w+")
    answer_words = set(word_pattern.findall(answer.lower()))
    context_words = set(word_pattern.findall(context.lower()))

    shared_words = answer_words & context_words
    # Fall back to a denominator of 1 so a token-less answer gives 0.0
    # instead of dividing by zero.
    coverage = len(shared_words) / (len(answer_words) or 1)

    # (strict lower bound on coverage, resulting score), checked highest first.
    for lower_bound, points in ((0.8, 5), (0.5, 4), (0.3, 3), (0.1, 2)):
        if coverage > lower_bound:
            return points
    return 1

# Score the answer produced above against the context it was extracted from.
score = heuristic_score(result["answer"], context)
print("Score:", score)


Score: 5


In [10]:
def rag_pipeline(query, k=3):
    """Run retrieval, question answering and scoring for one query.

    Relies on the notebook-level objects `model` (sentence encoder), `index`
    (FAISS index over the chunk embeddings), `chunks` (texts aligned with the
    index), `qa_pipeline` and `heuristic_score`.

    Args:
        query: The natural-language question.
        k: Maximum number of chunks to retrieve. Values larger than the
            number of indexed vectors are clamped to the index size.

    Returns:
        dict with keys:
            "query": the input question,
            "answer": the QA pipeline's answer ("" if nothing was retrieved),
            "retrieved": the list of retrieved chunk texts,
            "score": the `heuristic_score` of the answer against the context.
    """
    # FAISS pads missing neighbours with label -1 when k exceeds the number
    # of indexed vectors; chunks[-1] would then silently inject the *last*
    # chunk into the context. Clamp k and drop any negative labels.
    top_k = min(k, index.ntotal)
    retrieved_chunks = []
    if top_k > 0:
        query_embedding = model.encode([query]).astype('float32')
        _distances, indices = index.search(query_embedding, top_k)
        retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]

    context = " ".join(retrieved_chunks)
    # With no context there is nothing to extract an answer from, so skip
    # the QA model and report an empty answer.
    answer = qa_pipeline(question=query, context=context)["answer"] if context else ""

    score = heuristic_score(answer, context)

    return {"query": query, "answer": answer, "retrieved": retrieved_chunks, "score": score}

# End-to-end check: run the whole pipeline on the sample question.
print(rag_pipeline("Where is the Eiffel Tower located?"))


{'query': 'Where is the Eiffel Tower located?', 'answer': 'Paris, France', 'retrieved': ['The Eiffel Tower is located in Paris, France.', 'It was built in 1889.', 'The Statue of Liberty is located in New York Harbor.'], 'score': 5}
