In [5]:
import requests

def fetch_wikivoyage_data(query):
    url = f"https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": query,
        "prop": "extracts",
        "explaintext": True,
    }
    response = requests.get(url, params = params)
    data = response.json()
    pages = data["query"]["pages"]
    for page_id, page_data in pages.items():
        return page_data.get("extract", "No data found")
    

In [6]:
travel_info = fetch_wikivoyage_data("Paris")
print(travel_info)

Paris (French pronunciation: [paʁi] ) is the capital and largest city of France. With an official estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. For its leading role in the arts and sciences, as well as its early and extensive system of street lighting, in the 19th century, it became known as the City of Light.
The City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intellig

In [7]:
def split_into_sections(text):
    sections = text.split("\n==")
    cleaned_sections = [sec.replace("==", "").strip() for sec in sections if sec.strip()]
    return cleaned_sections

documents = split_into_sections(travel_info)
for doc in documents:
    print(doc[:100])

Paris (French pronunciation: [paʁi] ) is the capital and largest city of France. With an official es
Etymology 

The ancient oppidum that corresponds to the modern city of Paris was first mentioned in 
History
= Origins =

The Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around th
= High and Late Middle Ages to Louis XIV =

By the end of the 12th century, Paris had become the pol
= 18th and 19th centuries =

Paris grew in population from about 400,000 in 1640 to 650,000 in 1780.
= 20th and 21st centuries =

By 1901, the population of Paris had grown to about 2,715,000. At the b
Geography
= Location =

Paris is located in northern central France, in a north-bending arc of the river Seine
= Climate =

Paris has an oceanic climate within the Köppen climate classification, typical of weste
Administration
= City government =

For almost all of its long history, except for a few brief periods, Paris was g
= Métropole du Grand Paris =

In January 2016, the Métropo

In [8]:
import re

In [9]:
def clean_text(text):
    # Remove pronunciation guides and references in square brackets
    text = re.sub(r"\[.*?\]", "", text)
    # Remove special characters and excessive whitespace
    text = re.sub(r"==+", "", text)  # Remove section markers
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with one
    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)  # Remove non-alphanumeric
    return text.strip()

cleaned_documents = [clean_text(doc) for doc in documents]

In [10]:
for doc in cleaned_documents:
    print(doc[:100])

Paris French pronunciation  is the capital and largest city of France. With an official estimated po
Etymology The ancient oppidum that corresponds to the modern city of Paris was first mentioned in th
History
Origins  The Parisii, a subtribe of the Celtic Senones, inhabited the Paris area from around the mid
High and Late Middle Ages to Louis XIV  By the end of the 12th century, Paris had become the politic
18th and 19th centuries  Paris grew in population from about 400,000 in 1640 to 650,000 in 1780. A n
20th and 21st centuries  By 1901, the population of Paris had grown to about 2,715,000. At the begin
Geography
Location  Paris is located in northern central France, in a northbending arc of the river Seine, who
Climate  Paris has an oceanic climate within the Kppen climate classification, typical of western Eu
Administration
City government  For almost all of its long history, except for a few brief periods, Paris was gover
Mtropole du Grand Paris  In January 2016, the Mtropole du 

In [11]:
def split_into_paragraphs(documents):
    all_paragraphs = []
    for text in documents:
        paragraphs = text.split("\n")  # Use single newline as delimiter
        paragraphs = [para.strip() for para in paragraphs if len(para.strip()) > 10]  # Relax length condition
        all_paragraphs.extend(paragraphs)
    return all_paragraphs

paragraphs = split_into_paragraphs(cleaned_documents)
print(f"Total paragraphs: {len(paragraphs)}")  # Check total count
for paragraph in paragraphs[:5]:
    print(paragraph)  # Preview first 5 paragraphs


Total paragraphs: 173
Paris French pronunciation  is the capital and largest city of France. With an official estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 41 sq mi, Paris is the fourthlargest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. For its leading role in the arts and sciences, as well as its early and extensive system of street lighting, in the 19th century, it became known as the City of Light.
The City of Paris is the centre of the ledeFrance region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19 of the population of France. The Paris Region had a nominal GDP of 765 billion US1.064 trillion when adjusted for PPP in 2021, the highest in the European Union. According to the Economist Intelli

In [12]:
def tag_sections(paragraphs):
    tagged_data = []
    for para in paragraphs:
        if "museum" in para.lower() or "art" in para.lower():
            tag = "Culture & Art"
        elif "transport" in para.lower() or "metro" in para.lower():
            tag = "Transportation"
        elif "history" in para.lower() or "roman" in para.lower():
            tag = "History"
        else:
            tag = "General"
        tagged_data.append({"tag": tag, "content": para})
    return tagged_data

tagged_paragraphs = tag_sections(paragraphs)
for data in tagged_paragraphs[:5]:
    print(data)  # Preview tagged data

{'tag': 'Culture & Art', 'content': "Paris French pronunciation  is the capital and largest city of France. With an official estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 41 sq mi, Paris is the fourthlargest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. For its leading role in the arts and sciences, as well as its early and extensive system of street lighting, in the 19th century, it became known as the City of Light."}
{'tag': 'General', 'content': 'The City of Paris is the centre of the ledeFrance region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19 of the population of France. The Paris Region had a nominal GDP of 765 billion US1.064 trillion when adjusted for PPP in 2021, the highest in the Eu

In [20]:
def structure_data(tagged_data):
    structured_data = []
    for data in tagged_data:
        tag = data["tag"]
        content = data["content"]
        structured_data.append({"tag": tag, "content": content})  # Flatten each document with its tag
    return structured_data

structured_data = structure_data(tagged_paragraphs)
for entry in structured_data:
    print(entry["tag"], len(entry["content"]))

Culture & Art 613
General 486
Transportation 626
Culture & Art 575
General 1105
History 475
History 719
General 131
History 682
Culture & Art 623
General 795
General 407
Culture & Art 526
General 449
General 728
Culture & Art 882
Culture & Art 902
History 899
General 385
Culture & Art 558
Culture & Art 1378
General 439
Culture & Art 497
Culture & Art 366
Culture & Art 597
Transportation 961
General 598
Culture & Art 1103
Culture & Art 505
Culture & Art 1179
General 316
General 184
Culture & Art 537
Transportation 658
General 293
General 302
General 404
General 456
General 308
General 14
History 1014
Culture & Art 445
General 733
Culture & Art 620
Transportation 570
General 19
Culture & Art 600
Culture & Art 424
General 1043
Culture & Art 515
Culture & Art 409
General 598
General 352
Culture & Art 664
General 560
Culture & Art 369
General 371
Culture & Art 1188
Culture & Art 1024
Culture & Art 416
General 555
Culture & Art 532
General 370
Culture & Art 693
General 270
Culture & Art 657


In [21]:
structured_data

[{'tag': 'Culture & Art',
  'content': "Paris French pronunciation  is the capital and largest city of France. With an official estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 41 sq mi, Paris is the fourthlargest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. For its leading role in the arts and sciences, as well as its early and extensive system of street lighting, in the 19th century, it became known as the City of Light."},
 {'tag': 'General',
  'content': 'The City of Paris is the centre of the ledeFrance region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19 of the population of France. The Paris Region had a nominal GDP of 765 billion US1.064 trillion when adjusted for PPP in 2021, the highest in

In [14]:
!pip install -U sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [15]:
pip install faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [24]:
#Generating Embeddings

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract documents and tags
documents = [entry['content'] for entry in structured_data] 
tags = [entry['tag'] for entry in structured_data]  

embeddings = model.encode(documents)

np.save("embeddings.npy", embeddings)
with open("tags.txt", "w") as f:
    f.writelines([f"{tag}\n" for tag in tags])


In [26]:
# Initialize FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save the FAISS index
faiss.write_index(index, "travel_index.faiss")
print("FAISS index created and saved.")

FAISS index created and saved.


In [27]:
#Query the Retriever
def query_faiss(query, model, index, documents):
    query_embedding = model.encode([query])

    # Search for the top 5 most similar documents
    distances, indices = index.search(query_embedding, 5)
    results = [documents[i] for i in indices[0]]
    return results

# Load FAISS index and embeddings
index = faiss.read_index("travel_index.faiss")
documents = np.load("embeddings.npy", allow_pickle=True)

# Query example
query = "What are some cultural landmarks in Paris?"
results = query_faiss(query, model, index, documents)
for result in results:
    print(result)

[ 8.80297795e-02  4.75664176e-02  4.06345613e-02 -3.33682299e-02
  6.22046217e-02  1.43154915e-02 -9.73886054e-04 -2.18058354e-03
  1.40293012e-03  3.31718056e-03 -3.05512659e-02 -4.08219621e-02
 -8.91585797e-02 -5.55065013e-02 -6.00134321e-02 -9.37565342e-02
 -3.14940093e-03 -4.84918319e-02  4.53695506e-02  1.44780567e-03
  4.30394970e-02 -2.34289505e-02  1.75301917e-02  1.52461845e-02
 -5.84957786e-02  3.58258113e-02 -3.46053503e-02 -1.38599770e-02
 -1.39928507e-02 -5.14954999e-02  1.70105435e-02  2.95079816e-02
 -1.60857011e-02  3.85809429e-02  4.28228155e-02 -2.64446810e-02
  3.72945294e-02  2.71246750e-02  3.09087895e-03  3.37954052e-02
 -3.80276777e-02 -5.92976063e-02 -7.45588914e-02  3.41760553e-02
 -1.11286435e-02 -4.81479336e-03  2.65130377e-03  4.84848283e-02
  1.97203998e-02 -3.38738337e-02  6.26016259e-02 -2.35964656e-02
 -7.54995570e-02 -8.66538137e-02 -3.92836370e-02  1.17537871e-01
  3.47257629e-02  3.85621227e-02  6.17331461e-05  4.90590297e-02
 -1.88097190e-02 -9.46909

In [1]:
#Set up the Generator(LLM)
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

def generate_response(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=200, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Based on these documents, suggest cultural activities in Paris:\n" + "\n".join(results)
response = generate_response(prompt, model, tokenizer)
print(response)

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

NameError: name 'results' is not defined

In [None]:
#Integrate the RAG Pipeline

def rag_pipeline(query, retriever, generator_model, tokenizer):
    results = query_faiss(query, retriever, index, documents)

    prompt = "Based on these documents, suggest travel recommendations:\n" + "\n".join(results)

    response = generate_response(prompt, generator_model, tokenizer)
    return response

# Example query
final_response = rag_pipeline("Suggest family-friendly activities in Paris", index, model, tokenizer)
print(final_response)

In [None]:
#Build an API for Interaction
from flask import Flask, request, jsonify
app = Flask(__name__)

@app.route("/recommend", methods=["POST"])
def recommend():
    query = request.json.get("query", "")
    response = rag_pipeline(query, index, model, tokenizer)
    return jsonify({"recommendation": response})

if __name__ == "__main__":
    app.run(debug=True)