In [42]:
import os
import pandas as pd
# API keys are read from environment variables; each name is None when its variable is unset.
places_api_key = os.environ.get('places_api_key')
rapid_api_key = os.environ.get('rapid_api_key')

## 2. Data Integration Workflow

### Local Attractions and Activities Databases

In [32]:
# Google Places API
import requests

def fetch_google_places(activity, api_key, city="barcelona", timeout=10):
    """Search the Google Places Text Search endpoint for places matching an activity.

    Parameters
    ----------
    activity : str
        Free-text description of the activity; sent as the query "<activity> in <city>".
    api_key : str
        Google Places API key, sent as the ``key`` request parameter.
    city : str, optional
        City appended to the query. Defaults to "barcelona".
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Defaults to 10.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'address': ..., 'rating': ...}`` dict per entry in the
        response's ``results`` list; a field is None when the API omits it.
        An empty list is returned (and a message printed) when the request fails
        or the HTTP status is not 200.
    """
    url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    params = {
        'query': f'{activity} in {city}',
        # Use the key supplied by the caller rather than a notebook-level global.
        'key': api_key,
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching data from Google Places API: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data from Google Places API: {response.status_code}")
        return []

    places = response.json().get('results', [])
    return [
        {
            # .get() on every field so one incomplete result cannot abort the whole batch.
            'name': place.get('name'),
            'address': place.get('formatted_address'),
            'rating': place.get('rating'),
        }
        for place in places
    ]


In [33]:
# Activity categories mapped to the sub-activities listed in each category's search query.
# NOTE(review): the key "dinning" (probably meant "dining") and the leading space in
# " Wineries and breweries" are kept exactly as they are; confirm before changing them,
# since these strings end up in the search text and the key is stored as the activity label.
activity_dict = {
    "sightseeing": [
        "Historical Sites",
        "Iconic Landmarks",
        "City Tours",
        "Amusement parks",
    ],
    "dinning": [
        "Local Cuisine",
        "Food Tours",
        "Cooking Classes",
        " Wineries and breweries",
    ],
    "adventure": [
        "Water Sports",
        "Extreme Sports",
        "Winter Sports",
        "Cycling and Biking",
    ],
    "relaxation": [
        "Spas",
        "Wellness Centers",
        "Yoga and Meditation",
        "Cruises",
    ],
    "culture": [
        "Museums",
        "Galleries",
        "Theater Performances",
        "Cultural Tours",
    ],
}

In [35]:
# Query Google Places once per activity category and keep the results keyed by category.
result_dict = {}
for category, sub_activities in activity_dict.items():
    # Join every listed sub-activity instead of indexing positions 0..3, so a category
    # with fewer than four entries no longer raises IndexError and extra entries are not dropped.
    activity = f"{category} activities including {', '.join(sub_activities)}"
    result_dict[category] = fetch_google_places(activity, places_api_key)

In [43]:
# Flatten the per-category results into one list of records and load it into a DataFrame.
result_list = []

for category, spots in result_dict.items():
    for spot in spots:
        # Tags the dict in place: the same objects remain referenced from result_dict.
        spot.update(activity=category)
    result_list.extend(spots)

result_df = pd.DataFrame(result_list)

In [50]:
# Write the activities table to disk without the DataFrame index column.
result_df.to_csv(path_or_buf='../data/activities.csv', index=False)

### Travel Guides and Blogs

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'FILL_WITH_URL'  # placeholder: replace with the blog page to scrape

if url == 'FILL_WITH_URL':
    # Skip rather than let the request fail on the placeholder, so a full re-run keeps going.
    print("Set `url` to a real blog page before running this cell.")
else:
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, timeout=10)
    # Stop on HTTP errors instead of parsing an error page as if it were blog text.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract blog content
    blogs = soup.find_all('p')  # Assuming blog content is wrapped in <p> tags
    for blog in blogs:
        print(blog.text)

### Although this script fetched some blog posts, I had to scrape additional data manually. The resulting blog data is therefore stored in the `travel_iterenary.json` file.

## Load Data

In [None]:
# Load the activities table from the CSV file in the data directory.
activities_df = pd.read_csv(filepath_or_buffer="../data/activities.csv")

In [None]:
import json  # stdlib; imported here because no earlier visible cell imports it

file_path = "../data/travel_iterenary.json"

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on the
# platform default, which differs between operating systems.
with open(file_path, "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)
    
def extract_text(data_entry):
    """Flatten one raw itinerary record into the two text fields that get indexed.

    Parameters
    ----------
    data_entry : dict
        Raw record. Must contain an 'Iterenary' list of day dicts, each with a
        'Plan' string; 'duration' is optional and may be a string or a number.

    Returns
    -------
    dict
        ``{"itinerary": "Travel itinerary for <duration> days.", "plan": <text>}``
        where ``plan`` is every day's 'Plan' followed by a newline, in order.
        Both keys are always present, even when 'Iterenary' is empty.

    Raises
    ------
    KeyError
        If 'Iterenary' is missing from the record or 'Plan' is missing from a day.
    """
    # f-string formatting accepts a numeric duration, where string concatenation would raise TypeError.
    itinerary = f"Travel itinerary for {data_entry.get('duration', '')} days."

    # Built outside any per-day loop so a record with zero days still yields both keys.
    plan = "".join(day['Plan'] + "\n" for day in data_entry['Iterenary'])

    return {"itinerary": itinerary, "plan": plan}
# Convert every raw record into its indexable document.
docs_fin = list(map(extract_text, docs_raw))

## Ingest Data

### ElasticSearch

In [None]:
# Sanity check: request the Elasticsearch root endpoint on localhost:9200 with curl
# (IPython `!` shell escape) to confirm the server is reachable before connecting.
!curl http://localhost:9200/

In [None]:
from elasticsearch import Elasticsearch
# Connect to the local Elasticsearch node; ending the cell with info() displays the
# server's response, which doubles as a connection check.
es_url = "http://localhost:9200"
es_client = Elasticsearch(es_url)
es_client.info()

In [None]:
# Name of the index that holds the plain-text itinerary documents.
index_name = "tip-index"

# Single shard, no replicas; both document fields are mapped as full-text fields.
index_mapping = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "itinerary": {"type": "text"},
            "plan": {"type": "text"},
        },
    },
}

# Delete any existing index of this name so it can be created afresh;
# ignore_unavailable=True keeps this from failing when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

In [None]:
# Create the index named by `index_name`, passing `index_mapping` (settings + mappings)
# as the request body.
es_client.indices.create(index=index_name, body=index_mapping)

In [None]:
# Send each itinerary document to the index named by `index_name`, one request per
# document, with tqdm wrapping the list to show progress.
for itinerary_doc in tqdm(docs_fin):
    es_client.index(index=index_name, document=itinerary_doc)

### ElasticSearch as VectorDB

In [None]:
# Build the documents for the vector index: each one is a shallow copy of a docs_fin
# entry plus an "itin_vector" embedding of its 'itinerary' text. Copying leaves docs_fin
# untouched, so re-running a cell that indexes docs_fin does not also send the vectors.
# NOTE(review): `model` is not defined in the visible part of this notebook; it is
# assumed to expose encode() returning an array with .tolist() — confirm where it is loaded.
doc_vec = [
    {**doc, "itin_vector": model.encode(doc['itinerary']).tolist()}
    for doc in docs_fin
]

In [None]:
# Name of the index that stores the documents together with their embedding.
# Note: this rebinds `index_name`, so any cell that uses `index_name` and is re-run
# after this one will target "vec-index".
index_name = "vec-index"

# Same text fields as before plus a 768-dimensional dense vector, indexed for
# cosine-similarity search.
vec_index_mapping = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "itinerary": {"type": "text"},
            "plan": {"type": "text"},
            "itin_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

# Delete any existing index of this name (no error if it is absent), then create it
# with the mapping above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=vec_index_mapping)

In [None]:
# Send each vector-carrying document to the index named by `index_name`, one request
# per document, with tqdm wrapping the list to show progress.
for vec_doc in tqdm(doc_vec):
    es_client.index(index=index_name, document=vec_doc)

### FAISS (Facebook AI Similarity Search)

In [None]:
# Build the text to embed for one document.
# NOTE(review): this reuses the name `extract_text`, replacing any function of that
# name defined earlier in the kernel.
def extract_text(data_entry):
    """Return the entry's 'itinerary' and 'plan' joined by a newline, stripped.

    A missing key is treated as an empty string; leading and trailing whitespace
    of the combined text is removed.
    """
    parts = [data_entry.get('itinerary', ''), data_entry.get('plan', '')]
    return "\n".join(parts).strip()

# One combined text per document (as produced by extract_text) to be embedded.
text_data = list(map(extract_text, docs_fin))

# Encode every text with the model and hold the result as a float32 numpy array,
# which is what gets added to the FAISS index below.
embeddings = np.array(model.encode(text_data), dtype='float32')

# Flat index using L2 (Euclidean) distance, sized to the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Write the populated index to disk for later use.
faiss.write_index(index, 'faiss_index_file.index')

print(f"Indexed {len(embeddings)} items into FAISS.")
