In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())

# Service credentials are read from the environment; each is None when unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [None]:
# Only confirm that the key was loaded. Echoing the secret itself would store it
# in the notebook's saved cell output.
bool(OPENAI_API_KEY)

In [None]:
# Only confirm that the key was loaded. Echoing the secret itself would store it
# in the notebook's saved cell output.
bool(PINECONE_API_KEY)

In [None]:
# Data paths
# Root folder for the datasets, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data')

# Folder holding the extracted landmark files. It is only read from later on;
# nothing is created here.
LANDMARKS_DIR = os.path.join(os.getcwd(), 'data', 'landmarks_extracted')


# Look for images in html files

In [None]:
import json
import pandas as pd
from typing import Dict
from bs4 import BeautifulSoup
import re

# Putting the data into DataFrames would be a good approach for several reasons:
 1. Easy data manipulation and analysis
 2. Built-in methods for handling missing values
 3. Efficient filtering and sorting
 4. Simple integration with other data processing libraries
 5. Good for structured data representation

-  We can create DataFrames for each type of data:
    - landmarks_df: to store landmark information
    - municipalities_df: to store municipality data 
    - news_df: to store news articles

- This will make it easier to:
    - Clean and preprocess the data
    - Extract relevant features
    - Prepare data for vector embeddings
    - Track metadata

- The problem with the landmark name is that sometimes it contains information about the town (like its name) and sometimes it doesn't.

- I might need to group the landmarks by town and then clean the names of the landmarks. How could I do this if not every landmark_name includes the town name?


## Using BeautifulSoup to scrape information from the landmark_texts[html] to get the coordinates and location

- Get metadata from HTML using BeautifulSoup



-  Scraping for towns, coordinates, contexts, key features

# Load corrected Dataset

In [None]:
# Load the corrected landmarks dataset (the path is relative to the working directory)
landmarks_df_corr = pd.read_csv('data/Landmark_Processd/landmarks_corrected_GPT3_LASTEST.csv')

In [None]:
# Remove the key_features and images columns from the loaded dataset.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
landmarks_df_corr = landmarks_df_corr.drop(columns=['key_features', 'images'], errors='ignore')

In [None]:
landmarks_df_corr.head(1)

In [None]:
len(landmarks_df_corr)

# Scrape for images in HTML

In [None]:
def extract_images_from_html(html_content: str, landmark_name: str) -> Dict:
    """Collect the images (and captions, where present) found in a landmark's HTML.

    Three passes are made over the parsed document: thumbnail containers
    (caption-aware), every <img> whose direct parent is not such a container,
    and every <img> inside a gallery element. The combined list is then
    de-duplicated by URL, keeping the first entry seen for each URL.

    Args:
        html_content (str): Raw HTML content
        landmark_name (str): Name of the landmark, copied into the result

    Returns:
        dict: {'name': landmark_name, 'images': [image-info dict, ...]}.
        When nothing is found, or an error occurs, a placeholder entry
        describing that situation is appended to 'images'.
    """
    wrapper_tags = ['figure', 'div']
    wrapper_classes = ['thumb', 'image', 'thumbinner']

    soup = BeautifulSoup(html_content, 'html.parser')
    result = {'name': landmark_name, 'images': []}

    def keep(image_data):
        # process_image returns None for rejected images; store only real hits
        if image_data:
            result['images'].append(image_data)

    try:
        # Run all three searches up front, then process each result set in turn
        image_containers = soup.find_all(wrapper_tags, class_=wrapper_classes)
        img_tags = soup.find_all('img')
        galleries = soup.find_all(['ul', 'div'], class_=['gallery', 'gallery mw-gallery-traditional'])

        # Pass 1: first image of each container, with the container for captions
        for container in image_containers:
            img = container.find('img')
            if img:
                keep(process_image(img, container))

        # Pass 2: images whose direct parent is not one of the containers above
        for img in img_tags:
            parent = img.parent
            inside_wrapper = parent.name in wrapper_tags and any(
                cls in parent.get('class', []) for cls in wrapper_classes
            )
            if not inside_wrapper:
                keep(process_image(img))

        # Pass 3: every image inside a gallery
        for gallery in galleries:
            for img in gallery.find_all('img'):
                keep(process_image(img))

        # De-duplicate by URL; a dict keeps the first entry per key in insertion order
        first_by_url = {}
        for entry in result['images']:
            first_by_url.setdefault(entry['url'], entry)
        result['images'] = list(first_by_url.values())

        # Nothing usable: record an explicit placeholder entry
        if not result['images']:
            result['images'].append({
                'url': 'None found',
                'width': 'N/A',
                'height': 'N/A',
                'alt_text': 'No images available for this landmark',
                'caption': 'No images found'
            })

    except Exception as e:
        # Best effort: report the failure and record it alongside anything collected so far
        print(f"Error extracting images for {landmark_name}: {str(e)}")
        result['images'].append({
            'url': 'Error occurred',
            'width': 'N/A',
            'height': 'N/A',
            'alt_text': f'Error processing images: {str(e)}',
            'caption': 'Error occurred while processing'
        })

    return result

def process_image(img, container=None):
    """Turn one <img> element into an image-info dict, or None if it is skipped.

    Args:
        img: Element exposing ``.get(attribute, default)`` for the ``src``,
            ``width``, ``height`` and ``alt`` attributes.
        container: Optional enclosing element; when given it is searched for a
            caption (``figcaption``/``div`` with class ``thumbcaption`` or
            ``caption``).

    Returns:
        dict with keys ``url``, ``width``, ``height``, ``alt_text`` and
        ``caption``, or None when the image has no ``src``, is 50px or smaller
        in either known dimension, is an SVG/GIF, has "icon" in its URL, or an
        unexpected error occurs.
    """
    def _as_int(value):
        """Return value as an int, or None when it is missing or not a plain integer."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    try:
        # Get image URL; without one there is nothing worth recording
        src = img.get('src', '')
        if not src:
            return None
        if src.startswith('//'):
            src = 'https:' + src

        # Fix duplicate path segments in Wikipedia URLs
        if '/wikipedia/commons/' in src:
            file_path = src.split('/commons/')[1]
            # dict.fromkeys drops repeated segments while keeping first-seen order
            file_path = '/'.join(dict.fromkeys(file_path.split('/')))
            src = f"https://upload.wikimedia.org/wikipedia/commons/{file_path}"

        # Get image dimensions and alt text
        width = img.get('width', '')
        height = img.get('height', '')
        alt_text = img.get('alt', '')

        # Look for caption
        caption = None
        if container:
            caption_elem = container.find(['figcaption', 'div'], class_=['thumbcaption', 'caption'])
            if caption_elem:
                caption = caption_elem.get_text(strip=True)

        # Only keep content images (skip icons, tiny thumbnails, etc.).
        # Dimensions that are missing or not plain integers (e.g. "100%") count
        # as unknown and do not disqualify the image.
        width_px = _as_int(width)
        height_px = _as_int(height)
        if width_px is not None and height_px is not None and (width_px <= 50 or height_px <= 50):
            return None

        lowered = src.lower()
        if lowered.endswith(('.svg', '.gif')) or 'icon' in lowered:
            return None

        return {
            'url': src,
            'width': width or 'unknown',
            'height': height or 'unknown',
            'alt_text': alt_text,
            'caption': caption
        }
    except Exception as e:
        # Best effort: a single malformed element must not abort the whole page
        print(f"Error processing image: {str(e)}")
        return None

# Process landmarks
landmarks_images_data = []

print("Processing landmarks for images...")
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and the sample printed below) the same from run to run.
for filename in sorted(os.listdir(LANDMARKS_DIR)):
    if not filename.endswith('.txt'):
        continue

    # Derive the landmark name from the file name: drop the extension,
    # parentheses and commas, turn hyphens into underscores, lowercase.
    landmark_name = (filename.replace('.txt', '')
                    .replace('(', '')
                    .replace(')', '')
                    .replace(',', '')
                    .replace('-', '_')
                    .lower())

    file_path = os.path.join(LANDMARKS_DIR, filename)
    try:
        # Read first, then parse, so the file handle is released before the HTML is processed
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        images_info = extract_images_from_html(html_content, landmark_name)
        landmarks_images_data.append(images_info)
    except Exception as e:
        # Best effort: report the file and carry on with the remaining ones
        print(f"Error processing {filename}: {str(e)}")

# Create DataFrame (one row per landmark: name + list of image dicts)
landmarks_images_df = pd.DataFrame(landmarks_images_data)

# Display results
print("\nDataFrame Shape:", landmarks_images_df.shape)
print("\nSample landmarks with images:")
print(f"Total landmarks processed: {len(landmarks_images_df)}")
print("\nSample of first landmark's images:")
if not landmarks_images_df.empty:
    first_landmark = landmarks_images_df.iloc[0]
    print(f"\nLandmark: {first_landmark['name']}")
    print(f"Number of images: {len(first_landmark['images'])}")
    if first_landmark['images']:
        print("\nFirst image details:")
        print(json.dumps(first_landmark['images'][0], indent=2))



In [None]:
# Save to CSV
#landmarks_images_df.to_csv('processed_landmarks_images.csv', index=False)

In [None]:
landmarks_images_df.head()

# Join datasets

In [None]:
# Left-join the scraped images onto the corrected landmarks using the name column,
# so every landmark row is kept even when no scraped images match it.
landmarks_df_corr_final = pd.merge(
    landmarks_df_corr,
    landmarks_images_df[['name', 'images']],
    how='left',
    on='name',
)

In [None]:
landmarks_df_corr_final.head(1)

In [None]:
# Save final landmarks DataFrame to CSV
#landmarks_df_corr_final.to_csv('landmarks_with_imagesgpt3_latest.csv', index=False)


In [None]:
# Side-by-side horizontal bar charts: landmark counts per primary and secondary category
import matplotlib.pyplot as plt
import seaborn as sns

# One row, two panels
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Left panel: primary categories
primary_counts = landmarks_df_corr['primary_category'].value_counts()
sns.barplot(x=primary_counts.values, y=primary_counts.index, ax=ax1)
ax1.set(title='Primary Categories', xlabel='Count')

# Right panel: secondary categories
secondary_counts = landmarks_df_corr['secondary_category'].value_counts()
sns.barplot(x=secondary_counts.values, y=secondary_counts.index, ax=ax2)
ax2.set(title='Secondary Categories', xlabel='Count')

plt.tight_layout()
plt.show()


## New DATASETS 
- Cultural Events
- Restaurants
- Hotels and Airbnbs
- Para la naturaleza and DRNA webpage
- Events

# Concerns
1) Do I need to spend time writing code to scrape the websites?
2) Since this is just for the project, focus on finding a static list of datasets (Hotels, Airbnb, Restaurants and Events)

- I will have to focus on writing many good prompts so I always keep control of the conversation when the user interacts with the chatbot.
- If I don't find a good dataset of hotels and Airbnb, make sure the prompt is set up to provide the user a link to Booking and Airbnb.
- Similarly for events and activities, go to "https://app.voyturisteando.com/directorio"
- Provide similar tools that can help the user improve the planning: https://mapa.plateapr.com/
- Can an agent help look for nearby spots based on the location of the user?

In [None]:
landmarks_df_corr_final.head(2)

In [None]:
landmarks_df_corr_final.info()

## Clean NaNs - CSV file

In [None]:
def fill_missing_values(df):
    """
    Fill missing or blank values in DataFrame with specific placeholders.

    Args:
        df: Landmarks DataFrame. It is not modified; a copy is filled.

    Returns:
        A new DataFrame in which, for every column that has a placeholder
        defined below, NaN/None values and empty or whitespace-only strings
        are replaced by that placeholder. latitude/longitude are additionally
        coerced to numeric (unparseable values become the placeholder).
        Columns without a placeholder are returned unchanged.

    Note:
        The 0.0 placeholder for latitude/longitude is itself a valid
        coordinate, so consumers must treat (0.0, 0.0) as "unknown".
    """
    # Placeholder value for each known column
    placeholders = {
        'name': "Unnamed Landmark",
        'town': "Location to be verified",
        'latitude': 0.0,
        'longitude': 0.0,
        'content': "No detailed description available",
        'direction': "N/A",
        'primary_category': "Uncategorized",
        'secondary_category': "General",
        'website': "No website listed",
        'visit_duration': "Visit duration varies",
        'hours': "Contact location for current hours",
        'admission': "Contact location for current prices",
        'chatbot_tags': str(["needs_update"]),  # Convert to string representation
        'images': str([])  # Convert to string representation
    }
    numeric_columns = ('latitude', 'longitude')

    # Create a copy to avoid modifying original
    filled_df = df.copy()

    for column in filled_df.columns:
        # Columns we have no placeholder for (e.g. a stray index column) are left as they are
        if column not in placeholders:
            continue
        placeholder = placeholders[column]

        if column in numeric_columns:
            # Coercion turns blanks and unparseable text into NaN, which is then filled
            filled_df[column] = pd.to_numeric(filled_df[column], errors='coerce').fillna(placeholder)
        else:
            series = filled_df[column]
            # Treat empty / whitespace-only strings as missing; non-string
            # values such as lists are never considered blank
            is_blank = series.map(lambda value: isinstance(value, str) and not value.strip()).astype(bool)
            filled_df[column] = series.mask(is_blank).fillna(placeholder)

    return filled_df

# Run the cleaner on the merged landmarks frame
cleaned_landmarks_df = fill_missing_values(landmarks_df_corr_final)

# Per-column null counts, before and after, to confirm the fill worked
print("\nMissing values before cleaning:")
print(landmarks_df_corr_final.isna().sum())

print("\nMissing values after cleaning:")
print(cleaned_landmarks_df.isna().sum())

# Spot check: first cleaned row
print("\nSample of cleaned data:")
print(cleaned_landmarks_df.head(1))



In [None]:
cleaned_landmarks_df.head()

In [None]:
# Optional: save cleaned DataFrame
#cleaned_landmarks_df.to_csv('landmarks_with_imagesgpt3_latest_fill_NANS.csv', index=False)

# Organize data for Vector Database

In [None]:
def _parse_literal(text):
    """Safely parse a string holding a Python literal; return None if it is not one."""
    import ast  # local import: only this helper needs it

    try:
        # literal_eval accepts literals only, so cell text can never execute code
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def _parse_chatbot_tags(raw):
    """Return the tags of one row as a list of clean, non-empty strings."""
    if isinstance(raw, (list, tuple)):
        tags = [str(tag).strip() for tag in raw]
    elif isinstance(raw, str):
        parsed = _parse_literal(raw)
        if isinstance(parsed, (list, tuple)):
            # e.g. "['history', 'fort']" or '["history", "fort"]'
            tags = [str(tag).strip() for tag in parsed]
        else:
            # Fallback for loosely formatted text such as: [beach, "family"]
            unbracketed = raw.replace('[', '').replace(']', '')
            tags = [tag.strip().strip('"\'').strip() for tag in unbracketed.split(',')]
    else:
        tags = []
    return [tag for tag in tags if tag]


def _parse_images(raw):
    """Return the images of one row as a list (empty when missing or unparseable)."""
    if isinstance(raw, str):
        raw = _parse_literal(raw)
    return raw if isinstance(raw, list) else []


def create_structured_landmarks_data(landmarks_df):
    """Convert landmarks DataFrame into structured format optimized for vector database

    Args:
        landmarks_df: DataFrame with the columns name, town, latitude,
            longitude, content, direction, primary_category,
            secondary_category, website, visit_duration, hours, admission,
            chatbot_tags and images. chatbot_tags/images may hold real lists
            or their string representation.

    Returns:
        list of dicts, one per successfully processed row, with the keys
        landmark_name, coordinates, location, details, content, images,
        metadata and text_for_embedding. Rows that raise are reported with
        print() and skipped.
    """
    # URLs used by the image scraper's "nothing found" / "error" placeholder
    # entries; they are not real images
    placeholder_urls = {'None found', 'Error occurred'}

    structured_data = []

    for _, row in landmarks_df.iterrows():
        try:
            chatbot_tags = _parse_chatbot_tags(row['chatbot_tags'])
            landmark_images = _parse_images(row['images'])

            # Only entries with a usable URL count as images. The stored
            # 'images' list itself is kept as parsed.
            real_images = [
                img for img in landmark_images
                if isinstance(img, dict) and img.get('url') and img['url'] not in placeholder_urls
            ]

            # Get up to 3 image URLs for the text embedding
            image_urls = [img['url'] for img in real_images[:3]]

            # Create image URLs text section
            image_urls_text = "\n    ".join(image_urls) if image_urls else "No images available"

            # Create the structured entry
            entry = {
                'landmark_name': row['name'],
                'coordinates': {
                    'latitude': row['latitude'],
                    'longitude': row['longitude']
                },
                'location': {
                    'town': row['town'],
                    'direction': row['direction']
                },
                'details': {
                    'primary_category': row['primary_category'],
                    'secondary_category': row['secondary_category'],
                    'visit_duration': row['visit_duration'],
                    'hours': row['hours'],
                    'admission': row['admission'],
                    'website': row['website']
                },
                'content': row['content'],
                'images': landmark_images,
                'metadata': {
                    'has_images': len(real_images) > 0,
                    'chatbot_tags': chatbot_tags
                },
                'text_for_embedding': f"""
                    Landmark: {row['name']}
                    
                    Location: {row['town']}, {row['direction']} Puerto Rico
                    Coordinates: Latitude {row['latitude']}, Longitude {row['longitude']}
                    
                    Category: {row['primary_category']} - {row['secondary_category']}
                    
                    Description: {row['content']}
                    
                    Visit Information:
                    Duration: {row['visit_duration']}
                    Hours: {row['hours']}
                    Admission: {row['admission']}
                    
                    Website: {row['website']}
                    
                    Tags: {chatbot_tags}
                    
                    Images ({len(real_images)} available):
                    {image_urls_text}
                """.strip()
            }

            structured_data.append(entry)

        except Exception as e:
            # Best effort: report the row and continue. row.get avoids a
            # second failure when the name column itself is missing.
            print(f"Error processing row: {row.get('name', '<unknown>')}")
            print(f"Error details: {str(e)}")
            continue

    return structured_data

In [None]:
# Create structured data
structured_landmarks = create_structured_landmarks_data(cleaned_landmarks_df)

# Print sample to verify
print(f"Processed {len(structured_landmarks)} landmarks")
# Rows that fail are skipped by create_structured_landmarks_data, so the list
# can be empty; guard the [0] lookup instead of raising IndexError.
if structured_landmarks:
    print("\nSample structured entry:")
    # default=str lets values that are not JSON-native still be previewed
    print(json.dumps(structured_landmarks[0], indent=2, default=str))

In [None]:
# Convert structured data to DataFrame: one row per structured entry, with the
# entry's top-level keys as columns
structured_df = pd.DataFrame(structured_landmarks)

# Save to JSON for vector database processing
#with open('processed_landmarks_gpt3_images_STRUCTURED_FILL_NANS.json', 'w', encoding='utf-8') as f:
#    json.dump(structured_landmarks, f, indent=2, ensure_ascii=False)

# Save DataFrame to CSV
#structured_df.to_csv('processed_landmarks_gpt3_images_STRUCTURED_FILL_NANS.csv', index=False)

In [None]:
structured_df.head()

In [None]:
# Save DataFrame to CSV (written to the current working directory, without the index)
# NOTE(review): the nested dict/list columns are presumably written as their
# Python string form, so they would need parsing again when the CSV is read
# back. Confirm CSV is the intended interchange format.
structured_df.to_csv('processed_landmarks_gpt3_images_STRUCTURED_FILL_NANS.csv', index=False)