In [None]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

# Read both API credentials from the process environment.
# Each name is bound to None when its variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [None]:
# Show only whether the key is set: echoing the raw value would store the
# secret in the notebook's saved output.
bool(OPENAI_API_KEY)

In [None]:
# Show only whether the key is set: echoing the raw value would store the
# secret in the notebook's saved output.
bool(PINECONE_API_KEY)

In [None]:
# Data paths
# Base data directory: the 'data' folder under the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data')

# Zip archive paths (currently disabled)
#LANDMARKS_ZIP = os.path.join(DATA_DIR, 'landmarks.zip')
#MUNICIPALITIES_ZIP = os.path.join(DATA_DIR, 'municipalities.zip')
#NEWS_ZIP = os.path.join(DATA_DIR, 'elmundo_chunked_en_page1_15years.zip')

# Directories holding the extracted files, one per dataset, all under DATA_DIR.
LANDMARKS_DIR, MUNICIPALITIES_DIR, NEWS_DIR = (
    os.path.join(DATA_DIR, subdirectory)
    for subdirectory in (
        'landmarks_extracted',
        'municipalities_extracted',
        'news_extracted',
    )
)

# Putting the data into DataFrames would be a good approach for several reasons:
 1. Easy data manipulation and analysis
 2. Built-in methods for handling missing values
 3. Efficient filtering and sorting
 4. Simple integration with other data processing libraries
 5. Good for structured data representation

-  We can create DataFrames for each type of data:
    - landmarks_df: to store landmark information
    - municipalities_df: to store municipality data 
    - news_df: to store news articles

- This will make it easier to:
    - Clean and preprocess the data
    - Extract relevant features
    - Prepare data for vector embeddings
    - Track metadata

In [None]:
import pandas as pd
# Read every '.txt' file in LANDMARKS_DIR into two parallel lists:
# landmark_names[i] is the file name without its extension and
# landmark_texts[i] is that file's full text.
landmark_names = []
landmark_texts = []

try:
    # os.listdir() returns entries in arbitrary order; sorting keeps the two
    # lists in the same order on every run and every machine.
    landmark_files = sorted(
        entry for entry in os.listdir(LANDMARKS_DIR) if entry.endswith('.txt')
    )
except Exception as e:
    print(f"Error accessing directory {LANDMARKS_DIR}: {str(e)}")
    landmark_files = []

for filename in landmark_files:
    file_path = os.path.join(LANDMARKS_DIR, filename)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except PermissionError:
        print(f"Permission denied for file: {file_path}")
    except Exception as e:
        print(f"Error processing file {filename}: {str(e)}")
    else:
        # Append to both lists only after a successful read so that the
        # names and texts always stay aligned index-for-index.
        landmark_names.append(filename[:-4])  # drop the '.txt' suffix
        landmark_texts.append(html_content)


- The problem with the landmark name is that it sometimes contains information about the town (like its name) and sometimes it doesn't.

- I might need to group the landmarks by town and then clean the names of the landmarks. How could I do this if not every landmark_name includes the town name?


## Load LLM-enhanced data (latitude/longitude) and add images

In [None]:
import pandas as pd
# Load the municipality table from its CSV file into a DataFrame.
municipalities_df_enhanced = pd.read_csv(
    filepath_or_buffer='data/municipalities_extracted/Proccessed/municipalities_data_corr_gpt3.csv'
)

## LET'S PROCESS THE MUNICIPALITIES DATASET





# 1) LOOK FOR IMAGES

In [None]:
import os
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import logging
import json
import pandas as pd

def extract_urls_from_html(html_content, base_url="https://en.wikipedia.org"):
    """
    Extract image, meta-content and related-page URLs from HTML (max 2 per category).

    Args:
        html_content (str): Raw HTML content
        base_url (str): Base URL used to resolve URLs that start with '/'

    Returns:
        dict: Keys 'images', 'content' and 'related_pages'. Each maps to a list
            of at most 2 unique URLs, in the order they appear in the document.
            All three lists are empty if an exception is raised while parsing;
            the error is logged rather than propagated.
    """
    max_per_category = 2
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    wikipedia_logo = "https://en.wikipedia.org/static/images/icons/wikipedia.png"

    # Dicts are used as insertion-ordered sets: they drop duplicates like a set
    # but keep document order, so the returned lists are identical on every run
    # (a plain set of strings iterates in a hash-dependent order).
    urls = {
        'images': {},
        'content': {},
        'related_pages': {}
    }

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Image URLs from <img> tags
        for img in soup.find_all('img'):
            if len(urls['images']) >= max_per_category:
                break
            src = img.get('src')
            if not src:
                continue
            # Resolve URLs that start with '/' against the base URL
            if src.startswith('/'):
                src = urljoin(base_url, src)
            # Skip the Wikipedia logo
            if src == wikipedia_logo:
                continue
            src_lower = src.lower()
            # Substring match: the extension may appear anywhere in the URL
            if any(ext in src_lower for ext in image_extensions):
                urls['images'][src] = None

        # 2. <meta> content values that point at upload.wikimedia.org
        for meta in soup.find_all('meta'):
            if len(urls['content']) >= max_per_category:
                break
            content = meta.get('content', '')
            if 'upload.wikimedia.org' in content:
                urls['content'][content] = None

        # 3. Links to wiki pages whose href mentions Puerto_Rico
        for link in soup.find_all('a'):
            if len(urls['related_pages']) >= max_per_category:
                break
            href = link.get('href', '')
            if 'Puerto_Rico' not in href:
                continue
            if href.startswith('/wiki/'):
                urls['related_pages'][urljoin(base_url, href)] = None
            elif 'wikipedia.org' in href:
                urls['related_pages'][href] = None

        # Plain lists so the result is JSON serializable
        return {category: list(found) for category, found in urls.items()}

    except Exception as e:
        logging.error(f"Error parsing HTML: {str(e)}")
        return {k: [] for k in ['images', 'content', 'related_pages']}

def process_municipality_files(municipalities_path):
    """
    Process all municipality '.txt' files in a directory and extract their URLs

    Files are visited in sorted name order, so the returned dictionary has the
    same key order on every run. A file that cannot be read or processed is
    reported with a printed message and skipped.

    Args:
        municipalities_path (str): Path to the municipalities directory

    Returns:
        dict: Dictionary with municipality names (file name without the
            '.txt' suffix) as keys and the result of extract_urls_from_html
            as values
    """
    suffix = '.txt'
    municipality_urls = {}

    # os.listdir() order is arbitrary; sorting makes the result reproducible.
    for filename in sorted(os.listdir(municipalities_path)):
        if not filename.endswith(suffix):
            continue

        # Strip only the trailing suffix; str.replace would also remove a
        # '.txt' that occurs in the middle of the name.
        municipality_name = filename[:-len(suffix)]
        try:
            with open(os.path.join(municipalities_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
            urls = extract_urls_from_html(content)
            municipality_urls[municipality_name] = urls
            print(f"Processed {municipality_name}: Found {sum(len(v) for v in urls.values())} URLs")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    return municipality_urls

def create_urls_dataframe(municipality_urls):
    """
    Convert the municipality URLs dictionary into a pandas DataFrame

    Each URL appears at most once in the whole result: it is attributed to the
    first municipality and category (images, then content, then related pages)
    that yields it, and later occurrences are dropped.

    Args:
        municipality_urls (dict): Maps each municipality name to a dict with
            the keys 'images', 'content' and 'related_pages', each holding an
            iterable of URLs

    Returns:
        pd.DataFrame: One row per unique URL with the columns
            'municipality_name', 'url_type' ('image', 'content' or
            'related_page') and 'url'. The columns are present even when
            there are no rows.
    """
    columns = ['municipality_name', 'url_type', 'url']
    # (key in the input dict, value written to the 'url_type' column)
    categories = (
        ('images', 'image'),
        ('content', 'content'),
        ('related_pages', 'related_page'),
    )

    rows = []
    # NOTE(review): de-duplication is global, so a URL shared by several
    # municipalities is kept only for the first one - confirm this is intended.
    seen_urls = set()

    for municipality, urls in municipality_urls.items():
        for category, url_type in categories:
            for url in urls[category]:
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                rows.append({
                    'municipality_name': municipality,
                    'url_type': url_type,
                    'url': url
                })

    # Passing the columns explicitly keeps the schema stable for empty input,
    # so downstream code can still select df['url_type'] without a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Extract the URLs from every municipality file in the directory
municipalities_path = 'data/municipalities_extracted'
municipality_urls = process_municipality_files(municipalities_path=municipalities_path)

# Flatten the per-municipality URL dictionary into one DataFrame
urls_df = create_urls_dataframe(municipality_urls=municipality_urls)

# Optional: persist the DataFrame as CSV (currently disabled)
#urls_df.to_csv('data/municipality_urls.csv', index=False)



In [None]:
# Preview the first five rows of the URL table
urls_df.head(n=5)

In [None]:
def create_structured_dataframe(municipalities_df_enhanced: pd.DataFrame, urls_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a structured DataFrame combining all municipality information

    Side effects: writes 'data/municipalities_structured.csv' (flattened) and
    'data/municipalities_structured.json' (nested), creating the 'data'
    directory if it does not exist.

    Args:
        municipalities_df_enhanced: DataFrame with enhanced municipality info.
            Must provide the columns 'municipality_name', 'latitude',
            'longitude', 'content', 'coordinates_valid' and 'google_maps_url'.
        urls_df: DataFrame with the columns 'municipality_name', 'url_type'
            and 'url'. Only rows whose url_type is 'image' or 'content' are used.

    Returns:
        pd.DataFrame: One row per municipality with the columns
            'municipality_name', 'coordinates', 'summary', 'images',
            'metadata' and 'text_for_embedding'
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    output_dir = 'data'

    # Index the candidate image URLs by municipality once, instead of
    # re-filtering the whole URL frame for every municipality row.
    candidate_rows = urls_df[urls_df['url_type'].isin(['image', 'content'])]
    urls_by_municipality = {}
    for name, url in zip(candidate_rows['municipality_name'], candidate_rows['url']):
        urls_by_municipality.setdefault(name, []).append(url)

    structured_data = []

    for _, row in municipalities_df_enhanced.iterrows():
        # Keep only string URLs that contain a known image extension
        image_candidates = [
            url
            for url in urls_by_municipality.get(row['municipality_name'], [])
            if isinstance(url, str) and any(ext in url.lower() for ext in image_extensions)
        ]
        # dict.fromkeys removes duplicates while preserving order; a set would
        # reorder the URLs differently from run to run.
        municipality_images = list(dict.fromkeys(image_candidates))

        # Join the fields with blank lines. Building this text from an indented
        # triple-quoted f-string would embed the source indentation in every
        # line of the text that is later sent for embedding.
        text_for_embedding = "\n\n".join([
            f"Municipality: {row['municipality_name']}",
            f"Location: Latitude {row['latitude']}, Longitude {row['longitude']}",
            f"Description: {row['content']}",
            f"Images Available: {len(municipality_images)}",
            f"Google Maps: {row['google_maps_url']}",
        ]).strip()

        structured_data.append({
            'municipality_name': row['municipality_name'],
            'coordinates': {
                'latitude': row['latitude'],
                'longitude': row['longitude']
            },
            'summary': row['content'],  # GPT-generated summary (per the original author's note)
            'images': municipality_images,
            'metadata': {
                'has_images': len(municipality_images) > 0,
                'coordinates_valid': row['coordinates_valid'],
                'google_maps_url': row['google_maps_url']
            },
            'text_for_embedding': text_for_embedding
        })

    # Convert to DataFrame
    structured_df = pd.DataFrame(structured_data)

    # Flattened version (no nested dicts) for easy viewing as CSV
    flat_data = [
        {
            'municipality_name': entry['municipality_name'],
            'latitude': entry['coordinates']['latitude'],
            'longitude': entry['coordinates']['longitude'],
            'summary': entry['summary'],
            'image_urls': entry['images'],  # Keep as list
            'has_images': entry['metadata']['has_images'],
            'coordinates_valid': entry['metadata']['coordinates_valid'],
            'google_maps_url': entry['metadata']['google_maps_url'],
            'text_for_embedding': entry['text_for_embedding']
        }
        for entry in structured_data
    ]

    # Make sure the output directory exists before writing either file
    os.makedirs(output_dir, exist_ok=True)

    pd.DataFrame(flat_data).to_csv(f'{output_dir}/municipalities_structured.csv', index=False)

    # Save as JSON (preserving nested structure)
    with open(f'{output_dir}/municipalities_structured.json', 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, ensure_ascii=False, indent=2)

    return structured_df

# Build the structured DataFrame (this call also writes the CSV and JSON files)
structured_df = create_structured_dataframe(municipalities_df_enhanced, urls_df)

# Show the overall shape, then the first municipality as a sample
print("\nStructured DataFrame Shape:", structured_df.shape)
print("\nSample of first municipality's images:")
first_municipality = structured_df.iloc[0]
first_images = first_municipality['images']
print(f"\nMunicipality: {first_municipality['municipality_name']}")
print(f"Number of images: {len(first_images)}")
print("\nImage URLs:")
for url in first_images:
    print(f"- {url}")

# Summary statistics over all municipalities
image_counts = structured_df['images'].map(len)
municipalities_with_images = sum(meta['has_images'] for meta in structured_df['metadata'])
print("\nStatistics:")
print(f"Total municipalities: {len(structured_df)}")
print(f"Average images per municipality: {image_counts.mean():.1f}")
print(f"Municipalities with images: {municipalities_with_images}")

In [None]:
# Preview the first five rows of the structured table
structured_df.head(n=5)

# ORGANIZE DATA FOR VECTORDB

In [None]:
# NOTE(review): an earlier cell also defines create_structured_dataframe; this
# later definition is the one in effect after a full run. Consider keeping a
# single definition.
def create_structured_dataframe(municipalities_df_enhanced: pd.DataFrame, urls_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a structured DataFrame combining all municipality information

    Side effects: writes 'data/municipalities_structured.csv' (flattened) and
    'data/municipalities_structured.json' (nested), creating the 'data'
    directory if it does not exist.

    Args:
        municipalities_df_enhanced: DataFrame with enhanced municipality info.
            Must provide the columns 'municipality_name', 'latitude',
            'longitude', 'content', 'coordinates_valid' and 'google_maps_url'.
        urls_df: DataFrame with the columns 'municipality_name', 'url_type'
            and 'url'. Only rows whose url_type is 'image' or 'content' are used.

    Returns:
        pd.DataFrame: One row per municipality with the columns
            'municipality_name', 'coordinates', 'summary', 'images',
            'metadata' and 'text_for_embedding'
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    output_dir = 'data'

    # Index the candidate image URLs by municipality once, instead of
    # re-filtering the whole URL frame for every municipality row.
    candidate_rows = urls_df[urls_df['url_type'].isin(['image', 'content'])]
    urls_by_municipality = {}
    for name, url in zip(candidate_rows['municipality_name'], candidate_rows['url']):
        urls_by_municipality.setdefault(name, []).append(url)

    structured_data = []

    for _, row in municipalities_df_enhanced.iterrows():
        # Keep only string URLs that contain a known image extension
        image_candidates = [
            url
            for url in urls_by_municipality.get(row['municipality_name'], [])
            if isinstance(url, str) and any(ext in url.lower() for ext in image_extensions)
        ]
        # dict.fromkeys removes duplicates while preserving order; a set would
        # reorder the URLs differently from run to run.
        municipality_images = list(dict.fromkeys(image_candidates))

        # Join the fields with blank lines. Building this text from an indented
        # triple-quoted f-string would embed the source indentation in every
        # line of the text that is later sent for embedding.
        text_for_embedding = "\n\n".join([
            f"Municipality: {row['municipality_name']}",
            f"Location: Latitude {row['latitude']}, Longitude {row['longitude']}",
            f"Description: {row['content']}",
            f"Images Available: {len(municipality_images)}",
            f"Google Maps: {row['google_maps_url']}",
        ]).strip()

        structured_data.append({
            'municipality_name': row['municipality_name'],
            'coordinates': {
                'latitude': row['latitude'],
                'longitude': row['longitude']
            },
            'summary': row['content'],  # GPT-generated summary (per the original author's note)
            'images': municipality_images,
            'metadata': {
                'has_images': len(municipality_images) > 0,
                'coordinates_valid': row['coordinates_valid'],
                'google_maps_url': row['google_maps_url']
            },
            'text_for_embedding': text_for_embedding
        })

    # Convert to DataFrame
    structured_df = pd.DataFrame(structured_data)

    # Flattened version (no nested dicts) for easy viewing as CSV
    flat_data = [
        {
            'municipality_name': entry['municipality_name'],
            'latitude': entry['coordinates']['latitude'],
            'longitude': entry['coordinates']['longitude'],
            'summary': entry['summary'],
            'image_urls': entry['images'],  # Keep as list
            'has_images': entry['metadata']['has_images'],
            'coordinates_valid': entry['metadata']['coordinates_valid'],
            'google_maps_url': entry['metadata']['google_maps_url'],
            'text_for_embedding': entry['text_for_embedding']
        }
        for entry in structured_data
    ]

    # Make sure the output directory exists before writing either file
    os.makedirs(output_dir, exist_ok=True)

    pd.DataFrame(flat_data).to_csv(f'{output_dir}/municipalities_structured.csv', index=False)

    # Save as JSON (preserving nested structure)
    with open(f'{output_dir}/municipalities_structured.json', 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, ensure_ascii=False, indent=2)

    return structured_df

# Build the structured DataFrame (this call also writes the CSV and JSON files)
structured_df = create_structured_dataframe(municipalities_df_enhanced, urls_df)

# Show the overall shape, then the first municipality as a sample
print("\nStructured DataFrame Shape:", structured_df.shape)
print("\nSample of first municipality's images:")
first_municipality = structured_df.iloc[0]
first_images = first_municipality['images']
print(f"\nMunicipality: {first_municipality['municipality_name']}")
print(f"Number of images: {len(first_images)}")
print("\nImage URLs:")
for url in first_images:
    print(f"- {url}")

# Summary statistics over all municipalities
image_counts = structured_df['images'].map(len)
municipalities_with_images = sum(meta['has_images'] for meta in structured_df['metadata'])
print("\nStatistics:")
print(f"Total municipalities: {len(structured_df)}")
print(f"Average images per municipality: {image_counts.mean():.1f}")
print(f"Municipalities with images: {municipalities_with_images}")