In [188]:
import requests
import json
import time
from pathlib import Path
from datetime import datetime
from collections import deque
import threading

In [189]:
import time
import threading
from collections import deque

class RateLimiter:
    """
    Makes sure the extraction of the API is following the rules 
    - Max 10 requests per second 
    - Max 1000 requests per hour
    - Max 20 concurrent requests 
    """
    def __init__(self, max_per_second=10, max_per_hour=1000): 
        self.max_per_second = max_per_second  # Stores the maximum requests per second
        self.max_per_hour = max_per_hour
        self.requests_last_second = deque()  # Track requests in the last second
        self.requests_last_hour = deque()  # Track requests in the last hour
        self.lock = threading.Lock()  # Lock for thread safety
        self.min_delay = 1.0 / max_per_second  # Minimum delay between requests (100ms = 10 req/s)
        self.last_request_time = 0
    
    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        with self.lock:
            current_time = time.time()
            
            # Clean old entries from last second tracking
            while self.requests_last_second and \
                current_time - self.requests_last_second[0] > 1.0:
                self.requests_last_second.popleft()
            
            # Clean old entries from last hour tracking
            while self.requests_last_hour and \
                  current_time - self.requests_last_hour[0] > 3600:
                  self.requests_last_hour.popleft()
            
            # Check hourly limit
            if len(self.requests_last_hour) >= self.max_per_hour:
                oldest_request = self.requests_last_hour[0]  # Calculate how long to wait
                wait_time = 3600 - (current_time - oldest_request)
                if wait_time > 0:
                    print(f"\n  Hourly limit reached ({self.max_per_hour} req/hour)")
                    print(f" Waiting {wait_time:.0f} seconds before continuing...")
                    time.sleep(wait_time + 1)
                    current_time = time.time()
                    
                    # Clean after waiting
                    while self.requests_last_hour and \
                          current_time - self.requests_last_hour[0] > 3600:
                          self.requests_last_hour.popleft()
            
            # Check per-second limit
            if len(self.requests_last_second) >= self.max_per_second:
                wait_time = 1.0 - (current_time - self.requests_last_second[0])
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()
            
            # Ensure minimum delay between requests
            time_since_last = current_time - self.last_request_time
            if time_since_last < self.min_delay:
                time.sleep(self.min_delay - time_since_last)
                current_time = time.time()
            
            # Record this request
            self.requests_last_second.append(current_time)
            self.requests_last_hour.append(current_time)
            self.last_request_time = current_time
    
    def get_stats(self):
        """Has the purpose to see how many requests have been made recently 
           Monitorise when the requests are getting close to the limit"""
        
        with self.lock:
            current_time = time.time()
            
            # Count requests in last second
            recent_second = sum(1 for t in self.requests_last_second
                              if current_time - t <= 1.0)
            
            # Count requests in last hour
            recent_hour = sum(1 for t in self.requests_last_hour
                              if current_time - t <= 3600)
            
            return {
                'requests_last_second': recent_second,
                'requests_last_hour': recent_hour,
                'max_per_second': self.max_per_second,
                'max_per_hour': self.max_per_hour
            }

import requests
import json
import time
import threading
from pathlib import Path
from datetime import datetime
from collections import deque

class DATAtourismeDownloader:
    '''Has all neccesary functions that fetch api data.
       The combination of the functions are supposed to work for different context'''
    
    def __init__(self, api_key, output_dir="datatourisme_data"):  # initialization method 
        self.api_key = api_key  # mandatory
        self.base_url = "https://api.datatourisme.fr/v1/catalog"
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        self.rate_limiter = RateLimiter(max_per_second=10, max_per_hour=1000)  # Rate limiter: 10 req/s sustained, 1000 req/hour
        
        self.total_downloaded = 0   # Stats
        self.current_page = 0
        self.total_pages = 0
        self.start_time = None
        self.requests_made = 0
    
    def fetch_page(self, cursor=None):  # method to fetch one page from the API
        """Fetch a single page of data with rate limiting"""
        self.rate_limiter.wait_if_needed()    # Wait if needed to respect rate limits 
        params = {
            "api_key": self.api_key
        }
        
        if cursor:
            params["crs"] = cursor
        
        try:
            response = requests.get(self.base_url, params=params, timeout=30)  # gets the http API request from the endpoint URL 
            response.raise_for_status()
            self.requests_made += 1
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"\nError fetching page: {e}")
            return None
    
    def save_batch(self, objects, batch_num):
        """Save a batch of objects to a JSON file"""
        filename = self.output_dir / f"batch_{batch_num:06d}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(objects, f, ensure_ascii=False, indent=2)
        print(f"✅ Saved batch {batch_num} with {len(objects)} objects to {filename}")
    
    def print_progress(self, page, total_records):
        """Print detailed progress information"""
        progress = (self.total_downloaded / total_records * 100) if total_records > 0 else 0
        
        if self.start_time:
            elapsed = time.time() - self.start_time   # Calculate time statistics
            rate = self.requests_made / elapsed if elapsed > 0 else 0
            
            pages_remaining = self.total_pages - self.current_page  # Estimate remaining time
            if rate > 0:
                estimated_remaining = pages_remaining / rate
                hours = int(estimated_remaining // 3600)
                minutes = int((estimated_remaining % 3600) // 60)
                seconds = int(estimated_remaining % 60)
                eta = f"{hours}h {minutes}m {seconds}s"
            else:
                eta = "calculating..."
        else:
            rate = 0
            eta = "calculating..."
        
        limiter_stats = self.rate_limiter.get_stats()  # Get rate limiter stats
        
        print(f"Progress: {self.total_downloaded:,}/{total_records:,} ({progress:.2f}%) | "
              f"Page {self.current_page:,}/{self.total_pages:,}")
        print(f"Rate: {rate:.2f} req/s | "
              f"Recent: {limiter_stats['requests_last_second']}/10 per sec, "
              f"{limiter_stats['requests_last_hour']}/1000 per hour")
        print(f"ETA: {eta}")
    
    def download_all(self, batch_size=100):
        """
        Download all data from the API with rate limiting
        
        Args:
            batch_size: Number of pages to group into one file
        
        Rate Limits Applied:
            - Max 10 requests per second 
            - Max 1000 requests per hour
            - Sequential requests 
        """
        cursor = None
        page = 1
        batch_num = 1
        current_batch = []
        
        self.start_time = time.time()
        
        print("=" * 70)
        print("DATAtourisme Data Downloader")
        print("=" * 70)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Output: {self.output_dir.absolute()}")
        print(f"\n Rate Limits:")
        print(f"   • Max 10 requests/second (sustained)")
        print(f"   • Max 1000 requests/hour")
        print(f"   • Sequential requests (no concurrency)")
        print("=" * 70)
        
        while True:
            print(f"\n Fetching page {page}...", end=" ")
            
            data = self.fetch_page(cursor)
            
            if not data or "objects" not in data:
                print("\n No more data or error occurred")
                break
            
            objects = data["objects"]
            meta = data.get("meta", {})
            
            self.total_pages = meta.get("total_pages", 0)   # Update stats
            self.current_page = meta.get("page", page)
            total_records = meta.get("total", 0)
            
            print(f"✓ Received {len(objects)} objects")
            
            current_batch.extend(objects)  # Add to current batch
            self.total_downloaded += len(objects)
            
            if page % batch_size == 0:  # Save batch if we've reached batch_size pages
                self.save_batch(current_batch, batch_num)
                current_batch = []
                batch_num += 1
            
            if page % 10 == 0:  # Print detailed progress every 10 pages
                self.print_progress(page, total_records)
            
            next_cursor = meta.get("next")  # Check if there's a next page
            if not next_cursor:
                print("\n Reached the end of data")
                break
            
            if "crs=" in next_cursor:  # Extract cursor from next URL
                cursor = next_cursor.split("crs=")[1].split("&")[0]
            else:
                print("\n Could not find cursor in next URL")
                break
            
            page += 1
        
        if current_batch:  # Save any remaining objects
            self.save_batch(current_batch, batch_num)
        
        self.print_final_stats(total_records)    # Final statistics
    
    def print_final_stats(self, total_records):
        """Print final download statistics"""
        elapsed = time.time() - self.start_time
        hours = int(elapsed // 3600)
        minutes = int((elapsed % 3600) // 60)
        seconds = int(elapsed % 60)
        
        avg_rate = self.requests_made / elapsed if elapsed > 0 else 0
        
        print("\n" + "=" * 70)
        print("DOWNLOAD COMPLETED")
        print("=" * 70)
        print(f"Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Duration: {hours}h {minutes}m {seconds}s")
        print(f"Total objects: {self.total_downloaded:,} / {total_records:,}")
        print(f"Total pages: {self.current_page:,} / {self.total_pages:,}")
        print(f"Total requests: {self.requests_made:,}")
        print(f"Average rate: {avg_rate:.2f} req/s")
        print(f"Output directory: {self.output_dir.absolute()}")
        print("=" * 70)
        
        self.save_metadata(total_records, elapsed)   # Save metadata
    
    def save_metadata(self, total_records, elapsed_time):
        """Save download metadata"""
        metadata = {
            "download_date": datetime.now().isoformat(),
            "duration_seconds": elapsed_time,
            "total_downloaded": self.total_downloaded,
            "total_pages_processed": self.current_page,
            "total_records_in_api": total_records,
            "total_requests": self.requests_made,
            "average_rate_per_second": self.requests_made / elapsed_time if elapsed_time > 0 else 0,
            "api_endpoint": self.base_url,
            "rate_limits": {
                "max_per_second": 10,
                "max_per_hour": 1000,
                "max_concurrent": 1
            }
        }
        
        metadata_file = self.output_dir / "metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
        print(f" Metadata saved to {metadata_file}")


def main():
    API_KEY = "98bf037a-4b11-4645-aa93-ec008b57f911"
    
    downloader = DATAtourismeDownloader(
        api_key=API_KEY,
        output_dir="datatourisme_data"
    )
    
    downloader.download_all(batch_size=100)  # Download all data with rate limiting

if __name__ == "__main__":
    main()

def update_datatourisme(api_key: str, output_dir: str = "datatourisme_data"):
    """
    Fetch only new/updated records from DataTourisme API
    
    Args:
        api_key:'98bf037a-4b11-4645-aa93-ec008b57f911'
        output_dir:"C:\Users\Vivobook\datatourisme_data"
        
    Returns:
        Number of new objects downloaded
    """
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # 1. Load existing UUIDs
    print(" Loading existing data...")
    existing_uuids = set()
    batch_files = sorted(output_path.glob("batch_*.json"))
    
    for bf in batch_files:
        with open(bf, 'r', encoding='utf-8') as f:
            objects = json.load(f)
            for obj in objects:
                existing_uuids.add(obj.get('uuid'))
    
    print(f" Found {len(existing_uuids):,} existing objects")
    
    # 2. Fetch new data
    print(" Checking for updates...")
    base_url = "https://api.datatourisme.fr/v1/catalog"
    new_objects = []
    cursor = None
    page = 0
    
    while True:
        # API request
        params = {"api_key": api_key}
        if cursor:
            params["crs"] = cursor
        
        try:
            response = requests.get(base_url, params=params, timeout=60)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            print(f" Error: {e}")
            break
        
        # Check for new objects
        objects = data.get("objects", [])
        for obj in objects:
            uuid = obj.get('uuid')
            if uuid not in existing_uuids:
                new_objects.append(obj)
                existing_uuids.add(uuid)
        
        page += 1
        if page % 10 == 0:
            print(f"   Page {page} - Found {len(new_objects)} new objects...")
        
        # Check if done
        meta = data.get("meta", {})
        if not meta.get("next") or "crs=" not in meta.get("next", ""):
            break
        
        cursor = meta["next"].split("crs=")[1].split("&")[0]
        time.sleep(0.125)  # Rate limit: 8 requests/second
    
    # 3. Save new data
    if new_objects:
        next_batch = len(batch_files) + 1
        filename = output_path / f"batch_{next_batch:06d}.json"
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(new_objects, f, ensure_ascii=False, indent=2)
        
        print(f"\n Saved {len(new_objects)} new objects to {filename.name}")
    else:
        print("\n No new updates found. Data is up to date!")
    
    return len(new_objects)


# Simple usage
if __name__ == "__main__":
    API_KEY = "98bf037a-4b11-4645-aa93-ec008b57f911"
    
    new_count = update_datatourisme(API_KEY)
    print(f"\n Summary: {new_count} new objects added")

In [192]:
import pandas as pd
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [193]:
import requests
import time
import json
from pathlib import Path

API_BASE_URL = "https://api.datatourisme.fr/v1"
API_KEY = "98bf037a-4b11-4645-aa93-ec008b57f911"

def fetch_thesaurus(code):
    all_items, cursor = [], None
    while True:
        params = {'api_key': API_KEY}
        if cursor: 
            params['crs'] = cursor
        
        r = requests.get(f"{API_BASE_URL}/thesaurus/{code}", params=params, timeout=30)
        if r.status_code != 200: 
            break
        
        data = r.json()
        if not data.get('objects'): 
            break
        
        all_items.extend(data['objects'])
        
        next_url = data.get('meta', {}).get('next', '')
        if 'crs=' not in next_url: 
            break
        
        cursor = next_url.split('crs=')[1].split('&')[0]
        time.sleep(0.1)
    
    return all_items

print("PHASE 1: Fetching Thesaurus\n")

thesaurus_codes = ['Rating', 'PointOfInterestClass', 'Theme', 'Amenity']
thesaurus_data = {}

for code in thesaurus_codes:
    thesaurus_data[code] = fetch_thesaurus(code)
    print(f" {code}: {len(thesaurus_data[code])} items")

# Save thesaurus
with open('thesaurus_vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(thesaurus_data, f, ensure_ascii=False, indent=2)

# Extract valid keys
valid_rating_keys = {item['key'] for item in thesaurus_data['Rating']}
valid_poi_keys = {item['key'] for item in thesaurus_data['PointOfInterestClass']}
valid_theme_keys = {item['key'] for item in thesaurus_data['Theme']}
valid_amenity_keys = {item['key'] for item in thesaurus_data['Amenity']}

print(f"\n Thesaurus saved\n")


PHASE 1: Fetching Thesaurus

 Rating: 206 items
 PointOfInterestClass: 384 items
 Theme: 758 items
 Amenity: 312 items

 Thesaurus saved



In [194]:
def load_all_batches(data_dir="datatourisme_data"):
    """Load all JSON batch files"""
    all_objects = []
    batch_files = sorted(Path(data_dir).glob("batch_*.json"))
    
    print(f" Loading {len(batch_files)} batch files...")
    
    for batch_file in tqdm(batch_files, desc="Loading batches"):
        with open(batch_file, 'r', encoding='utf-8') as f:
            objects = json.load(f)
            all_objects.extend(objects)
    
    print(f" Loaded {len(all_objects):,} total objects\n")
    return all_objects

In [195]:
def safe_get(obj, key, lang='@fr'):
    """Safely extract multilingual fields with robust handling"""
    if not obj:
        return None
    
    if isinstance(obj, str):   # If obj is already a string, return it
        return obj
    
    if isinstance(obj, list):  # If obj is a list, try first element
        if not obj:
            return None
        first = obj[0]
        if isinstance(first, dict):
            return first.get(lang)
        return str(first)
    
    if isinstance(obj, dict):  # If obj is a dict, get the language key
        return obj.get(lang)
    
    return None


In [196]:
def create_relational_dataframes(objects):
    """Convert nested JSON to relational dataframes"""
    
    print(" Building relational dataframes...\n")
    
  
    # 1. MAIN OBJECTS TABLE
  
    print("1️  Creating main objects table...")
    main_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        label = obj.get('label', {})
        main_data.append({
            'uuid': obj.get('uuid'),
            'label_fr': safe_get(label, 'label', '@fr'),
            'label_en': safe_get(label, 'label', '@en'),
            'last_update': obj.get('lastUpdate'),
            'last_update_datatourisme': obj.get('lastUpdateDatatourisme'),
            'uri': obj.get('uri'),
        })
    
    df_main = pd.DataFrame(main_data)
    print(f"   ✓ Created: {len(df_main):,} rows\n")
    
    # 2. TYPES TABLE (many-to-many)
    
    print("2️  Creating types table...")
    types_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        for type_name in obj.get('type', []):
            types_data.append({
                'uuid': uuid,
                'type_name': type_name
            })
    
    df_types = pd.DataFrame(types_data)  
    
    # 3. LOCATIONS TABLE
   
    print("3️  Creating locations table...")
    locations_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        locations = obj.get('isLocatedAt', [])
        
        for loc in locations:
            geo = loc.get('geo', {})
            addresses = loc.get('address', [])
            
            if addresses:
                for addr in addresses:
                    city_info = addr.get('hasAddressCity', {})
                    city_label = city_info.get('label', {})
                    
                    locations_data.append({
                        'uuid': uuid,
                        'latitude': geo.get('latitude'),
                        'longitude': geo.get('longitude'),
                        'elevation': geo.get('elevation'),
                        'street_address': ', '.join(addr.get('streetAddress', [])) if addr.get('streetAddress') else None,
                        'postal_code': addr.get('postalCode'),
                        'city': addr.get('addressLocality'),
                        'city_label_fr': safe_get(city_label, 'label', '@fr'),
                        'city_label_en': safe_get(city_label, 'label', '@en'),
                        'city_insee': city_info.get('insee'),
                    })
            else:
                # Location without address
                locations_data.append({
                    'uuid': uuid,
                    'latitude': geo.get('latitude'),
                    'longitude': geo.get('longitude'),
                    'elevation': geo.get('elevation'),
                    'street_address': None,
                    'postal_code': None,
                    'city': None,
                    'city_label_fr': None,
                    'city_label_en': None,
                    'city_insee': None,
                })
    
    df_locations = pd.DataFrame(locations_data)
   
    # 4. DESCRIPTIONS TABLE
    
    print("4️  Creating descriptions table...")
    descriptions_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        descriptions = obj.get('hasDescription', [])
        
        if descriptions:
            for desc in descriptions:
                full_desc = desc.get('description', {})
                short_desc = desc.get('shortDescription', {})
                
                descriptions_data.append({
                    'uuid': uuid,
                    'description_fr': safe_get(full_desc, 'desc', '@fr'),
                    'description_en': safe_get(full_desc, 'desc', '@en'),
                    'short_description_fr': safe_get(short_desc, 'short', '@fr'),
                    'short_description_en': safe_get(short_desc, 'short', '@en'),
                })
    
    df_descriptions = pd.DataFrame(descriptions_data)
    
    # 5. CONTACTS TABLE
    
    print("5️  Creating contacts table...")
    contacts_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        contacts = obj.get('hasContact', [])
        
        for idx, contact in enumerate(contacts):
            phones = contact.get('telephone', [])
            emails = contact.get('email', [])
            homepages = contact.get('homepage', [])
            
            # Ensure lists
            if not isinstance(phones, list):
                phones = [phones] if phones else []
            if not isinstance(emails, list):
                emails = [emails] if emails else []
            if not isinstance(homepages, list):
                homepages = [homepages] if homepages else []
            
            contacts_data.append({
                'uuid': uuid,
                'contact_id': idx,
                'legal_name': contact.get('legalName'),
                'telephone': phones[0] if phones else None,
                'telephone_all': ' | '.join(str(p) for p in phones if p) if phones else None,
                'email': emails[0] if emails else None,
                'email_all': ' | '.join(str(e) for e in emails if e) if emails else None,
                'homepage': homepages[0] if homepages else None,
                'homepage_all': ' | '.join(str(h) for h in homepages if h) if homepages else None,
                'identifier': contact.get('identifier'),
            })
    
    df_contacts = pd.DataFrame(contacts_data)
   
    # 6. CREATORS TABLE
    
    print("6️  Creating creators table...")
    creators_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        creator = obj.get('hasBeenCreatedBy', {})
        
        if creator:
            emails = creator.get('email', [])
            homepages = creator.get('homepage', [])
            phones = creator.get('telephone', [])
            
            # Ensure lists
            if not isinstance(emails, list):
                emails = [emails] if emails else []
            if not isinstance(homepages, list):
                homepages = [homepages] if homepages else []
            if not isinstance(phones, list):
                phones = [phones] if phones else []
            
            creators_data.append({
                'uuid': uuid,
                'legal_name': creator.get('legalName'),
                'identifier': creator.get('identifier'),
                'email': emails[0] if emails else None,
                'email_all': ' | '.join(str(e) for e in emails if e) if emails else None,
                'homepage': homepages[0] if homepages else None,
                'homepage_all': ' | '.join(str(h) for h in homepages if h) if homepages else None,
                'telephone': phones[0] if phones else None,
                'logo': creator.get('logo'),
            })
    
    df_creators = pd.DataFrame(creators_data)

    # 7. IMAGES TABLE

    print("7️  Creating images table...")
    images_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        main_rep = obj.get('hasMainRepresentation', [])
        
        for rep in main_rep:
            annotations = rep.get('hasAnnotation', [])
            resources = rep.get('hasRelatedResource', [])
            
            for annotation in annotations:
                title = annotation.get('title', {})
                for resource in resources:
                    locators = resource.get('locator', [])
                    mime_types = resource.get('hasMimeType', [{}])
                    
                    if not isinstance(locators, list):
                        locators = [locators] if locators else []
                    
                    for locator in locators:
                        images_data.append({
                            'uuid': uuid,
                            'image_url': locator,
                            'title_fr': safe_get(title, 'title', '@fr'),
                            'title_en': safe_get(title, 'title', '@en'),
                            'credits': ', '.join(annotation.get('credits', [])) if annotation.get('credits') else None,
                            'mime_type': safe_get(mime_types[0].get('label', {}), 'type', '@fr') if mime_types else None,
                            'is_covered_by': annotation.get('isCoveredBy'),
                            'rights_start_date': annotation.get('rightsStartDate'),
                            'rights_end_date': annotation.get('rightsEndDate'),
                        })
    
    df_images = pd.DataFrame(images_data)
    
    # 8. REVIEWS/RATINGS TABLE
    
    print("8️  Creating reviews table...")
    reviews_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        reviews = obj.get('hasReview', [])
        
        for review in reviews:
            review_value = review.get('hasReviewValue', {})
            label = review_value.get('label', {})
            
            reviews_data.append({
                'uuid': uuid,
                'rating_label_fr': safe_get(label, 'rating', '@fr'),
                'rating_label_en': safe_get(label, 'rating', '@en'),
                'rating_key': review_value.get('key'),
                'is_compliant_with': ', '.join(review_value.get('isCompliantWith', [])) if review_value.get('isCompliantWith') else None,
            })
    
    df_reviews = pd.DataFrame(reviews_data)

    # 9. COVID MEASURES TABLE

    print("9  Creating COVID measures table...")
    covid_data = []
    
    for obj in tqdm(objects, desc="   Processing"):
        uuid = obj.get('uuid')
        covid = obj.get('COVID19SpecialMeasures', {})
        
        if covid:
            covid_data.append({
                'uuid': uuid,
                'measures_fr': safe_get(covid, 'covid', '@fr'),
                'measures_en': safe_get(covid, 'covid', '@en'),
            })
    
    df_covid = pd.DataFrame(covid_data) if covid_data else pd.DataFrame(columns=['uuid', 'measures_fr', 'measures_en'])
    
    return {
        'main': df_main,
        'types': df_types,
        'locations': df_locations,
        'descriptions': df_descriptions,
        'contacts': df_contacts,
        'creators': df_creators,
        'images': df_images,
        'reviews': df_reviews,
        'covid': df_covid,
    }



In [197]:
# MAIN EXECUTION
import json
from pathlib import Path
print("="*70)
print("BUILDING RELATIONAL DATAFRAMES FROM JSON")
print("="*70)
print()

all_objects = load_all_batches("datatourisme_data")  # Step 1: Load all data

dataframes = create_relational_dataframes(all_objects)  # Step 2: Create dataframes

df_main = dataframes['main']   # Step 3: Assign to variables
df_types = dataframes['types']
df_locations = dataframes['locations']
df_descriptions = dataframes['descriptions']
df_contacts = dataframes['contacts']
df_creators = dataframes['creators']
df_images = dataframes['images']
df_reviews = dataframes['reviews']
df_covid = dataframes['covid']

print("="*70)
print("ALL DATAFRAMES CREATED SUCCESSFULLY!")
print("="*70)

BUILDING RELATIONAL DATAFRAMES FROM JSON

 Loading 240 batch files...


Loading batches:   0%|          | 0/240 [00:00<?, ?it/s]

 Loaded 476,528 total objects

 Building relational dataframes...

1️  Creating main objects table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

   ✓ Created: 476,528 rows

2️  Creating types table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

3️  Creating locations table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

4️  Creating descriptions table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

5️  Creating contacts table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

6️  Creating creators table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

7️  Creating images table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

8️  Creating reviews table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

9  Creating COVID measures table...


   Processing:   0%|          | 0/476528 [00:00<?, ?it/s]

ALL DATAFRAMES CREATED SUCCESSFULLY!


In [198]:
thesaurus_file = Path('thesaurus_vocabulary.json')
thesaurus_codes = ['Rating', 'PointOfInterestClass', 'Theme', 'Amenity']
thesaurus_data = {}
    
for code in thesaurus_codes:
        thesaurus_data[code] = fetch_thesaurus(code)
        print(f" {code}: {len(thesaurus_data[code])} items")
    
# Save thesaurus
with open('thesaurus_vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(thesaurus_data, f, ensure_ascii=False, indent=2)
print("\n Thesaurus saved to file")

# Extract valid keys
valid_rating_keys = {item['key'] for item in thesaurus_data['Rating']}
valid_poi_keys = {item['key'] for item in thesaurus_data['PointOfInterestClass']}
valid_theme_keys = {item['key'] for item in thesaurus_data['Theme']}
valid_amenity_keys = {item['key'] for item in thesaurus_data['Amenity']}

print(f"\n Thesaurus loaded:")
print(f"   Rating: {len(valid_rating_keys)} keys")
print(f"   PointOfInterestClass: {len(valid_poi_keys)} keys")
print(f"   Theme: {len(valid_theme_keys)} keys")
print(f"   Amenity: {len(valid_amenity_keys)} keys\n")

print("Cleaning data...\n")

# Clean reviews
dataframes['reviews']['rating_key'] = dataframes['reviews']['rating_key'].str.strip()

# Clean types
dataframes['types']['type_name'] = dataframes['types']['type_name'].str.strip()

print("Validating with thesaurus...\n")

# Validate reviews
before_reviews = len(dataframes['reviews'])
dataframes['reviews'] = dataframes['reviews'][dataframes['reviews']['rating_key'].isin(valid_rating_keys)]
print(f"Reviews: {before_reviews:,} → {len(dataframes['reviews']):,}")

# Validate types
before_types = len(dataframes['types'])
dataframes['types'] = dataframes['types'][dataframes['types']['type_name'].isin(valid_poi_keys)]
print(f"Types: {before_types:,} → {len(dataframes['types']):,}\n")

print(" Cleaning complete\n")

# EXPORT

print("Exporting CSV files...\n")

for name, df in dataframes.items():
    filename = f'clean_{name}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f" {filename}: {len(df):,} rows")

print("\n Complete!")

 Rating: 206 items
 PointOfInterestClass: 384 items
 Theme: 758 items
 Amenity: 312 items

 Thesaurus saved to file

 Thesaurus loaded:
   Rating: 206 keys
   PointOfInterestClass: 384 keys
   Theme: 756 keys
   Amenity: 312 keys

Cleaning data...

Validating with thesaurus...

Reviews: 142,426 → 138,804
Types: 2,497,258 → 2,141,626

 Cleaning complete

Exporting CSV files...

 clean_main.csv: 476,528 rows
 clean_types.csv: 2,141,626 rows
 clean_locations.csv: 476,506 rows
 clean_descriptions.csv: 438,862 rows
 clean_contacts.csv: 544,145 rows
 clean_creators.csv: 476,455 rows
 clean_images.csv: 151,269 rows
 clean_reviews.csv: 138,804 rows
 clean_covid.csv: 5,307 rows

 Complete!


In [201]:
#backup my dataframes
df_main_copy=df_main 
df_type_copy =df_types
df_locations_copy =df_locations 
df_descriptions_copy=df_descriptions
df_contacts_copy=df_contacts
df_creators_copy=df_creators 
df_images_copy=df_images 
df_reviews_copy=df_reviews 
df_covid_copy=df_covid

In [202]:
df_main.head(5)

Unnamed: 0,uuid,label_fr,label_en,last_update,last_update_datatourisme,uri
0,000016ad-998e-3b6a-80b7-25531604fc11,Télégraphe Chappe de Courberon,Chappe de Courberon telegraph,2026-02-03,2026-02-05T07:10:58.046Z,https://data.datatourisme.fr/13/2c29c0aa-bb2f-...
1,000018f6-c7e6-3762-9713-5595c558c900,La roulotte du Château de Lescaut,La roulotte du Château de Lescaut,2026-01-07,2026-01-08T02:18:22.424Z,https://data.datatourisme.fr/23/aa697488-989b-...
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,Camping du Dourdou,,2025-01-17,2026-01-01T07:11:39.306Z,https://data.datatourisme.fr/31/294d4e55-372e-...
3,000041c5-b92b-3236-9768-6e560cea215c,LA GUINGUETTE,,2025-12-12,2026-01-20T04:12:16.093Z,https://data.datatourisme.fr/41/bfd872de-b7e6-...
4,00004466-aa48-3660-8bce-01ff6b03e7a5,Maison Jean HUTTARD,,2026-01-12,2026-01-13T08:44:56.424Z,https://data.datatourisme.fr/5/f331c854-aa0b-3...


In [203]:
# check shape an consistence
print(f"\n Dataset Shape: {df_main.shape}")
print(f"   Rows: {df_main.shape[0]:,}")
print(f"   Columns: {df_main.shape[1]}")


 Dataset Shape: (476528, 6)
   Rows: 476,528
   Columns: 6


In [204]:
print(f"\n Column Names:")
for i, col in enumerate(df_main.columns, 1):
    print(f"   {i}. {col}")


 Column Names:
   1. uuid
   2. label_fr
   3. label_en
   4. last_update
   5. last_update_datatourisme
   6. uri


In [205]:
# Count missing values
missing_counts = df_main.isnull().sum()
missing_percentages = (missing_counts / len(df_main) * 100).round(2)
df_missing = pd.DataFrame({
    'Column': df_main.columns,
    'Missing_Count': missing_counts.values,
    'Missing_Percentage': missing_percentages.values,
    'Non_Missing': len(df_main) - missing_counts.values,
    'Data_Type': df_main.dtypes.values
})

In [206]:
print(df_missing)

                     Column  Missing_Count  Missing_Percentage  Non_Missing  \
0                      uuid              0                0.00       476528   
1                  label_fr             73                0.02       476455   
2                  label_en         267321               56.10       209207   
3               last_update             73                0.02       476455   
4  last_update_datatourisme              0                0.00       476528   
5                       uri              0                0.00       476528   

  Data_Type  
0       str  
1       str  
2       str  
3       str  
4       str  
5       str  


In [207]:
missing_labels = df_main[df_main['label_fr'].isnull()]
print(missing_labels[['uuid', 'label_en', 'uri']])
# we have no english label in order to be able to replace it in the french field, so i decide to leave it how it is.
#in time, i will fetch the homepage contact and extract the name for the 73 missing values 

                                        uuid label_en  \
360941  c21a3ffb-45d9-35ab-a3aa-c91c0395549e      NaN   
361379  c2525785-1e74-3783-875e-823ec964d769      NaN   
361544  c269ecb4-8867-321e-865d-11fb2ba8aaee      NaN   
362292  c2cfa7c2-3cf1-31ba-a653-d80daa0e0553      NaN   
366349  c4e1f412-888b-3493-b649-6d353abf21b1      NaN   
...                                      ...      ...   
470154  fc9910c6-1629-3079-9312-c8fda3572d1d      NaN   
473069  fe2d06a5-cad5-3343-bccf-899ab14a7e72      NaN   
473712  fe859e14-b0a6-38b9-b1bb-e4c22b72faa6      NaN   
474693  ff0dce1f-965a-30eb-97c5-3ae5a0b890d2      NaN   
474892  ff2a4602-e87b-3e51-a08b-35b64ea3c034      NaN   

                                                      uri  
360941  https://data.datatourisme.fr/3/9b4f877e-f427-3...  
361379  https://data.datatourisme.fr/3/26e79db3-8e6f-3...  
361544  https://data.datatourisme.fr/3/6240c26b-a218-3...  
362292  https://data.datatourisme.fr/3/8e37e222-fcb8-3...  
366349  https:/

In [208]:
# check if uuid is unique 
print("\n1️  UUID Uniqueness:")
uuid_count = len(df_main)
unique_uuids = df_main['uuid'].nunique()
print(f"   Total rows: {uuid_count:,}")
print(f"   Unique UUIDs: {unique_uuids:,}")


1️  UUID Uniqueness:
   Total rows: 476,528
   Unique UUIDs: 476,497


In [209]:
 # check if all the uuid have the same length

invalid_uuids = df_main['uuid'].str.len() < 10  # UUIDs should be longer
invalid_count = invalid_uuids.sum()
    
if invalid_count > 0:
        print(f"{invalid_count} potentially invalid UUIDs")
else:
        print(f"All UUIDs have valid length")

All UUIDs have valid length


In [210]:
# check for empty rows
empty_rows = df_main.isnull().all(axis=1).sum()
print(f"   Completely empty rows: {empty_rows}")

   Completely empty rows: 0


In [211]:
# check for data consistency 
print(df_main.dtypes)

uuid                        str
label_fr                    str
label_en                    str
last_update                 str
last_update_datatourisme    str
uri                         str
dtype: object


In [212]:
# Convert to datetime with French timezone
df_main['last_update'] = pd.to_datetime(df_main['last_update'], errors='coerce')
df_main['last_update_datatourisme'] = pd.to_datetime(df_main['last_update_datatourisme'], errors='coerce')

In [213]:
print(df_main.dtypes)

uuid                                        str
label_fr                                    str
label_en                                    str
last_update                      datetime64[us]
last_update_datatourisme    datetime64[us, UTC]
uri                                         str
dtype: object


In [214]:
df_types.head(5)

Unnamed: 0,uuid,type_name
0,000016ad-998e-3b6a-80b7-25531604fc11,PointOfInterest
1,000016ad-998e-3b6a-80b7-25531604fc11,CulturalSite
2,000016ad-998e-3b6a-80b7-25531604fc11,PlaceOfInterest
3,000018f6-c7e6-3762-9713-5595c558c900,Accommodation
4,000018f6-c7e6-3762-9713-5595c558c900,RentalAccommodation


In [215]:
print(f"\n Dataset Shape: {df_types.shape}")
print(f"   Rows: {df_types.shape[0]:,}")
print(f"   Columns: {df_types.shape[1]}") # is good since one uuid can have multiple types 


 Dataset Shape: (2497258, 2)
   Rows: 2,497,258
   Columns: 2


In [216]:
missing_labels = df_types[df_types['type_name'].isnull()]
print(missing_labels[['uuid', 'type_name']]) # no null values 

Empty DataFrame
Columns: [uuid, type_name]
Index: []


In [217]:
# check if uuid is unique 
print("\n1️  UUID Uniqueness:")
uuid_count = len(df_types)
unique_uuids = df_types['uuid'].nunique()
print(f"   Total rows: {uuid_count:,}")
print(f"   Unique UUIDs: {unique_uuids:,}")


1️  UUID Uniqueness:
   Total rows: 2,497,258
   Unique UUIDs: 476,424


In [218]:
df_locations.head(5)

Unnamed: 0,uuid,latitude,longitude,elevation,street_address,postal_code,city,city_label_fr,city_label_en,city_insee
0,000016ad-998e-3b6a-80b7-25531604fc11,45.233402,6.722158,2002.0,,73500,Avrieux,Avrieux,Avrieux,73026
1,000018f6-c7e6-3762-9713-5595c558c900,44.57023,0.44739,,"Château Lescaut, 746 route de Laperche",47800,Montignac-de-Lauzun,Montignac-de-Lauzun,Montignac-de-Lauzun,47188
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,43.7609,2.9526,,Route d'Arnac,12360,Brusque,Brusque,Brusque,12039
3,000041c5-b92b-3236-9768-6e560cea215c,43.240773,2.765577,,Rue de la fontaine fraîche,11200,Argens-Minervois,Argens-Minervois,Argens-Minervois,11013
4,00004466-aa48-3660-8bce-01ff6b03e7a5,48.171538,7.320499,,10 route du Vin,68340,Zellenberg,Zellenberg,Zellenberg,68383


In [219]:
print(df_locations.dtypes)

uuid                  str
latitude          float64
longitude         float64
elevation         float64
street_address        str
postal_code           str
city                  str
city_label_fr         str
city_label_en         str
city_insee            str
dtype: object


In [220]:
# Count missing values
missing_counts = df_locations.isnull().sum()
missing_percentages = (missing_counts / len(df_locations) * 100).round(2)
df_missing = pd.DataFrame({
    'Column': df_locations.columns,
    'Missing_Count': missing_counts.values,
    'Missing_Percentage': missing_percentages.values,
    'Non_Missing': len(df_locations) - missing_counts.values,
    'Data_Type': df_locations.dtypes.values
})
print(df_missing) #is better to delete to column elevation 

           Column  Missing_Count  Missing_Percentage  Non_Missing Data_Type
0            uuid              0                0.00       476506       str
1        latitude              0                0.00       476506   float64
2       longitude              0                0.00       476506   float64
3       elevation         425787               89.36        50719   float64
4  street_address          47143                9.89       429363       str
5     postal_code              0                0.00       476506       str
6            city              0                0.00       476506       str
7   city_label_fr              0                0.00       476506       str
8   city_label_en              0                0.00       476506       str
9      city_insee              0                0.00       476506       str


In [221]:
df_locations.head(5)

Unnamed: 0,uuid,latitude,longitude,elevation,street_address,postal_code,city,city_label_fr,city_label_en,city_insee
0,000016ad-998e-3b6a-80b7-25531604fc11,45.233402,6.722158,2002.0,,73500,Avrieux,Avrieux,Avrieux,73026
1,000018f6-c7e6-3762-9713-5595c558c900,44.57023,0.44739,,"Château Lescaut, 746 route de Laperche",47800,Montignac-de-Lauzun,Montignac-de-Lauzun,Montignac-de-Lauzun,47188
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,43.7609,2.9526,,Route d'Arnac,12360,Brusque,Brusque,Brusque,12039
3,000041c5-b92b-3236-9768-6e560cea215c,43.240773,2.765577,,Rue de la fontaine fraîche,11200,Argens-Minervois,Argens-Minervois,Argens-Minervois,11013
4,00004466-aa48-3660-8bce-01ff6b03e7a5,48.171538,7.320499,,10 route du Vin,68340,Zellenberg,Zellenberg,Zellenberg,68383


In [222]:
# GPS CHECK - df_locations

print("GPS Check - df_locations:\n")
print(f"   Total records: {len(df_locations):,}")
print(f"   Missing lat: {df_locations['latitude'].isnull().sum():,}")
print(f"   Missing lon: {df_locations['longitude'].isnull().sum():,}")
print(f"   Lat range: {df_locations['latitude'].min():.2f} to {df_locations['latitude'].max():.2f} (valid: -90 to 90)")
print(f"   Lon range: {df_locations['longitude'].min():.2f} to {df_locations['longitude'].max():.2f} (valid: -180 to 180)")

zero_count = ((df_locations['latitude'] == 0) & (df_locations['longitude'] == 0)).sum()
print(f"   (0,0) coords: {zero_count:,}")

in_france = (
    (df_locations['latitude'] >= 41) & 
    (df_locations['latitude'] <= 51.5) & 
    (df_locations['longitude'] >= -5.5) & 
    (df_locations['longitude'] <= 10)
).sum()
print(f"  In France: {in_france:,} ({in_france/len(df_locations)*100:.1f}%)")

GPS Check - df_locations:

   Total records: 476,506
   Missing lat: 0
   Missing lon: 0
   Lat range: -61.00 to 51.08 (valid: -90 to 90)
   Lon range: -115.23 to 168.09 (valid: -180 to 180)
   (0,0) coords: 8
  In France: 471,013 (98.8%)


In [223]:
print(df_descriptions.dtypes)

uuid                       str
description_fr          object
description_en          object
short_description_fr    object
short_description_en    object
dtype: object


In [224]:
# Convert object to string dtype
df_descriptions['description_fr'] = df_descriptions['description_fr'].astype('string')
df_descriptions['description_en'] = df_descriptions['description_en'].astype('string')
df_descriptions['short_description_fr'] = df_descriptions['short_description_fr'].astype('string')
df_descriptions['short_description_en'] = df_descriptions['short_description_en'].astype('string')

In [225]:
print(df_descriptions.dtypes)

uuid                       str
description_fr          string
description_en          string
short_description_fr    string
short_description_en    string
dtype: object


In [226]:
df_descriptions.head(5)

Unnamed: 0,uuid,description_fr,description_en,short_description_fr,short_description_en
0,000016ad-998e-3b6a-80b7-25531604fc11,,,"La télégraphie Chappe, ancêtre de nos télécomm...","Chappe telegraphy, the forerunner of modern te..."
1,000018f6-c7e6-3762-9713-5595c558c900,"À Montignac-de-Lauzun, à proximité des commerc...","In Montignac-de-Lauzun, close to shops and the...","À Montignac-de-Lauzun, à proximité des commerc...","In Montignac-de-Lauzun, close to the shops and..."
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,Animations estivales (Les Mardis de Brusque) à...,Summer activities (Les Mardis de Brusque) near...,"Camping calme, ombragé situé en bordure du Dou...","Quiet, shady campsite on the banks of the Dour..."
3,000041c5-b92b-3236-9768-6e560cea215c,,,La guinguette est un agréable restaurant où l'...,La guinguette is a pleasant restaurant where y...
4,00004466-aa48-3660-8bce-01ff6b03e7a5,« La Maison Jean Huttard est le reflet d’une ...,"""The House of Jean Huttard is the reflection o...","Vignerons Indépendants depuis 1860, nous vous ...","Independent winemakers since 1860, we offer un..."


In [227]:
df_descriptions.shape

(438862, 5)

In [228]:
# Check for empty cells in each column
null_counts = df_descriptions.isnull().sum()
empty_string_counts = (df_descriptions == '').sum()
whitespace_counts = df_descriptions.apply(lambda col: col.str.strip().eq('') if col.dtype == 'string' else 0).sum()

print (null_counts)

uuid                         0
description_fr          130997
description_en          131792
short_description_fr    145352
short_description_en    143752
dtype: int64


In [229]:
no_description_any = df_descriptions[
    (df_descriptions['description_fr'].isna()) & 
    (df_descriptions['short_description_fr'].isna()) &
    (df_descriptions['description_en'].isna()) & 
    (df_descriptions['short_description_en'].isna())
]
print("uuid with no description", no_description_any)

uuid with no description                                         uuid description_fr description_en  \
61      0007e1ac-2866-3bdd-87cc-68b0552e448d           <NA>           <NA>   
433     0038083b-20c2-3c87-9bc0-77fd42ce57af           <NA>           <NA>   
484     003ffd28-6d05-3a9a-b43a-98008b86356b           <NA>           <NA>   
516     00440ce0-ba42-3eef-b6f8-a6dd0793c31e           <NA>           <NA>   
925     0084b24d-d7fd-3f5c-8d65-12cb027046a4           <NA>           <NA>   
...                                      ...            ...            ...   
437463  ff3b2a83-d2b8-3729-b81c-7a5f09b96fd7           <NA>           <NA>   
437551  ff47767c-181d-343b-a5f7-fd88da26ac9f           <NA>           <NA>   
438005  ff8e7acb-4ca7-33f8-8900-1cd9f9993f80           <NA>           <NA>   
438101  ff9dacee-c39f-3e75-b9a4-eb49bfb8ac0e           <NA>           <NA>   
438703  fff73676-605f-3ddd-b932-8e0621c1ad13           <NA>           <NA>   

       short_description_fr short_desc

In [230]:
# Merge telephone and telephone_all into single 'phone' column
df_contacts['phone'] = df_contacts['telephone_all'].fillna(df_contacts['telephone'])

# Drop old columns
df_contacts = df_contacts.drop(columns=['telephone', 'telephone_all'])

In [231]:
df_contacts.head(5)

Unnamed: 0,uuid,contact_id,legal_name,email,email_all,homepage,homepage_all,identifier,phone
0,000016ad-998e-3b6a-80b7-25531604fc11,0,,marie-annick.blondon@wanadoo.fr,marie-annick.blondon@wanadoo.fr,https://avrieux.com/telegraphe-chappe/,https://avrieux.com/telegraphe-chappe/,,+33 6 87 74 31 54
1,000018f6-c7e6-3762-9713-5595c558c900,0,Madame Elisabeth CAPPONI,e.capponi@orange.fr,e.capponi@orange.fr,https://chateaulescaut.fr/,https://chateaulescaut.fr/,,+33 6 80 44 43 34
2,000018f6-c7e6-3762-9713-5595c558c900,1,La roulotte du Château de Lescaut,e.capponi@orange.fr,e.capponi@orange.fr,https://chateaulescaut.fr/,https://chateaulescaut.fr/,,+33 6 80 44 43 34
3,000035ae-b4f4-3ced-9d4d-23287800fd9e,0,Camping du Dourdou,mairie@brusque.fr,mairie@brusque.fr,,,,+33 5 65 99 53 12
4,000035ae-b4f4-3ced-9d4d-23287800fd9e,1,Camping du Dourdou,,,,,,+33 7 86 11 61 86


In [232]:
print(df_contacts.dtypes)

uuid              str
contact_id      int64
legal_name        str
email             str
email_all         str
homepage          str
homepage_all      str
identifier        str
phone             str
dtype: object


In [233]:
# Convert contact_id to boolean
df_contacts['contact_id'] = df_contacts['contact_id'].astype('bool')

In [234]:
# Merge email and email_all into single 'email' column
df_contacts['email'] = df_contacts['email_all'].fillna(df_contacts['email'])

# Drop old column
df_contacts = df_contacts.drop(columns=['email_all'])


In [235]:
# Merge homepage and homepage_all into single 'homepage' column
df_contacts['homepage'] = df_contacts['homepage_all'].fillna(df_contacts['homepage'])

# Drop old column
df_contacts = df_contacts.drop(columns=['homepage_all'])

In [236]:
df_contacts = df_contacts.drop(columns=['identifier']) # the missing percentage is too big 

In [237]:
df_contacts.head()

Unnamed: 0,uuid,contact_id,legal_name,email,homepage,phone
0,000016ad-998e-3b6a-80b7-25531604fc11,False,,marie-annick.blondon@wanadoo.fr,https://avrieux.com/telegraphe-chappe/,+33 6 87 74 31 54
1,000018f6-c7e6-3762-9713-5595c558c900,False,Madame Elisabeth CAPPONI,e.capponi@orange.fr,https://chateaulescaut.fr/,+33 6 80 44 43 34
2,000018f6-c7e6-3762-9713-5595c558c900,True,La roulotte du Château de Lescaut,e.capponi@orange.fr,https://chateaulescaut.fr/,+33 6 80 44 43 34
3,000035ae-b4f4-3ced-9d4d-23287800fd9e,False,Camping du Dourdou,mairie@brusque.fr,,+33 5 65 99 53 12
4,000035ae-b4f4-3ced-9d4d-23287800fd9e,True,Camping du Dourdou,,,+33 7 86 11 61 86


In [238]:
df_creators.head(5)

Unnamed: 0,uuid,legal_name,identifier,email,email_all,homepage,homepage_all,telephone,logo
0,000016ad-998e-3b6a-80b7-25531604fc11,Haute Maurienne Vanoise Tourisme,582,laurent.sevenier@hautemaurienne.com,laurent.sevenier@hautemaurienne.com | elisabet...,https://www.haute-maurienne-vanoise.com/,https://www.haute-maurienne-vanoise.com/,,
1,000018f6-c7e6-3762-9713-5595c558c900,OT du Pays de Lauzun,ae9d1f79-cda3-4ede-813c-7951207f931f,tourisme@paysdelauzun.com,tourisme@paysdelauzun.com,https://www.paysdelauzun.com/,https://www.paysdelauzun.com/,+33 5 53 94 13 09,
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,HIT Aveyron (ADAT Aveyron),,,,,,,
3,000041c5-b92b-3236-9768-6e560cea215c,AGENCE DE DÉVELOPPEMENT TOURISTIQUE DE L'AUDE,,siti@audetourisme.com,siti@audetourisme.com,https://www.audetourisme.com/,https://www.audetourisme.com/,+33 4 68 11 66 00,https://www.audetourisme.com/lae/public1.0/img...
4,00004466-aa48-3660-8bce-01ff6b03e7a5,Office de tourisme du pays de Ribeauvillé et R...,,,,,,,


In [239]:
print(df_creators.dtypes)

uuid            str
legal_name      str
identifier      str
email           str
email_all       str
homepage        str
homepage_all    str
telephone       str
logo            str
dtype: object


In [240]:
# Check for null/NaN values
for col in df_creators.columns:
    null_count = df_creators[col].isnull().sum()
    total = len(df_contacts)
    percentage = (null_count / total * 100)
    print(f"{col:20s} Null: {null_count:>10,} ({percentage:>6.2f}%)")

uuid                 Null:          0 (  0.00%)
legal_name           Null:         48 (  0.01%)
identifier           Null:    170,962 ( 31.42%)
email                Null:    163,940 ( 30.13%)
email_all            Null:    163,940 ( 30.13%)
homepage             Null:    246,723 ( 45.34%)
homepage_all         Null:    246,723 ( 45.34%)
telephone            Null:    391,098 ( 71.87%)
logo                 Null:    474,687 ( 87.24%)


In [241]:
df_creators['email'] = df_creators['email_all'].fillna(df_creators['email'])
df_creators = df_creators.drop(columns=['email_all'])


In [242]:
df_creators['homepage'] = df_creators['homepage_all'].fillna(df_creators['homepage'])
df_creators = df_creators.drop(columns=['homepage_all'])

In [243]:
# Drop logo column from df_creators
df_creators = df_creators.drop(columns=['logo'])

In [244]:
# Check if emails contain '@' symbol
import re
valid_emails = df_creators['email'].dropna().str.contains('@', na=False).sum()
invalid_emails = df_creators['email'].dropna().shape[0] - valid_emails
print(f"Valid emails: {valid_emails}, Invalid: {invalid_emails}")

Valid emails: 312515, Invalid: 0


In [245]:
# Check if homepages start with http/https
valid_urls = df_creators['homepage'].dropna().str.match(r'^https?://', na=False).sum()
total_urls = df_creators['homepage'].notna().sum()
print(f"Valid URLs: {valid_urls}/{total_urls}")

Valid URLs: 229732/229732


In [246]:
# standardize the phone number
df_creators['telephone'] = df_creators['telephone'].str.replace(r'[\s\-\(\)]', '', regex=True)
df_contacts['phone'] = df_contacts['phone'].str.replace(r'[\s\-\(\)]', '', regex=True)

In [247]:
df_images.head()

Unnamed: 0,uuid,image_url,title_fr,title_en,credits,mime_type,is_covered_by,rights_start_date,rights_end_date
0,000018f6-c7e6-3762-9713-5595c558c900,https://cdt64.media.tourinsoft.eu/upload/monti...,montignac de lauzun-roulotte chateau de lescau...,montignac de lauzun-roulotte chateau de lescau...,©CAPPONI Elisabeth,image/jpeg,By-NC-ND 4.0,,
1,000035ae-b4f4-3ced-9d4d-23287800fd9e,https://medias.hit.enaveyron.fr/fiches/18324/i...,Camping du Dourdou,,Camping du Dourdou,image/webp,,,
2,000041c5-b92b-3236-9768-6e560cea215c,http://cdt11.media.tourinsoft.eu/upload/Restau...,Argens Restaurant La Guinguette,,Céline Deschamps Photographe,,,,
3,000074dc-0c43-3122-a324-1d0dddd8222f,https://cdt64.media.tourinsoft.eu/upload/visue...,Design sans titre - 1,Design sans titre - 1,Piko Paseos - Grop,image/png,By-NC-ND 4.0,,
4,000075e9-8af7-3c15-ba1a-e78b88f075a4,https://cdt64.media.tourinsoft.eu/upload/CouvS...,CouvSem1-2026carre,CouvSem1-2026carre,france services,image/png,By-NC-ND 4.0,,


In [248]:
# Check for null/NaN values
for col in df_images.columns:
    null_count = df_images[col].isnull().sum()
    total = len(df_images)
    percentage = (null_count / total * 100)
    print(f"{col:20s} Null: {null_count:>10,} ({percentage:>6.2f}%)")

uuid                 Null:          0 (  0.00%)
image_url            Null:          0 (  0.00%)
title_fr             Null:     13,596 (  8.99%)
title_en             Null:     86,878 ( 57.43%)
credits              Null:      2,107 (  1.39%)
mime_type            Null:     46,248 ( 30.57%)
is_covered_by        Null:     81,066 ( 53.59%)
rights_start_date    Null:    145,847 ( 96.42%)
rights_end_date      Null:    137,462 ( 90.87%)


In [249]:
# Drop rights_start_date and rights_end_date columns from df_images
df_images = df_images.drop(columns=['rights_start_date', 'rights_end_date'])

In [250]:
print(df_images.dtypes)

uuid             str
image_url        str
title_fr         str
title_en         str
credits          str
mime_type        str
is_covered_by    str
dtype: object


In [251]:
missing_both = df_images[(df_images['title_fr'].isna()) & (df_images['title_en'].isna())].shape[0]
print(f"Missing both titles: {missing_both:,}")

Missing both titles: 13,596


In [252]:
# Ill keeb both language titles for reference

In [253]:
# Check what values exist in is_covered_by
print(df_images['is_covered_by'].value_counts())
print(f"\nUnique values: {df_images['is_covered_by'].nunique()}")

is_covered_by
By-NC-ND 4.0                           69164
Théâtre équestre de Champagne             47
Le Sarrail                                17
©Gîtes de France Aube                     15
L'Aiguillage                              14
                                       ...  
© Les Chambres de Troyes                   1
© Capitaine BNB                            1
©troyes_japan_otaku_festival               1
Le petit cocon du 10                       1
© Cité du Vitrail - Samuel Silvares        1
Name: count, Length: 693, dtype: int64

Unique values: 693


In [254]:
# The data from is_covered_by contains data about the licence under the image is. In case i want to use it later on, I ll need to standardize it

In [255]:
# I ll keep the data in df_images_complete

In [256]:
df_images_complete=df_images

In [257]:
# Drop is_covered_by column from df_images
df_images = df_images.drop(columns=['is_covered_by'])

In [258]:
# For rows where mime_type is missing, extract extension and fill mime_type
def get_mime_from_url(row):
    # If mime_type already exists, keep it
    if pd.notna(row['mime_type']):
        return row['mime_type']
    
    # Extract extension from URL
    match = re.search(r'\.(\w+)(?:\?|$)', row['image_url'])
    if match:
        extension = match.group(1).lower()
        
        # Map extension to MIME type
        extension_to_mime = {
            'jpg': 'image/jpeg',
            'jpeg': 'image/jpeg',
            'png': 'image/png',
            'gif': 'image/gif',
            'webp': 'image/webp',
            'svg': 'image/svg+xml',
            'bmp': 'image/bmp',
            'tif': 'image/tiff',
            'tiff': 'image/tiff'
        }
        
        return extension_to_mime.get(extension, None)
    
    return None

# Apply the function
df_images['mime_type'] = df_images.apply(get_mime_from_url, axis=1)

# Check how many nulls remain
print(f"mime_type nulls after filling: {df_images['mime_type'].isna().sum():,}")

mime_type nulls after filling: 68


In [259]:
# check shape an consistence

print(f"\n Dataset Shape: {df_images.shape}")
print(f"   Rows: {df_images.shape[0]:,}")
print(f"   Columns: {df_images.shape[1]}")


 Dataset Shape: (151269, 6)
   Rows: 151,269
   Columns: 6


In [260]:
print(df_images.dtypes)

uuid         str
image_url    str
title_fr     str
title_en     str
credits      str
mime_type    str
dtype: object


In [261]:
# Find the most common mime_type value
most_common_mime = df_images['mime_type'].mode()[0]

print(f"Most common mime_type: {most_common_mime}")
print(f"Count: {(df_images['mime_type'] == most_common_mime).sum():,}")

# Fill missing mime_type values with the most common one
df_images['mime_type'] = df_images['mime_type'].fillna(most_common_mime)

# Verify no more nulls
print(f"\nmime_type nulls after filling: {df_images['mime_type'].isna().sum()}")

Most common mime_type: image/jpeg
Count: 132,129

mime_type nulls after filling: 0


In [262]:
print(df_images.dtypes)

uuid         str
image_url    str
title_fr     str
title_en     str
credits      str
mime_type    str
dtype: object


In [263]:
df_reviews.head(5)

Unnamed: 0,uuid,rating_label_fr,rating_label_en,rating_key,is_compliant_with
0,000041c5-b92b-3236-9768-6e560cea215c,Accueil Vélo,Accueil Vélo,LabelRating_AccueilVelo,PlaceOfInterest
1,000041c5-b92b-3236-9768-6e560cea215c,Vignobles & Découvertes,Vignobles & Découvertes,LabelRating_VignoblesDecouvertes,"EntertainmentAndEvent, Visit, Restaurant, Bist..."
2,00004466-aa48-3660-8bce-01ff6b03e7a5,Bienvenue à la ferme,Welcome to the farm,LabelRating_BienvenueaLaFerme,"Practice, Guesthouse, CampingAndCaravanning, R..."
3,00017aad-a0ae-3af8-8bdd-5ea5a1206e6a,3 étoiles,3 Stars,ScaleRating_3etoiles,"CampingAndCaravanning, Hotel, SelfCateringAcco..."
4,00018315-087b-33c0-b520-1f561842e15c,2 étoiles,2 Stars,ScaleRating_2etoiles,"CampingAndCaravanning, HolidayCentre, Hotel, C..."


In [264]:
# check for nulls for each column 
print("Null values:")
for col in df_reviews.columns:
    null_count = df_reviews[col].isnull().sum()
    percentage = (null_count / len(df_reviews)) * 100
    print(f"{col:25s} Null: {null_count:>10,} ({percentage:>6.2f}%)")

Null values:
uuid                      Null:          0 (  0.00%)
rating_label_fr           Null:      3,622 (  2.54%)
rating_label_en           Null:      3,622 (  2.54%)
rating_key                Null:      3,622 (  2.54%)
is_compliant_with         Null:      3,622 (  2.54%)


In [265]:
# Print all unique rating_key values
print(f"Total unique rating_keys: {df_reviews['rating_key'].nunique()}\n")
print("All rating_key values:")
print("-" * 60)
for key in sorted(df_reviews['rating_key'].dropna().unique()):
    print(key)

Total unique rating_keys: 160

All rating_key values:
------------------------------------------------------------
LabelRating_AccueilCompostelle
LabelRating_AccueilMotards
LabelRating_AccueilPaysan
LabelRating_AccueilVelo
LabelRating_AireNaturelle
LabelRating_ArchitectureContemporaineRemarquable
LabelRating_ArtisansDeQualite
LabelRating_AubergeCollective
LabelRating_AubergeDeVillage
LabelRating_BibGourmand
LabelRating_BienvenueaLaFerme
LabelRating_BistrotDePays
LabelRating_BistrotGourmand
LabelRating_BonClimat
LabelRating_BottinGourmand
LabelRating_BouillabaisseCharter
LabelRating_CampingQualite
LabelRating_ChambreDhotesReference
LabelRating_Charmance
LabelRating_ChouetteNature
LabelRating_CityBreakConfort
LabelRating_CityBreakLuxury
LabelRating_CityBreakPremium
LabelRating_CleVacances
LabelRating_ClefVerte
LabelRating_CoastlandPreservationArea
LabelRating_CommuneHalte
LabelRating_CommuneTouristique
LabelRating_CyclingAndWheelchairPath
LabelRating_CyclingRoute
LabelRating_Demeter
Labe

In [266]:
# Drop rating_label_fr and rating_label_en columns
df_review_complete=df_reviews
df_reviews = df_reviews.drop(columns=['rating_label_fr', 'rating_label_en'])

In [267]:
df_reviews.head(5)

Unnamed: 0,uuid,rating_key,is_compliant_with
0,000041c5-b92b-3236-9768-6e560cea215c,LabelRating_AccueilVelo,PlaceOfInterest
1,000041c5-b92b-3236-9768-6e560cea215c,LabelRating_VignoblesDecouvertes,"EntertainmentAndEvent, Visit, Restaurant, Bist..."
2,00004466-aa48-3660-8bce-01ff6b03e7a5,LabelRating_BienvenueaLaFerme,"Practice, Guesthouse, CampingAndCaravanning, R..."
3,00017aad-a0ae-3af8-8bdd-5ea5a1206e6a,ScaleRating_3etoiles,"CampingAndCaravanning, Hotel, SelfCateringAcco..."
4,00018315-087b-33c0-b520-1f561842e15c,ScaleRating_2etoiles,"CampingAndCaravanning, HolidayCentre, Hotel, C..."


In [268]:
df_covid.shape

(5307, 3)

In [269]:
df_covid.head(5)

Unnamed: 0,uuid,measures_fr,measures_en
0,00050ac9-6838-3404-a68b-6d01fff4548e,Mise en place et respect des gestes barrière,Implementation of and compliance with barrier ...
1,000c32ab-e80b-3f6b-83b4-29f3cfd2c539,"Sentier en plein -air, accessible et ouvert","Sentier en plein -air, accessible et ouvert"
2,000ea97f-9074-3ad4-aa9d-7199ceaec529,"Professionnel de l’évènementiel, Briffe propos...","Since 2019, Briffe has been offering responsib..."
3,000ff0fc-e26c-3dcc-ba7f-2804a309e325,Mise en place et respect des gestes barrière,Implementation of and compliance with barrier ...
4,0013e8e8-2e6e-36c3-b554-8affb8e24b90,Mise en place et respect des gestes barrière,Implementation of and compliance with barrier ...


In [270]:
# Check for null/NaN values
for col in df_covid.columns:
    null_count = df_covid[col].isnull().sum()
    total = len(df_images)
    percentage = (null_count / total * 100)
    print(f"{col:20s} Null: {null_count:>10,} ({percentage:>6.2f}%)")

uuid                 Null:          0 (  0.00%)
measures_fr          Null:      1,065 (  0.70%)
measures_en          Null:        134 (  0.09%)


In [271]:
import requests

API_BASE_URL = "https://api.datatourisme.fr/v1"
API_KEY = "98bf037a-4b11-4645-aa93-ec008b57f911"

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Accept": "application/json"
}

# Test if the main /thesaurus endpoint works (it did before)
response = requests.get(f"{API_BASE_URL}/thesaurus", headers=headers)
print(f"GET /thesaurus: {response.status_code}")
if response.status_code == 200:
    print(response.json())

GET /thesaurus: 200
{'NamedIndividual': 37140, 'City': 35084, 'Theme': 758, 'Class': 384, 'PointOfInterestClass': 384, 'Amenity': 312, 'Country': 240, 'RouteTheme': 217, 'CycleRouteTheme': 214, 'Rating': 206}


In [272]:
# CORRECT APPROACH: EXTRACT THEME/AMENITY FROM THESAURUS

print("Extracting Theme and Amenity keys from thesaurus...\n")

# The thesaurus items have a 'type' field that tells us what category they belong to
theme_keys = set()
amenity_keys = set()
poi_keys = set()

for item in thesaurus_data['Theme']:
    theme_keys.add(item['key'])

for item in thesaurus_data['Amenity']:
    amenity_keys.add(item['key'])

for item in thesaurus_data['PointOfInterestClass']:
    poi_keys.add(item['key'])

print(f"Theme keys from thesaurus: {len(theme_keys)}")
print(f"Amenity keys from thesaurus: {len(amenity_keys)}")
print(f"POI keys from thesaurus: {len(poi_keys)}")

# Collect all unique values from type field
all_type_values = set()
type_value_counts = {}

for obj in all_objects:
    types = obj.get('type', [])
    for t in types:
        all_type_values.add(t)
        type_value_counts[t] = type_value_counts.get(t, 0) + 1

print(f"\nTotal unique type values: {len(all_type_values)}")

# categoriye type values

poi_in_data = set()
theme_in_data = set()
amenity_in_data = set()
unknown_in_data = set()

for type_val in all_type_values:
    if type_val in poi_keys:
        poi_in_data.add(type_val)
    elif type_val in theme_keys:
        theme_in_data.add(type_val)
    elif type_val in amenity_keys:
        amenity_in_data.add(type_val)
    else:
        unknown_in_data.add(type_val)

print("\n" + "="*70)
print("TYPE FIELD BREAKDOWN")
print("="*70)
print(f"PointOfInterestClass: {len(poi_in_data)} types")
print(f"Theme: {len(theme_in_data)} types")
print(f"Amenity: {len(amenity_in_data)} types")
print(f"Unknown/Invalid: {len(unknown_in_data)} types")

# create tables for each category

print("\n" + "="*70)
print("CREATING CATEGORIZED TABLES")
print("="*70)

poi_data = []
theme_data = []
amenity_data = []

for obj in all_objects:
    uuid = obj.get('uuid')
    types = obj.get('type', [])
    
    for type_val in types:
        if type_val in poi_keys:
            poi_data.append({
                'uuid': uuid,
                'poi_class': type_val
            })
        elif type_val in theme_keys:
            theme_data.append({
                'uuid': uuid,
                'theme': type_val
            })
        elif type_val in amenity_keys:
            amenity_data.append({
                'uuid': uuid,
                'amenity': type_val
            })

df_poi = pd.DataFrame(poi_data)
df_theme = pd.DataFrame(theme_data)
df_amenity = pd.DataFrame(amenity_data)

print(f" df_poi (PointOfInterestClass): {len(df_poi):,} rows")
print(f" df_theme: {len(df_theme):,} rows")
print(f" df_amenity: {len(df_amenity):,} rows")

# Add categorized tables
dataframes['poi_classes'] = df_poi
dataframes['theme'] = df_theme
dataframes['amenity'] = df_amenity

# Keep original types table
dataframes['types_all'] = dataframes['types']

# Replace types with just POI classes (most common use case)
dataframes['types'] = df_poi

print("\n Updated dfs dictionary")

# export all tables

print("\n" + "="*70)
print("EXPORTING CATEGORIZED TABLES")
print("="*70)

if not df_poi.empty:
    df_poi.to_csv('clean_poi_classes.csv', index=False, encoding='utf-8')
    print(f" clean_poi_classes.csv: {len(df_poi):,} rows")

if not df_theme.empty:
    df_theme.to_csv('clean_theme.csv', index=False, encoding='utf-8')
    print(f" clean_theme.csv: {len(df_theme):,} rows")

if not df_amenity.empty:
    df_amenity.to_csv('clean_amenity.csv', index=False, encoding='utf-8')
    print(f" clean_amenity.csv: {len(df_amenity):,} rows")

print("\n Complete!")

# SUMMARY STATISTICS

print("\n" + "="*70)
print("SUMMARY")
print("="*70)

total_objects = len(all_objects)
objects_with_poi = df_poi['uuid'].nunique() if not df_poi.empty else 0
objects_with_theme = df_theme['uuid'].nunique() if not df_theme.empty else 0
objects_with_amenity = df_amenity['uuid'].nunique() if not df_amenity.empty else 0

print(f"Total objects: {total_objects:,}")
print(f"Objects with POI classes: {objects_with_poi:,} ({objects_with_poi/total_objects*100:.1f}%)")
print(f"Objects with Theme: {objects_with_theme:,} ({objects_with_theme/total_objects*100:.1f}%)")
print(f"Objects with Amenity: {objects_with_amenity:,} ({objects_with_amenity/total_objects*100:.1f}%)")

print(f"\nAverage per object:")
if objects_with_poi > 0:
    print(f"  POI classes per object: {len(df_poi)/objects_with_poi:.2f}")
if objects_with_theme > 0:
    print(f"  Themes per object: {len(df_theme)/objects_with_theme:.2f}")
if objects_with_amenity > 0:
    print(f"  Amenities per object: {len(df_amenity)/objects_with_amenity:.2f}")

Extracting Theme and Amenity keys from thesaurus...

Theme keys from thesaurus: 756
Amenity keys from thesaurus: 312
POI keys from thesaurus: 384

Total unique type values: 393

TYPE FIELD BREAKDOWN
PointOfInterestClass: 372 types
Theme: 0 types
Amenity: 1 types
Unknown/Invalid: 20 types

CREATING CATEGORIZED TABLES
 df_poi (PointOfInterestClass): 2,141,626 rows
 df_theme: 0 rows
 df_amenity: 3,512 rows

 Updated dfs dictionary

EXPORTING CATEGORIZED TABLES
 clean_poi_classes.csv: 2,141,626 rows
 clean_amenity.csv: 3,512 rows

 Complete!

SUMMARY
Total objects: 476,528
Objects with POI classes: 476,424 (100.0%)
Objects with Theme: 0 (0.0%)
Objects with Amenity: 3,512 (0.7%)

Average per object:
  POI classes per object: 4.50
  Amenities per object: 1.00


In [273]:
# Print just the theme keys for reference
for idx, item in enumerate(thesaurus_data['Theme'], 1):
    print(f"{idx:3d}. {item.get('key')}")

print(f"\nTotal: {len(thesaurus_data['Theme'])} themes")

  1. ATV
  2. AdventureCourse
  3. AerospaceSpace
  4. AfricanCuisine
  5. AirSports
  6. AirTourDeFrance
  7. Alpine
  8. AlpineSkiing
  9. AmericanFootball
 10. AnimationFilm
 11. Anthropology
 12. Antique
 13. Antiquity
 14. AquaWalking
 15. Arboretum
 16. Archeology
 17. Archery
 18. ArchitectureAndHeritage
 19. ArtGallery
 20. ArtNouveauOrArtDeco
 21. ArtSale
 22. ArtistsHouse
 23. ArtsAndCraft
 24. AsianCuisine
 25. Astrology
 26. Astronomy
 27. AtAFarm
 28. AtTheFootOfTheSlopes
 29. AtTheVineyard
 30. Athletics
 31. Automates
 32. Automobile
 33. Badminton
 34. Balneotherapy
 35. Baroque
 36. Baseball
 37. Basketball
 38. Basketry
 39. BasquePelota
 40. Bathing
 41. BeachVolleyball
 42. BeeProduct
 43. Beer
 44. Biathlon
 45. BicycleTouring
 46. Bike
 47. BikeRoute0020GrandTourDeManche
 48. BikeRoute0021PetitTourDeManche
 49. BikeRoute0022LaLoireAVelo
 50. BikeRoute0102ValleeDeLaSerre
 51. BikeRoute0103ValleeDeLOurcq
 52. BikeRoute0300VoieVerteDEvreuxALaValleeDuBec
 53. BikeRout

In [274]:
# DEBUG: WHY NO THEMES?

# 1. Check what's in theme_keys
print(f"\n1. Theme keys loaded: {len(theme_keys)}")
print(f"   Sample theme keys: {list(theme_keys)[:10]}")

# 2. Check what's actually in the type field
print(f"\n2. Analyzing 'type' field in first 100 objects:")
all_types_sample = set()
for obj in all_objects[:100]:
    types = obj.get('type', [])
    all_types_sample.update(types)

print(f"   Unique types found: {len(all_types_sample)}")
print(f"   Sample types: {list(all_types_sample)[:20]}")

# 3. Check for overlap
themes_in_sample = all_types_sample & theme_keys
print(f"\n3. Themes found in sample: {len(themes_in_sample)}")
if themes_in_sample:
    print(f"   {list(themes_in_sample)}")

# 4. Manual search for common theme keywords
print(f"\n4. Searching for common theme-related keywords in types:")
theme_keywords = ['Golf', 'Cinema', 'Sport', 'Culture', 'Nature', 'Bike', 'Wine', 'Beach']
for keyword in theme_keywords:
    matching = [t for t in all_types_sample if keyword.lower() in t.lower()]
    if matching:
        print(f"   '{keyword}': {matching[:5]}")

# 5. Check if themes are POI or Amenity instead
print(f"\n5. Classification check:")
misclassified_as_poi = []
misclassified_as_amenity = []

for theme_key in list(theme_keys)[:50]:  # Check first 50 theme keys
    if theme_key in poi_keys:
        misclassified_as_poi.append(theme_key)
    if theme_key in amenity_keys:
        misclassified_as_amenity.append(theme_key)

print(f"   Themes also in POI keys: {len(misclassified_as_poi)}")
if misclassified_as_poi:
    print(f"      Examples: {misclassified_as_poi[:10]}")

print(f"   Themes also in Amenity keys: {len(misclassified_as_amenity)}")
if misclassified_as_amenity:
    print(f"      Examples: {misclassified_as_amenity[:10]}")

# 6. Check thesaurus structure
print(f"\n6. Thesaurus structure check:")
sample_theme_item = thesaurus_data['Theme'][0]
print(f"   Sample theme item structure:")
print(f"   {json.dumps(sample_theme_item, indent=2, ensure_ascii=False)[:500]}")

# 7. Full scan with detailed logging
print(f"\n7. Full data scan for themes:")
theme_matches = {}
for obj in all_objects:
    types = obj.get('type', [])
    for t in types:
        if t in theme_keys:
            theme_matches[t] = theme_matches.get(t, 0) + 1

print(f"   Total themes found: {len(theme_matches)}")
if theme_matches:
    print(f"   Themes and counts:")
    for theme, count in sorted(theme_matches.items(), key=lambda x: x[1], reverse=True):
        print(f"      {theme}: {count}")
else:
    print(f"  NO THEMES FOUND IN ENTIRE DATASET")



1. Theme keys loaded: 756
   Sample theme keys: ['BikeRouteV47ValleeDuLoirAVelo', 'BikeRoute0365CheminsDeSaintJacques', 'Balneotherapy', 'DogDriving', 'MetalWorking', 'Wine', 'Snowboarding', 'BikeRouteV33LaSeineAVelo', 'Pie', 'PriestMonastic']

2. Analyzing 'type' field in first 100 objects:
   Unique types found: 93
   Sample types: ['Forest', 'LocalTouristOffice', 'EquestrianCenter', 'Visit', 'TrainingWorkshop', 'ChildrensEvent', 'ExhibitionEvent', 'Festival', 'YouthHostelAndInternationalCenter', 'CulturalEvent', 'ServiceProvider', 'RentalAccommodation', 'AccommodationProduct', 'Bakery', 'HorseTour', 'ThemePark', 'RiverPort', 'Producer', 'Conference', 'MusicEvent']

3. Themes found in sample: 0

4. Searching for common theme-related keywords in types:
   'Sport': ['SportsEvent', 'Transport', 'SportsAndLeisurePlace', 'SportsCompetition']
   'Beach': ['Beach']

5. Classification check:
   Themes also in POI keys: 1
      Examples: ['MiniGolf']
   Themes also in Amenity keys: 1
      E

In [275]:
# CREATE THEME TABLE 
print("="*70)
print("RECREATING THEME TABLE")
print("="*70)

# Make sure theme_keys is properly set
theme_keys = {item['key'] for item in thesaurus_data['Theme']}

print(f"Theme keys loaded: {len(theme_keys)}")
print(f"Sample theme keys: {list(theme_keys)[:10]}")

# Create theme data
theme_data = []

for obj in all_objects:
    uuid = obj.get('uuid')
    types = obj.get('type', [])
    
    for type_val in types:
        if type_val in theme_keys:
            theme_data.append({
                'uuid': uuid,
                'theme': type_val
            })

print(f"\nTheme records created: {len(theme_data)}")

if theme_data:
    df_theme = pd.DataFrame(theme_data)
    
    print(f"\n df_theme: {len(df_theme):,} rows")
    print(f" Unique objects: {df_theme['uuid'].nunique():,}")
    print(f" Unique themes: {df_theme['theme'].nunique()}")
    
    print("\nTheme breakdown:")
    print(df_theme['theme'].value_counts())
    
    # Add to dataframes
    dataframes['theme'] = df_theme
    
    # Export
    df_theme.to_csv('clean_theme.csv', index=False, encoding='utf-8')
    print(f"\n Exported to clean_theme.csv")
    
else:
    print("\n No theme data found")
    df_theme = pd.DataFrame(columns=['uuid', 'theme'])
    dataframes['theme'] = df_theme


# COMPLETE CATEGORIZATION SUMMARY

print("\n" + "="*70)
print("COMPLETE DATA CATEGORIZATION")
print("="*70)

total_objects = len(all_objects)

print(f"Total objects: {total_objects:,}\n")

print("Categories breakdown:")
print(f"  POI Classes:  {len(dataframes['poi_classes']):10,} rows  |  {dataframes['poi_classes']['uuid'].nunique():7,} objects  |  {dataframes['poi_classes']['poi_class'].nunique():3} types")
print(f"  Themes:       {len(dataframes['theme']):10,} rows  |  {dataframes['theme']['uuid'].nunique():7,} objects  |  {dataframes['theme']['theme'].nunique():3} types")
print(f"  Amenities:    {len(dataframes['amenity']):10,} rows  |  {dataframes['amenity']['uuid'].nunique():7,} objects  |  {dataframes['amenity']['amenity'].nunique():3} types")

print("\n" + "="*70)
print("THEMES FOUND IN YOUR DATA")
print("="*70)

if not df_theme.empty:
    for theme, count in df_theme['theme'].value_counts().items():
        # Get French label
        theme_info = next((item for item in thesaurus_data['Theme'] if item['key'] == theme), None)
        if theme_info:
            label_fr = theme_info.get('name', {}).get('@fr', theme)
            label_en = theme_info.get('name', {}).get('@en', theme)
            print(f"  {theme:20s} {count:5,} objects   (FR: {label_fr}, EN: {label_en})")
        else:
            print(f"  {theme:20s} {count:5,} objects")


# EXPORT ALL CATEGORIZED TABLES

print("\n" + "="*70)
print("EXPORTING ALL CATEGORIZED TABLES")
print("="*70)

export_summary = []

# Export POI
if not dataframes['poi_classes'].empty:
    dataframes['poi_classes'].to_csv('clean_poi_classes.csv', index=False, encoding='utf-8')
    export_summary.append(('clean_poi_classes.csv', len(dataframes['poi_classes'])))
    print(f" clean_poi_classes.csv: {len(dataframes['poi_classes']):,} rows")

# Export Theme
if not dataframes['theme'].empty:
    dataframes['theme'].to_csv('clean_theme.csv', index=False, encoding='utf-8')
    export_summary.append(('clean_theme.csv', len(dataframes['theme'])))
    print(f" clean_theme.csv: {len(dataframes['theme']):,} rows")

# Export Amenity
if not dataframes['amenity'].empty:
    dataframes['amenity'].to_csv('clean_amenity.csv', index=False, encoding='utf-8')
    export_summary.append(('clean_amenity.csv', len(dataframes['amenity'])))
    print(f" clean_amenity.csv: {len(dataframes['amenity']):,} rows")

# Export all other tables
print("\nExporting other tables:")
for name, df in dataframes.items():
    if name not in ['poi_classes', 'theme', 'amenity', 'types_all', 'types_original']:
        filename = f'clean_{name}.csv'
        df.to_csv(filename, index=False, encoding='utf-8')
        export_summary.append((filename, len(df)))
        print(f" {filename}: {len(df):,} rows")

print("\n" + "="*70)
print(" ALL DATA PROCESSED AND EXPORTED")
print("="*70)

print(f"\nTotal files exported: {len(export_summary)}")
for filename, rows in export_summary:
    print(f"  📄 {filename:30s} {rows:10,} rows")

RECREATING THEME TABLE
Theme keys loaded: 756
Sample theme keys: ['BikeRouteV47ValleeDuLoirAVelo', 'BikeRoute0365CheminsDeSaintJacques', 'Balneotherapy', 'DogDriving', 'MetalWorking', 'Wine', 'Snowboarding', 'BikeRouteV33LaSeineAVelo', 'Pie', 'PriestMonastic']

Theme records created: 3631

 df_theme: 3,631 rows
 Unique objects: 3,626
 Unique themes: 6

Theme breakdown:
theme
Cinema        2214
Pharmacy       525
MiniGolf       471
ViaFerrata     263
Culture        120
RailBike        38
Name: count, dtype: int64

 Exported to clean_theme.csv

COMPLETE DATA CATEGORIZATION
Total objects: 476,528

Categories breakdown:
  POI Classes:   2,141,626 rows  |  476,424 objects  |  372 types
  Themes:            3,631 rows  |    3,626 objects  |    6 types
  Amenities:         3,512 rows  |    3,512 objects  |    1 types

THEMES FOUND IN YOUR DATA
  Cinema               2,214 objects   (FR: Cinéma, EN: Cinema)
  Pharmacy               525 objects   (FR: Pharmacie, EN: Pharmacy)
  MiniGolf        

In [276]:
# Quick check
df_poi_classes = pd.read_csv('clean_poi_classes.csv')
df_poi_classes.shape

(2141626, 2)

In [277]:
df_poi_classes.head(5)

Unnamed: 0,uuid,poi_class
0,000016ad-998e-3b6a-80b7-25531604fc11,PointOfInterest
1,000016ad-998e-3b6a-80b7-25531604fc11,CulturalSite
2,000016ad-998e-3b6a-80b7-25531604fc11,PlaceOfInterest
3,000018f6-c7e6-3762-9713-5595c558c900,Accommodation
4,000018f6-c7e6-3762-9713-5595c558c900,RentalAccommodation


In [278]:
# Quick check
df_theme = pd.read_csv('clean_theme.csv')
df_theme.shape

(3631, 2)

In [279]:
df_theme.head(5)

Unnamed: 0,uuid,theme
0,00158bc9-91eb-32ad-9631-10494aff55b4,Cinema
1,00174d74-1b21-3eba-a7b6-ebbbb64b5a4d,Cinema
2,001ad642-ffdb-388e-be70-f5413c34714c,MiniGolf
3,00209608-23a3-37ef-8cf3-827aa6630b3a,Cinema
4,0021dac9-3661-3c09-8086-67ec1bdbc2ce,Cinema


In [280]:
# Quick check
df_amenity = pd.read_csv('clean_amenity.csv')
df_amenity.shape

(3512, 2)

In [281]:
df_amenity.head(5)

Unnamed: 0,uuid,amenity
0,001bba3b-f7d1-3d07-ab5a-96bae79501f9,Park
1,002ae7ca-1944-3b74-b695-184d864e4264,Park
2,003d8420-bb2d-326e-b98e-d40fd4cb5fa1,Park
3,00407f14-6316-373d-ac63-4a39516abab9,Park
4,005413f5-b4b4-3b58-a5e8-2cf3a6cc511d,Park


In [282]:
#prepare the data for the Star Schema df_main will be fact_pois_table
#we drop columns last_update and uri
df_main.head(5)
columns_to_drop = ['last_update', 'uri']
df_main = df_main.drop(columns=columns_to_drop, errors='ignore')

In [283]:
df_contacts.shape

(544145, 6)

In [284]:
#df_locations will transform to dim_regions 
#we keep the latitude, longitude, postal_code and city. I will use the city_insee to define the regions later on
#the approach is to fetch api data from api.insee.fr and match the regions

df_locations.head(5)
#extract the columns I am interested in transfering to columns 
df_locations_subset = df_locations[['uuid', 'street_address', 'postal_code']]
# Merge with df_contacts (keep all contacts, add location info where available)
# Create subset with only non-null street_address and postal_code
df_locations_subset = df_locations[
    ['uuid', 'street_address', 'postal_code']
].copy()

# Keep only rows where at least one field is not null
df_locations_subset = df_locations_subset[
    df_locations_subset['street_address'].notna() | 
    df_locations_subset['postal_code'].notna()
]

# Remove duplicate UUIDs (keep first)
df_locations_subset = df_locations_subset.drop_duplicates(subset=['uuid'], keep='first')
df_contacts = df_contacts.merge(
    df_locations_subset,
    on='uuid',
    how='left'
)
columns_to_drop = ['elevation', 'city_label_fr','city_label_en','street_adress']
df_locations = df_locations.drop(columns=columns_to_drop, errors='ignore')
df_locations.head()


Unnamed: 0,uuid,latitude,longitude,street_address,postal_code,city,city_insee
0,000016ad-998e-3b6a-80b7-25531604fc11,45.233402,6.722158,,73500,Avrieux,73026
1,000018f6-c7e6-3762-9713-5595c558c900,44.57023,0.44739,"Château Lescaut, 746 route de Laperche",47800,Montignac-de-Lauzun,47188
2,000035ae-b4f4-3ced-9d4d-23287800fd9e,43.7609,2.9526,Route d'Arnac,12360,Brusque,12039
3,000041c5-b92b-3236-9768-6e560cea215c,43.240773,2.765577,Rue de la fontaine fraîche,11200,Argens-Minervois,11013
4,00004466-aa48-3660-8bce-01ff6b03e7a5,48.171538,7.320499,10 route du Vin,68340,Zellenberg,68383


In [285]:
df_contacts.shape 

(544145, 8)

In [286]:
# building the df_regions

In [287]:
import pandas as pd
import requests
import time

def fetch_insee_communes():
    """Fetch all French communes with INSEE codes and regions"""
    
    base_url = "https://geo.api.gouv.fr"
    
    # Get regions
    regions = pd.DataFrame(requests.get(f"{base_url}/regions").json())
    regions = regions.rename(columns={'code': 'region_code', 'nom': 'region_name'})
    
    # Get departments
    departments = pd.DataFrame(requests.get(f"{base_url}/departements").json())
    departments = departments.rename(columns={
        'code': 'department_code',
        'nom': 'department_name',
        'codeRegion': 'region_code'
    })
    departments = departments.merge(regions[['region_code', 'region_name']], on='region_code')
    
    # Get communes
    all_communes = []
    for dept_code in departments['department_code']:
        try:
            url = f"{base_url}/departements/{dept_code}/communes"
            all_communes.extend(requests.get(url, timeout=10).json())
            time.sleep(0.05)
        except:
            continue
    
    # Build DataFrame
    df = pd.DataFrame(all_communes)
    df = df.rename(columns={
        'code': 'insee_code',
        'nom': 'commune_name',
        'codeDepartement': 'department_code',
        'codeRegion': 'region_code',
        'codesPostaux': 'postal_codes'
    })
    
    # Add department and region names
    df = df.merge(
        departments[['department_code', 'department_name', 'region_name']],
        on='department_code',
        how='left'
    )
    
    # Select columns
    cols = ['insee_code', 'commune_name', 'postal_codes', 'department_code', 
            'department_name', 'region_code', 'region_name']
    df = df[[col for col in cols if col in df.columns]]
    
    return df


# Usage
df_insee = fetch_insee_communes()
df_insee.to_csv('insee_communes.csv', index=False, encoding='utf-8')
print(f" {len(df_insee):,} communes fetched")

 34,875 communes fetched


In [288]:
df_insee.head(5)

Unnamed: 0,insee_code,commune_name,postal_codes,department_code,department_name,region_code,region_name
0,1001,L'Abergement-Clémenciat,[01400],1,Ain,84,Auvergne-Rhône-Alpes
1,1002,L'Abergement-de-Varey,[01640],1,Ain,84,Auvergne-Rhône-Alpes
2,1004,Ambérieu-en-Bugey,[01500],1,Ain,84,Auvergne-Rhône-Alpes
3,1005,Ambérieux-en-Dombes,[01330],1,Ain,84,Auvergne-Rhône-Alpes
4,1006,Ambléon,[01300],1,Ain,84,Auvergne-Rhône-Alpes


In [289]:
df_codes = df_insee[['insee_code', 'region_name']]
df_codes = df_insee[['insee_code', 'region_name']].rename(columns={'insee_code': 'city_insee'})

In [297]:
df_locations = df_locations_copy.merge(
    df_codes,
    on='city_insee',
    how='left'
)

In [298]:
df_locations.head(2) #df_locations will transform in dim_regions

Unnamed: 0,uuid,latitude,longitude,elevation,street_address,postal_code,city,city_label_fr,city_label_en,city_insee,region_name
0,000016ad-998e-3b6a-80b7-25531604fc11,45.233402,6.722158,2002.0,,73500,Avrieux,Avrieux,Avrieux,73026,Auvergne-Rhône-Alpes
1,000018f6-c7e6-3762-9713-5595c558c900,44.57023,0.44739,,"Château Lescaut, 746 route de Laperche",47800,Montignac-de-Lauzun,Montignac-de-Lauzun,Montignac-de-Lauzun,47188,Nouvelle-Aquitaine


In [299]:
columns_to_drop = ['elevation', 'street_address', 'city_label_fr', 'city_label_en', 'city_insee']
df_locations = df_locations.drop(columns=columns_to_drop, errors='ignore')

In [300]:
df_regions = df_locations

In [304]:
df_images.head()

Unnamed: 0,uuid,image_url,title_fr,title_en,credits,mime_type
0,000018f6-c7e6-3762-9713-5595c558c900,https://cdt64.media.tourinsoft.eu/upload/monti...,montignac de lauzun-roulotte chateau de lescau...,montignac de lauzun-roulotte chateau de lescau...,©CAPPONI Elisabeth,image/jpeg
1,000035ae-b4f4-3ced-9d4d-23287800fd9e,https://medias.hit.enaveyron.fr/fiches/18324/i...,Camping du Dourdou,,Camping du Dourdou,image/webp
2,000041c5-b92b-3236-9768-6e560cea215c,http://cdt11.media.tourinsoft.eu/upload/Restau...,Argens Restaurant La Guinguette,,Céline Deschamps Photographe,image/jpeg
3,000074dc-0c43-3122-a324-1d0dddd8222f,https://cdt64.media.tourinsoft.eu/upload/visue...,Design sans titre - 1,Design sans titre - 1,Piko Paseos - Grop,image/png
4,000075e9-8af7-3c15-ba1a-e78b88f075a4,https://cdt64.media.tourinsoft.eu/upload/CouvS...,CouvSem1-2026carre,CouvSem1-2026carre,france services,image/png


In [305]:
columns_to_drop = ['title_fr', 'title_en', 'credits']
df_images = df_images.drop(columns=columns_to_drop, errors='ignore')

In [306]:
import pandas as pd

# List of all your dataframes
dataframes = {
    'df_regions': df_regions,
    'df_amenity': df_amenity,
    'df_contacts': df_contacts,
    'df_covid': df_covid,
    'df_creators': df_creators,
    'df_descriptions': df_descriptions,
    'df_images': df_images,
    'df_main': df_main,
    'df_poi_classes': df_poi_classes,
    'df_reviews': df_reviews,
    'df_theme': df_theme,
    'df_types': df_types
}

for name, df in dataframes.items():
    print(f" {name}")
    print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"   Columns: {list(df.columns)}")
    print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"\n   Sample (first row):")
    if not df.empty:
        print(df.head(1).to_string(index=False))
    print("\n" + "-"*80 + "\n")

 df_regions
   Shape: 476,506 rows × 6 columns
   Columns: ['uuid', 'latitude', 'longitude', 'postal_code', 'city', 'region_name']
   Memory: 52.93 MB

   Sample (first row):
                                uuid  latitude  longitude postal_code    city          region_name
000016ad-998e-3b6a-80b7-25531604fc11 45.233402   6.722158       73500 Avrieux Auvergne-Rhône-Alpes

--------------------------------------------------------------------------------

 df_amenity
   Shape: 3,512 rows × 2 columns
   Columns: ['uuid', 'amenity']
   Memory: 0.19 MB

   Sample (first row):
                                uuid amenity
001bba3b-f7d1-3d07-ab5a-96bae79501f9    Park

--------------------------------------------------------------------------------

 df_contacts
   Shape: 544,145 rows × 8 columns
   Columns: ['uuid', 'contact_id', 'legal_name', 'email', 'homepage', 'phone', 'street_address', 'postal_code']
   Memory: 100.42 MB

   Sample (first row):
                                uuid  contact_

In [None]:
# Create output folder
output_folder = Path('ready_for_etl_dataframes')
output_folder.mkdir(exist_ok=True)

# List of all your dataframes
dataframes = {
    'df_regions': df_regions,
    'df_amenity': df_amenity,
    'df_contacts':df_contacts,
    'df_covid': df_covid,
    'df_creators': df_creators,
    'df_descriptions': df_descriptions,
    'df_images': df_images,
    'df_main': df_main,
    'df_poi_classes': df_poi_classes,
    'df_reviews': df_reviews,
    'df_theme': df_theme,
    'df_types': df_types
}

print(" Saving DataFrames to CSV...\n")

for name, df in dataframes.items():
    if not df.empty:
        filename = output_folder / f"{name}.csv"
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"{name:<20} → {filename} ({len(df):,} rows)")
    else:
        print(f" {name:<20} → Skipped (empty)")

print(f"\n All files saved to: {output_folder.absolute()}")