No need to rerun these scripts; they are only here to explain how we chose the datasets.

In [None]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import BytesIO

# City slug and country/region path; both are interpolated into the download
# URL built in brute_force_temporal_berlin().
CITY = "berlin"
COUNTRY = "germany/be"
# Directory the retained dataset is written to (created if it does not exist).
SAVE_DIR = "../data"
# Minimum share of non-null values each checked column must reach (0.70 = 70%).
MIN_FILL_RATE = 0.70

def generate_last_two_years_dates():
    """Return one ``YYYY-MM-DD`` string per day of the last 2 * 365 days.

    The window ends at ``datetime.now()`` and starts 730 days earlier, both
    bounds included. The list is ordered from the most recent date to the
    oldest one. A one-line summary is printed before returning.
    """
    now = datetime.now()
    window = timedelta(days=2 * 365)

    daily_index = pd.date_range(start=now - window, end=now, freq='D')
    # date_range is chronological; callers want the newest date first.
    newest_first = list(reversed(daily_index.strftime('%Y-%m-%d').tolist()))

    newest, oldest = newest_first[0], newest_first[-1]
    print(f"{len(newest_first)} dates g√©n√©r√©es (de {newest} √† {oldest})")
    return newest_first

def is_dataset_valid(df, date_str, min_fill_rate=None):
    """Tell whether every checked column of ``df`` is sufficiently filled.

    The checked columns are 'price', 'amenities', 'beds', 'bedrooms' and a
    bathroom column ('bathrooms_text' if present, otherwise 'bathrooms').
    The fill rate of a column is its share of non-null values; a column
    missing from ``df`` scores 0.0. One line per column is printed.

    Args:
        df: DataFrame of listings to evaluate.
        date_str: Label of the snapshot, only used in the printed header.
        min_fill_rate: Minimum fill rate in [0, 1]. When None (default), the
            module-level MIN_FILL_RATE is used.

    Returns:
        True only if every checked column reaches the threshold. A DataFrame
        with zero rows is rejected: its fill rates are NaN, which never
        reaches the threshold.
    """
    threshold = MIN_FILL_RATE if min_fill_rate is None else min_fill_rate
    core_features = ['price', 'amenities', 'beds', 'bedrooms']

    if 'bathrooms_text' in df.columns:
        bath_col = 'bathrooms_text'
    elif 'bathrooms' in df.columns:
        bath_col = 'bathrooms'
    else:
        bath_col = None

    completeness_scores = {
        feat: df[feat].notnull().mean() if feat in df.columns else 0.0
        for feat in core_features
    }
    completeness_scores['bathrooms'] = df[bath_col].notnull().mean() if bath_col else 0.0

    is_valid = True
    print(f"  Evaluation {date_str} (seuil : {threshold*100}%)")

    for feat, fill_rate in completeness_scores.items():
        # Single comparison drives both the printed status and the verdict.
        # Testing `fill_rate < threshold` separately let NaN (empty frame)
        # print "echec" yet still be accepted, since NaN fails both tests.
        passed = fill_rate >= threshold
        status = "ok" if passed else "echec"
        print(f"     {status} : '{feat}' = {fill_rate*100:.1f}%")
        if not passed:
            is_valid = False

    return is_valid

def brute_force_temporal_berlin():
    """Scan daily snapshot URLs, newest first, and keep the first valid one.

    For each date returned by generate_last_two_years_dates(), a HEAD request
    probes the listings archive for CITY/COUNTRY. When the server answers 200,
    the archive is downloaded, loaded with pandas and checked with
    is_dataset_valid(). The scan stops at the first dataset that passes, which
    is then written to SAVE_DIR as a gzip-compressed CSV.

    Returns:
        (save_path, dataframe) for the retained snapshot, or (None, None) when
        no scanned date yields a dataset that passes validation.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)

    dates_to_test = generate_last_two_years_dates()
    print(f"\nScan quotidien sur les serveurs pour {CITY}")

    winning_date = None
    df_valid = None
    hits_found = 0

    # The context manager releases the pooled connections once the scan ends.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0'})

        for date_str in dates_to_test:
            url = f"http://data.insideairbnb.com/{COUNTRY}/{CITY}/{date_str}/data/listings.csv.gz"

            try:
                head_response = session.head(url, timeout=5)
                if head_response.status_code != 200:
                    continue

                hits_found += 1
                print(f"\nFichier trouv√© : {date_str}")
                print("   T√©l√©chargement...")

                get_response = session.get(url, timeout=20)
                # Fail with an explicit HTTP error instead of a gzip parsing
                # error when the GET does not return the archive.
                get_response.raise_for_status()
                df_temp = pd.read_csv(
                    BytesIO(get_response.content), compression='gzip', low_memory=False
                )

                if is_dataset_valid(df_temp, date_str):
                    winning_date = date_str
                    df_valid = df_temp
                    print(f"\nDate retenue : {winning_date}")
                    break

                print("   Fichier trouv√© mais recal√© sur les crit√®res de qualit√©")

            except requests.exceptions.Timeout:
                # Best-effort scan: a date that times out is skipped silently.
                continue
            except Exception as e:
                print(f"   Erreur pour {date_str} : {e}")

    if winning_date and df_valid is not None:
        save_path = f"{SAVE_DIR}/{CITY}_listings_raw_{winning_date}.csv.gz"
        df_valid.to_csv(save_path, index=False, compression='gzip')
        print(f"\nSauvegard√© : {save_path}")
        return save_path, df_valid

    # Report the number of dates actually scanned and the configured
    # threshold rather than hard-coded figures.
    print(
        f"\nEchec : {hits_found} fichiers trouv√©s sur {len(dates_to_test)} jours, "
        f"aucun ne d√©passe {MIN_FILL_RATE*100:.0f}% de compl√©tude"
    )
    print("Les donn√©es de Berlin sur cette p√©riode sont trop incompl√®tes")
    return None, None

# Run the scan when the cell executes; both values are None if no date qualifies.
save_path, df_berlin = brute_force_temporal_berlin()


🗓️ 731 dates générées (de 2026-02-21 à 2024-02-22).

--- DÉMARRAGE DU SCAN QUOTIDIEN SUR LES SERVEURS POUR BERLIN ---

🎯 [BINGO] Fichier détecté sur le serveur à la date : 2025-11-24
   📥 Téléchargement en cours (GET)...
  📊 Évaluation 2025-11-24 (Seuil min: 70.0%) :
     ❌ ÉCHEC : 'price' = 64.2%
     ✅ PASS  : 'amenities' = 100.0%
     ❌ ÉCHEC : 'beds' = 64.2%
     ✅ PASS  : 'bedrooms' = 86.0%
     ✅ PASS  : 'bathrooms' = 99.9%
   ⏭️ Fichier existant mais recalé par vos critères de qualité. On continue...

🎯 [BINGO] Fichier détecté sur le serveur à la date : 2025-09-23
   📥 Téléchargement en cours (GET)...
  📊 Évaluation 2025-09-23 (Seuil min: 70.0%) :
     ❌ ÉCHEC : 'price' = 64.9%
     ✅ PASS  : 'amenities' = 100.0%
     ❌ ÉCHEC : 'beds' = 65.0%
     ✅ PASS  : 'bedrooms' = 85.8%
     ✅ PASS  : 'bathrooms' = 99.9%
   ⏭️ Fichier existant mais recalé par vos critères de qualité. On continue...

üéØ [BINGO] 

On ne garde pas Berlin : on perd trop de données. On ne peut pas supposer que les valeurs manquantes sont i.i.d., ce qui détruirait la construction en quantiles ; on cherche donc une autre ville.

On cherche, parmi les villes d'Europe, celles ayant une grande proportion d'observations renseignées.

In [None]:
import os
import requests
import re
import pandas as pd
from io import BytesIO

# Directory the retained datasets are written to (created if it does not exist).
SAVE_DIR = "../data"
# Minimum share of non-null values each checked column must reach (0.70 = 70%).
MIN_FILL_RATE = 0.70
# Number of valid cities to collect; passed to find_top_k_valid_domains().
TARGET_DOMAIN_COUNT = 3 

# (country/region path, city slug) pairs, tried in this order; both parts are
# interpolated into the download URL.
CANDIDATE_CITIES = [
    ("italy/lazio", "rome"),
    ("the-netherlands/north-holland", "amsterdam"),
    ("united-kingdom/england", "london"),
    ("spain/comunidad-de-madrid", "madrid"),
    ("austria/vienna", "vienna"),
    ("ireland/leinster", "dublin"),
    ("portugal/lisbon", "lisbon"),
    ("greece/attica", "athens"),
    ("belgium/bru", "brussels")
]

def get_latest_date_for_city(country, city, fallback_date="2024-03-20"):
    """Return the most recent snapshot date listed for ``city`` on the portal.

    The "get the data" page is fetched and searched for listings archive URLs
    whose city segment equals ``city``; the date segment of each match is
    collected and the greatest one (string comparison) is returned.

    Args:
        country: Country/region path of the city. Not used by the lookup (the
            pattern accepts any two path segments before the city); kept for
            interface compatibility.
        city: City slug to look for; matched literally.
        fallback_date: Date returned when the page cannot be fetched or lists
            no archive for ``city``.

    Returns:
        The latest date string found, or ``fallback_date``.
    """
    portal_url = "http://insideairbnb.com/get-the-data/"
    headers = {'User-Agent': 'Mozilla/5.0'}
    # re.escape keeps a slug containing regex metacharacters from altering the pattern.
    pattern = (
        r"https?://data\.insideairbnb\.com/[^/]+/[^/]+/"
        + re.escape(city)
        + r"/([^/]+)/data/listings\.csv\.gz"
    )

    try:
        response = requests.get(portal_url, headers=headers, timeout=10)
        dates_found = re.findall(pattern, response.text)
    except Exception as e:
        # Still best-effort, but the failure is reported instead of hidden.
        print(f"   Erreur pour {city} : {e}")
        return fallback_date

    if dates_found:
        return max(dates_found)
    return fallback_date

def is_dataset_valid(df, city, date_str, min_fill_rate=None):
    """Tell whether every checked column of ``df`` is sufficiently filled.

    The checked columns are 'price', 'amenities', 'beds', 'bedrooms' and a
    bathroom column ('bathrooms_text' if present, otherwise 'bathrooms').
    The fill rate of a column is its share of non-null values; a column
    missing from ``df`` scores 0.0. One line per column is printed.

    Args:
        df: DataFrame of listings to evaluate.
        city: City label, only used in the printed header.
        date_str: Snapshot date; accepted for interface compatibility and not
            used by the check.
        min_fill_rate: Minimum fill rate in [0, 1]. When None (default), the
            module-level MIN_FILL_RATE is used.

    Returns:
        True only if every checked column reaches the threshold. A DataFrame
        with zero rows is rejected: its fill rates are NaN, which never
        reaches the threshold.
    """
    threshold = MIN_FILL_RATE if min_fill_rate is None else min_fill_rate
    core_features = ['price', 'amenities', 'beds', 'bedrooms']

    if 'bathrooms_text' in df.columns:
        bath_col = 'bathrooms_text'
    elif 'bathrooms' in df.columns:
        bath_col = 'bathrooms'
    else:
        bath_col = None

    completeness_scores = {
        feat: df[feat].notnull().mean() if feat in df.columns else 0.0
        for feat in core_features
    }
    completeness_scores['bathrooms'] = df[bath_col].notnull().mean() if bath_col else 0.0

    is_valid = True
    print(f"  Scores pour {city} :")
    for feat, fill_rate in completeness_scores.items():
        # Single comparison drives both the printed status and the verdict.
        # Testing `fill_rate < threshold` separately let NaN (empty frame)
        # print "echec" yet still be accepted, since NaN fails both tests.
        passed = fill_rate >= threshold
        status = "ok" if passed else "echec"
        print(f"     {status} : '{feat}' = {fill_rate*100:.1f}%")
        if not passed:
            is_valid = False

    return is_valid

def find_top_k_valid_domains(k=3):
    """Collect up to ``k`` candidate cities whose latest dataset passes validation.

    CANDIDATE_CITIES is walked in order. For each city the latest snapshot
    date is looked up, the listings archive is downloaded and loaded with
    pandas, and is_dataset_valid() decides whether it is kept. Kept datasets
    are written to SAVE_DIR as gzip-compressed CSV. The walk stops as soon as
    ``k`` cities have been kept.

    Returns:
        A list of dicts with keys 'city', 'path' and 'dataframe', one per
        retained city, in the order they were found.
    """
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    print(f"Recherche des {k} meilleurs domaines")

    retained = []

    for country, city in CANDIDATE_CITIES:
        if len(retained) >= k:
            break

        print(f"\n{city} ({len(retained)}/{k} trouv√©s)")
        snapshot_date = get_latest_date_for_city(country, city)
        url = f"http://data.insideairbnb.com/{country}/{city}/{snapshot_date}/data/listings.csv.gz"

        try:
            response = requests.get(url, stream=True, timeout=15)
            if response.status_code != 200:
                print(f"   Fichier introuvable (code {response.status_code})")
                continue

            print("   T√©l√©chargement...")
            archive = BytesIO(response.content)
            listings = pd.read_csv(archive, compression='gzip', low_memory=False)

            if not is_dataset_valid(listings, city, snapshot_date):
                print(f"   {city} recal√©")
                continue

            save_path = f"{SAVE_DIR}/{city}_listings_raw_{snapshot_date}.csv.gz"
            listings.to_csv(save_path, index=False, compression='gzip')

            print(f"   {city} retenu")
            retained.append({'city': city, 'path': save_path, 'dataframe': listings})

        except Exception as e:
            # Any failure for one city is reported and the walk moves on.
            print(f"   Erreur pour {city} : {e}")

    print(f"\nScan termin√© : {len(retained)} villes retenues")
    for city_name in (entry['city'] for entry in retained):
        print(f" - {city_name.capitalize()}")

    return retained

# Run the search when the cell executes; the result holds at most TARGET_DOMAIN_COUNT cities.
top_domains = find_top_k_valid_domains(TARGET_DOMAIN_COUNT)


--- DÉMARRAGE DU SCANNER : RECHERCHE DU TOP 3 DOMAINES ---

🌍 Évaluation du domaine : ROME (0/3 trouvés)
   📥 Téléchargement en mémoire...
  📊 Scores pour ROME :
     ✅ PASS  : 'price' = 89.1%
     ✅ PASS  : 'amenities' = 100.0%
     ✅ PASS  : 'beds' = 89.3%
     ✅ PASS  : 'bedrooms' = 97.7%
     ✅ PASS  : 'bathrooms' = 99.9%

🏆 ROME validé et sauvegardé !

🌍 Évaluation du domaine : AMSTERDAM (1/3 trouvés)
   📥 Téléchargement en mémoire...
  📊 Scores pour AMSTERDAM :
     ❌ ÉCHEC : 'price' = 56.0%
     ✅ PASS  : 'amenities' = 100.0%
     ❌ ÉCHEC : 'beds' = 56.3%
     ✅ PASS  : 'bedrooms' = 97.1%
     ✅ PASS  : 'bathrooms' = 99.9%
   💥 AMSTERDAM disqualifié.

üåç √âvaluation du domaine : LONDON (1/3 trouv√©s)
   üì• T√©l√©chargement en m√©moire...
  üìä Scores pour LONDON :
     ‚ùå √âCHEC : 'price' = 64.0%
     ‚úÖ PASS  : 'amenities' = 100.0%
     ‚ùå √âCHEC : 'beds' = 64.0%
     ‚úÖ PASS  : 'bedrooms' = 86.8%
     ‚úÖ PAS