In [40]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json

In [41]:
# Configuration de l'affichage pandas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [42]:
# Liste des villes
cities = ["Paris", "Lyon", "Marseille", "Toulouse", "Bordeaux", "Nantes", "Nice"]

In [43]:
class Numbeo:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def extract_cost_of_living(self, cities):
        """Extraction des données brutes du coût de la vie"""
        raw_data = []
        for city in cities:
            try:
                print(f"Extraction des données de coût de la vie pour {city}...")
                url = f"https://www.numbeo.com/cost-of-living/in/{city}?displayCurrency=EUR"
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                tables = soup.find_all('table', class_='data_wide_table')
                
                data = {'City': city}
                for table in tables:
                    for row in table.find_all('tr'):
                        cells = row.find_all('td')
                        if len(cells) >= 2:
                            item_name = cells[0].text.strip()
                            price_text = cells[1].text.strip()
                            data[item_name] = price_text
                
                raw_data.append(data)
                time.sleep(2)
            except Exception as e:
                print(f"Erreur extraction coût de la vie {city}: {str(e)}")
        
        # Sauvegarde des données brutes
        with open('data/extract/cost_of_living_raw.json', 'w', encoding='utf-8') as f:
            json.dump(raw_data, f, ensure_ascii=False, indent=4)
        
        df = pd.DataFrame(raw_data)
        df.to_csv('data/extract/cost_of_living_raw.csv', index=False)
        
        return raw_data

    def extract_health(self, cities):
        """Extraction des données brutes de santé"""
        raw_data = []
        for city in cities:
            try:
                print(f"Extraction des données de santé pour {city}...")
                url = f"https://www.numbeo.com/health-care/in/{city}"
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                data = {'City': city}
                
                # Extraction de l'index de santé général
                health_index = soup.find('td', string=re.compile('Health Care Index:'))
                if health_index and health_index.find_next_sibling():
                    data['Health Care Index'] = health_index.find_next_sibling().text.strip()
                
                # Extraction des composants de santé depuis la table
                table = soup.find('table', {'class': 'table_builder_with_value_explanation data_wide_table'})
                if table:
                    for row in table.find_all('tr')[1:]:  # Skip header row
                        cols = row.find_all('td')
                        if len(cols) >= 3:
                            component = cols[0].text.strip()
                            value = cols[2].text.strip()
                            data[component] = value
                
                raw_data.append(data)
                time.sleep(2)
                
            except Exception as e:
                print(f"Erreur extraction santé {city}: {str(e)}")
        
        # Sauvegarde des données brutes
        with open('data/extract/health_raw.json', 'w', encoding='utf-8') as f:
            json.dump(raw_data, f, ensure_ascii=False, indent=4)
        
        df = pd.DataFrame(raw_data)
        df.to_csv('data/extract/health_raw.csv', index=False)
        
        return raw_data

In [44]:
class DataCleaner:
    def clean_cost_of_living(self, raw_data):
        """Nettoyage des données du coût de la vie"""
        cleaned_data = []
        for city_data in raw_data:
            cleaned_city_data = {'City': city_data['City']}
            for key, value in city_data.items():
                if key != 'City':
                    if "Mortgage Interest Rate" in key:
                        cleaned_city_data[key] = self.clean_price(value).replace(' €', '')
                    else:
                        cleaned_city_data[key] = self.clean_price(value)
            cleaned_data.append(cleaned_city_data)

        # Sauvegarde des données nettoyées
        with open('data/transform/cost_of_living_cleaned.json', 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=4)
        
        df = pd.DataFrame(cleaned_data)
        df.to_csv('data/transform/cost_of_living_cleaned.csv', index=False)
        
        return cleaned_data

    def clean_health(self, raw_data):
        """Nettoyage des données de santé"""
        cleaned_data = []
        for city_data in raw_data:
            cleaned_city_data = {'City': city_data['City']}
            for key, value in city_data.items():
                if key != 'City':
                    try:
                        # Remplacer \n par un espace et nettoyer
                        value = value.replace('\n', ' ').strip()
                        # Convertir en float et arrondir à 2 décimales
                        if value.split(' ')[0].replace('.', '').isdigit():
                            number = float(value.split(' ')[0])
                            rating = ' '.join(value.split(' ')[1:])  # Garde la partie textuelle (High, Very High, etc.)
                            cleaned_value = f"{number:.2f} {rating}"
                            cleaned_city_data[key] = cleaned_value
                        else:
                            cleaned_city_data[key] = value
                    except (ValueError, AttributeError):
                        cleaned_city_data[key] = value

            cleaned_data.append(cleaned_city_data)

        # Sauvegarde des données nettoyées
        with open('data/transform/health_cleaned.json', 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=4)
        
        df = pd.DataFrame(cleaned_data)
        df.to_csv('data/transform/health_cleaned.csv', index=False)
        
        return cleaned_data

    @staticmethod
    def clean_price(price_text):
        """Nettoyage des prix"""
        try:
            price = price_text.replace(' ', '').strip()
            if '€' in price:
                price = price.replace('€', '').strip()
            price = price.replace(',', '.')
            
            if price and price.replace('.', '').isdigit():
                value = float(price)
                return f"{value:.2f} €"
            return price_text.strip()
        except:
            return price_text.strip()

In [45]:
class DataStorage:
    def save_final_data(self, cost_data, health_data):
        """Sauvegarde des données finales"""
        # Préparation des DataFrames
        cost_df = pd.DataFrame(cost_data).set_index('City')
        health_df = pd.DataFrame(health_data).set_index('City')
        
        # Sauvegarde des données finales
        cost_df.to_csv('data/load/cost_of_living_final.csv')
        health_df.to_csv('data/load/health_final.csv')
        
        # Sauvegarde en JSON
        with open('data/load/cost_of_living_final.json', 'w', encoding='utf-8') as f:
            json.dump(cost_data, f, ensure_ascii=False, indent=4)
        
        with open('data/load/health_final.json', 'w', encoding='utf-8') as f:
            json.dump(health_data, f, ensure_ascii=False, indent=4)
        
        return cost_df, health_df


In [46]:
def create_directories():
    """Création des répertoires nécessaires"""
    import os
    directories = ['data/extract', 'data/transform', 'data/load']
    for directory in directories:
        os.makedirs(directory, exist_ok=True)

In [47]:
def main():
    # Liste des villes
    cities = ["Paris", "Lyon", "Marseille", "Toulouse", "Bordeaux", "Nantes", "Nice"]
    
    # Création des répertoires
    create_directories()
    
    # Initialisation des classes
    numbeo = Numbeo()
    cleaner = DataCleaner()
    storage = DataStorage()
    
    # 1. Extraction
    print("1. Extraction des données...")
    cost_raw_data = numbeo.extract_cost_of_living(cities)
    health_raw_data = numbeo.extract_health(cities)
    
    # 2. Transformation
    print("\n2. Nettoyage des données...")
    cost_cleaned_data = cleaner.clean_cost_of_living(cost_raw_data)
    health_cleaned_data = cleaner.clean_health(health_raw_data)
    
    # 3. Chargement
    print("\n3. Sauvegarde des données finales...")
    cost_df, health_df = storage.save_final_data(cost_cleaned_data, health_cleaned_data)
    
    print("\nAperçu des données de coût de la vie :")
    display(cost_df)
    
    print("\nAperçu des données de santé :")
    display(health_df)

if __name__ == "__main__":
    main()

1. Extraction des données...
Extraction des données de coût de la vie pour Paris...


Extraction des données de coût de la vie pour Lyon...
Extraction des données de coût de la vie pour Marseille...
Extraction des données de coût de la vie pour Toulouse...
Extraction des données de coût de la vie pour Bordeaux...
Extraction des données de coût de la vie pour Nantes...
Extraction des données de coût de la vie pour Nice...
Extraction des données de santé pour Paris...
Extraction des données de santé pour Lyon...
Extraction des données de santé pour Marseille...
Extraction des données de santé pour Toulouse...
Extraction des données de santé pour Bordeaux...
Extraction des données de santé pour Nantes...
Extraction des données de santé pour Nice...

2. Nettoyage des données...

3. Sauvegarde des données finales...

Aperçu des données de coût de la vie :


Unnamed: 0_level_0,"Meal, Inexpensive Restaurant","Meal for 2 People, Mid-range Restaurant, Three-course",McMeal at McDonalds (or Equivalent Combo Meal),Domestic Beer (0.5 liter draught),Imported Beer (0.33 liter bottle),Cappuccino (regular),Coke/Pepsi (0.33 liter bottle),Water (0.33 liter bottle),"Milk (regular), (1 liter)",Loaf of Fresh White Bread (500g),"Rice (white), (1kg)",Eggs (regular) (12),Local Cheese (1kg),Chicken Fillets (1kg),Beef Round (1kg) (or Equivalent Back Leg Red Meat),Apples (1kg),Banana (1kg),Oranges (1kg),Tomato (1kg),Potato (1kg),Onion (1kg),Lettuce (1 head),Water (1.5 liter bottle),Bottle of Wine (Mid-Range),Domestic Beer (0.5 liter bottle),Cigarettes 20 Pack (Marlboro),One-way Ticket (Local Transport),Monthly Pass (Regular Price),Taxi Start (Normal Tariff),Taxi 1km (Normal Tariff),Taxi 1hour Waiting (Normal Tariff),Gasoline (1 liter),Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car),Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car),"Basic (Electricity, Heating, Cooling, Water, Garbage) for 85m2 Apartment",Mobile Phone Monthly Plan with Calls and 10GB+ Data,"Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)","Fitness Club, Monthly Fee for 1 Adult",Tennis Court Rent (1 Hour on Weekend),"Cinema, International Release, 1 Seat","Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child","International Primary School, Yearly for 1 Child",1 Pair of Jeans (Levis 501 Or Similar),"1 Summer Dress in a Chain Store (Zara, H&M, ...)",1 Pair of Nike Running Shoes (Mid-Range),1 Pair of Men Leather Business Shoes,Apartment (1 bedroom) in City Centre,Apartment (1 bedroom) Outside of Centre,Apartment (3 bedrooms) in City Centre,Apartment (3 bedrooms) Outside of Centre,Price per Square Meter to Buy Apartment in City Centre,Price per Square Meter to Buy Apartment Outside of Centre,Average Monthly Net Salary (After Tax),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate"
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
Paris,15.00 €,70.00 €,12.00 €,7.00 €,3.43 €,3.91 €,3.88 €,2.92 €,1.33 €,1.95 €,2.62 €,4.41 €,21.25 €,12.56 €,19.80 €,3.21 €,2.12 €,3.17 €,3.60 €,2.35 €,2.61 €,1.56 €,0.74 €,8.00 €,2.23 €,12.50 €,2.15 €,86.40 €,8.00 €,2.00 €,50.00 €,1.89 €,"30,870.00 €","32,331.25 €",236.21 €,15.14 €,31.84 €,33.80 €,16.93 €,13.00 €,730.62 €,"14,857.14 €",100.11 €,42.50 €,102.59 €,138.20 €,"1,339.49 €","1,011.18 €","3,050.00 €","2,061.54 €","12,052.54 €","8,590.00 €","2,992.52 €",3.72
Lyon,15.00 €,70.00 €,11.00 €,7.00 €,2.45 €,3.41 €,2.87 €,1.64 €,1.35 €,2.38 €,2.31 €,3.75 €,21.20 €,13.28 €,21.38 €,2.49 €,2.03 €,3.47 €,3.46 €,2.38 €,2.35 €,1.36 €,0.75 €,8.00 €,2.34 €,11.75 €,2.00 €,70.00 €,5.00 €,1.80 €,40.00 €,1.79 €,"32,395.00 €","32,012.50 €",228.92 €,21.71 €,32.54 €,38.80 €,20.00 €,11.00 €,560.00 €,"10,333.33 €",89.67 €,37.83 €,87.00 €,133.00 €,904.54 €,630.75 €,"1,815.83 €","1,118.75 €","6,048.00 €","3,899.67 €","2,591.38 €",3.76
Marseille,15.00 €,62.50 €,10.00 €,6.85 €,2.45 €,3.14 €,2.60 €,1.61 €,1.19 €,2.12 €,2.31 €,3.44 €,21.20 €,11.80 €,21.38 €,2.49 €,2.03 €,3.25 €,3.46 €,2.38 €,1.92 €,1.36 €,0.75 €,6.75 €,2.34 €,11.75 €,1.80 €,48.00 €,5.00 €,3.00 €,26.50 €,1.79 €,"32,395.00 €","30,868.00 €",166.42 €,21.71 €,29.44 €,37.00 €,19.50 €,11.00 €,560.00 €,"8,833.33 €",89.67 €,37.00 €,87.00 €,133.00 €,762.18 €,615.00 €,"1,338.33 €","1,178.00 €","5,168.62 €","3,078.50 €","2,001.74 €",3.84
Toulouse,14.50 €,60.00 €,11.00 €,6.75 €,3.00 €,2.88 €,2.42 €,1.56 €,1.27 €,2.00 €,2.64 €,3.61 €,22.00 €,12.56 €,15.76 €,2.83 €,2.68 €,2.80 €,3.06 €,2.37 €,2.30 €,1.56 €,0.81 €,7.00 €,3.42 €,12.75 €,1.90 €,56.00 €,2.60 €,1.30 €,35.20 €,1.77 €,"32,395.00 €","32,812.50 €",163.63 €,19.34 €,30.85 €,35.88 €,23.00 €,12.50 €,753.55 €,"16,000.00 €",100.00 €,39.00 €,103.62 €,133.33 €,748.33 €,578.57 €,"1,491.67 €","1,050.00 €","4,341.25 €","3,078.38 €","2,587.30 €",3.71
Bordeaux,17.50 €,60.00 €,10.00 €,7.50 €,3.75 €,3.50 €,2.41 €,1.34 €,1.08 €,1.85 €,2.65 €,4.47 €,13.25 €,13.42 €,18.25 €,2.85 €,2.00 €,3.50 €,3.84 €,2.20 €,2.28 €,1.67 €,0.66 €,9.00 €,3.58 €,11.00 €,1.80 €,42.85 €,2.00 €,1.66 €,39.00 €,1.81 €,"32,395.00 €","32,812.50 €",219.71 €,21.81 €,28.92 €,32.72 €,25.00 €,12.00 €,664.50 €,"5,800.00 €",91.25 €,37.33 €,80.00 €,119.80 €,767.50 €,600.83 €,"1,438.89 €","1,094.44 €","5,403.83 €","4,375.00 €","2,351.00 €",3.79
Nantes,15.00 €,55.00 €,10.00 €,7.00 €,2.83 €,3.06 €,2.30 €,1.30 €,1.01 €,1.85 €,2.65 €,3.30 €,13.25 €,13.42 €,18.25 €,2.85 €,1.62 €,2.00 €,3.00 €,1.50 €,1.77 €,1.25 €,0.50 €,6.00 €,3.02 €,11.00 €,1.80 €,58.50 €,3.50 €,2.48 €,32.10 €,1.77 €,"32,395.00 €","31,062.50 €",172.78 €,21.81 €,23.60 €,27.15 €,10.00 €,12.00 €,664.50 €,"5,800.00 €",91.25 €,28.75 €,80.00 €,119.80 €,677.00 €,589.17 €,"1,575.00 €","1,220.00 €","4,624.24 €","3,382.29 €","2,182.00 €",3.62
Nice,20.00 €,80.00 €,10.50 €,7.00 €,2.20 €,3.09 €,3.04 €,2.29 €,1.41 €,1.58 €,2.28 €,4.10 €,20.29 €,14.00 €,12.63 €,2.51 €,1.78 €,2.11 €,4.67 €,2.08 €,1.69 €,1.18 €,0.53 €,7.00 €,2.37 €,11.50 €,1.70 €,40.00 €,3.00 €,2.08 €,25.80 €,1.77 €,"32,395.00 €","33,312.50 €",222.19 €,21.67 €,33.42 €,30.56 €,21.25 €,14.00 €,"1,600.00 €","17,500.00 €",93.75 €,41.25 €,100.00 €,163.00 €,969.17 €,846.15 €,"1,875.00 €","1,716.67 €","6,878.43 €","3,898.17 €","2,484.62 €",3.77



Aperçu des données de santé :


Unnamed: 0_level_0,Skill and competency of medical staff,Speed in completing examinations and reports,Equipment for modern diagnosis and treatment,Accuracy and completeness in filling out reports,Friendliness and courtesy of the staff,Satisfaction with responsiveness (waitings) in medical institutions,Satisfaction with cost to you,Convenience of location for you
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Paris,82.49 Very High,75.40 High,87.17 Very High,78.38 High,67.61 High,58.83 Moderate,82.18 Very High,84.14 Very High
Lyon,81.06 Very High,75.00 High,86.72 Very High,78.33 High,67.19 High,54.55 Moderate,80.30 Very High,81.25 Very High
Marseille,86.76 Very High,65.62 High,88.24 Very High,83.82 Very High,75.00 High,72.06 High,91.18 Very High,87.50 Very High
Toulouse,80.65 Very High,75.86 High,87.93 Very High,76.72 High,77.42 High,67.50 High,83.87 Very High,75.83 High
Bordeaux,90.38 Very High,82.69 Very High,84.62 Very High,84.62 Very High,80.77 Very High,72.92 High,86.54 Very High,88.46 Very High
Nantes,91.67 Very High,78.33 High,86.67 Very High,83.93 Very High,76.67 High,58.33 Moderate,82.14 Very High,75.00 High
Nice,89.00 Very High,82.00 Very High,91.30 Very High,85.23 Very High,79.00 High,68.75 High,91.67 Very High,90.91 Very High
