In [None]:
from pymongo import MongoClient
import polars as pl
import json, re
import pandas as pd


In [None]:
# Open a client on the local MongoDB server, then walk down to the
# "netcites" database and its "logements" collection.
client = MongoClient("mongodb://localhost:27017")
db = client.netcites
col = db.logements

In [None]:
# Sanity check: list the database names visible to the client and count the
# documents in `col` that match an empty filter.
print("Bases disponibles :", client.list_database_names())
print("Nombre de documents :", col.count_documents({}))


Bases disponibles : ['admin', 'config', 'local', 'ma_bd_mongoDB_migration', 'netcites']
Nombre de documents : 95885


In [None]:
# Print a single document fetched without any filter, to inspect its fields.
print("Exemple de document :", col.find_one())

Exemple de document : {'_id': ObjectId('68e8e20eca89cc237f6b7f6c'), 'id': 80260, 'listing_url': 'https://www.airbnb.com/rooms/80260', 'scrape_id': 20240610195007, 'last_scraped': '2024-06-13', 'source': 'previous scrape', 'name': "Nice studio in Jourdain's village", 'description': '', 'neighborhood_overview': '', 'picture_url': 'https://a0.muscache.com/pictures/716553/6c14f251_original.jpg', 'host_id': 333548, 'host_url': 'https://www.airbnb.com/users/show/333548', 'host_name': 'Charlotte', 'host_since': '2011-01-03', 'host_location': 'Paris, France', 'host_about': "My name is Charlotte, I'm 26 and I study cinema production. I'll be gone the whole year to study in Grenoble and that's my father Jean Luc, who will keep an eye on my little studio and take care of you. \n", 'host_response_time': 'N/A', 'host_response_rate': 'N/A', 'host_acceptance_rate': 'N/A', 'host_is_superhost': 'f', 'host_thumbnail_url': 'https://a0.muscache.com/im/users/333548/profile_pic/1350320876/original.jpg?aki_p

In [None]:
# Helper that shows which fields a single document exposes

def peek_columns(col, match=None, proj=None):
    """Print and return the field names of one document from ``col``.

    Parameters
    ----------
    col : object exposing ``find_one(filter, projection)``
    match : optional filter passed to ``find_one``; an empty dict is used when falsy.
    proj : optional projection passed through unchanged.

    Returns
    -------
    list
        Keys of the fetched document without ``"_id"``, or an empty list when
        ``find_one`` returned nothing.
    """
    query = match if match else {}
    sample = col.find_one(query, proj)
    if sample:
        cols = []
        for field in sample.keys():
            if field != "_id":
                cols.append(field)
        print(f"Colonnes renvoyées ({len(cols)}): {cols}")
        return cols
    print("Aucun document pour ce filtre/projection.")
    return []

In [None]:
# Helper converting raw values (numbers or formatted strings) to float

def to_float(x):
    """Convert a raw value to ``float``, or return ``None`` when it cannot be parsed.

    Parameters
    ----------
    x : Any
        ``None`` and ``""`` give ``None``. ``int``/``float`` values go through
        ``float()``. Strings are stripped of every character other than digits,
        ``.`` and ``-`` before parsing, so ``"$1,250.00"`` becomes ``1250.0``.
        Any other type gives ``None``.

    Returns
    -------
    float or None
    """
    if x is None or x == "":
        return None
    if isinstance(x, (int, float)):
        return float(x)
    if not isinstance(x, str):
        return None
    cleaned = re.sub(r"[^\d\.\-]", "", x)
    if cleaned == "":
        # Nothing numeric was left (e.g. "N/A").
        return None
    try:
        return float(cleaned)
    except ValueError:
        # Leftovers such as "1.2.3" or "5-10" are not valid numbers. Only the
        # parse error is caught so unrelated exceptions are not swallowed.
        return None

In [None]:
# Normalise raw superhost flags into readable labels

def norm_superhost(x):
    """Map a raw superhost flag to ``"superhost"``, ``"non_superhost"`` or ``None``.

    ``None`` and ``""`` give ``None``. Any other value is stringified, stripped
    and lower-cased, then looked up among the recognised true-like and
    false-like tokens; unrecognised tokens give ``None``.
    """
    if x is None or x == "":
        return None
    labels = {
        **dict.fromkeys(("t", "true", "1", "yes", "y"), "superhost"),
        **dict.fromkeys(("f", "false", "0", "no", "n"), "non_superhost"),
    }
    token = str(x).strip().lower()
    return labels.get(token)

In [None]:
# Load every document of the collection into a Polars DataFrame
def _flatten_value(value):
    """Serialise dict/list values to JSON text and turn empty strings into None."""
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return None if value == "" else value


rows = (
    {field: _flatten_value(value) for field, value in doc.items()}
    for doc in col.find({}, {"_id": 0})  # "_id" is excluded by the projection
)
df = pl.from_dicts(list(rows), infer_schema_length=10000)


In [None]:
# Show the column names, then a preview of the data (first rows of df)
display(df.columns)
print("Aperçu des données :")
display(df.head())

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'source',
 'name',
 'description',
 'neighborhood_overview',
 'picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30

Aperçu des données :


id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
i64,str,i64,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,null,f64,f64,str,str,i64,f64,str,i64,i64,str,str,i64,i64,i64,i64,i64,i64,f64,f64,null,str,i64,i64,i64,i64,str,i64,i64,i64,str,str,f64,f64,f64,f64,f64,f64,f64,str,str,i64,i64,i64,i64,f64
80260,"""https://www.airbnb.com/rooms/8…",20240610195007,"""2024-06-13""","""previous scrape""","""Nice studio in Jourdain's vill…",,,"""https://a0.muscache.com/pictur…",333548,"""https://www.airbnb.com/users/s…","""Charlotte""","""2011-01-03""","""Paris, France""","""My name is Charlotte, I'm 26 a…","""N/A""","""N/A""","""N/A""","""f""","""https://a0.muscache.com/im/use…","""https://a0.muscache.com/im/use…","""Buttes-Chaumont - Belleville""",1,1,"""['email', 'phone']""","""t""","""t""",,"""Ménilmontant""",,48.87131,2.38848,"""Entire rental unit""","""Entire home/apt""",3,,"""1 bath""",1,,"""[""Hangers"", ""Essentials"", ""Wif…",,2,730,2,2,730,730,2.0,730.0,,"""t""",0,0,0,0,"""2024-06-13""",206,0,0,"""2011-04-18""","""2021-10-03""",4.63,4.61,4.75,4.85,4.78,4.61,4.64,"""7512005340473""","""f""",1,1,0,0,1.29
3109,"""https://www.airbnb.com/rooms/3…",20240610195007,"""2024-06-12""","""city scrape""","""zen and calm""","""Lovely Appartment with one bed…","""Good restaurants<br />very clo…","""https://a0.muscache.com/pictur…",3631,"""https://www.airbnb.com/users/s…","""Anne""","""2008-10-14""","""Paris, France""",,"""N/A""","""N/A""","""80%""","""f""","""https://a0.muscache.com/im/use…","""https://a0.muscache.com/im/use…","""Alésia""",1,2,"""['email', 'phone']""","""t""","""t""","""Neighborhood highlights""","""Observatoire""",,48.83191,2.3187,"""Entire rental unit""","""Entire home/apt""",2,1.0,"""1 bath""",1,0.0,"""[""Smart lock"", ""Smoke alarm"", …","""$250.00""",2,30,2,2,30,30,2.0,30.0,,"""t""",29,45,75,349,"""2024-06-12""",4,0,0,"""2017-10-28""","""2019-10-24""",5.0,5.0,5.0,5.0,5.0,5.0,5.0,"""7511409139079""","""t""",1,1,0,0,0.05
80301,"""https://www.airbnb.com/rooms/8…",20240610195007,"""2024-06-13""","""city scrape""","""toits de Paris""","""On the top,sharing my space,yo…","""SAFE neighborhood,late bus and…","""https://a0.muscache.com/pictur…",433758,"""https://www.airbnb.com/users/s…","""Genevieve""","""2011-03-10""","""Paris, France""",""" Doctorat Sorbonne 1,Litteratu…","""within an hour""","""100%""","""27%""","""t""","""https://a0.muscache.com/im/use…","""https://a0.muscache.com/im/use…","""Port-Royal""",2,2,"""['email', 'phone']""","""t""","""t""","""Neighborhood highlights""","""Panthéon""",,48.83918,2.34699,"""Private room in rental unit""","""Private room""",1,1.5,"""1.5 baths""",1,1.0,"""[""Stove"", ""Host greets you"", ""…","""$60.00""",30,190,30,30,190,190,30.0,190.0,,"""t""",11,12,12,145,"""2024-06-13""",42,1,0,"""2011-06-15""","""2023-09-02""",4.83,4.79,4.57,4.89,5.0,4.95,4.74,"""7510500029543""","""f""",2,1,1,0,0.27
5396,"""https://www.airbnb.com/rooms/5…",20240610195007,"""2024-06-13""","""city scrape""","""Your perfect Paris studio on Î…","""NEW SOFA-BED SINCE JUNE 2023, …","""You are within walking distanc…","""https://a0.muscache.com/pictur…",7903,"""https://www.airbnb.com/users/s…","""Borzou""","""2009-02-14""","""Paris, France""","""We have spent a lot of time tr…","""within an hour""","""100%""","""95%""","""f""","""https://a0.muscache.com/im/use…","""https://a0.muscache.com/im/use…","""Saint-Paul - Ile Saint-Louis""",2,4,"""['email', 'phone']""","""t""","""t""","""Neighborhood highlights""","""Hôtel-de-Ville""",,48.85247,2.35835,"""Entire rental unit""","""Entire home/apt""",2,1.0,"""1 bath""",0,1.0,"""[""Stove"", ""Host greets you"", ""…","""$96.00""",2,1125,1,2,1125,1125,1.0,1125.0,,"""t""",13,36,66,154,"""2024-06-13""",403,64,2,"""2009-06-30""","""2024-05-24""",4.61,4.63,4.58,4.81,4.84,4.96,4.59,"""7510402838018""","""f""",1,1,0,0,2.21
7397,"""https://www.airbnb.com/rooms/7…",20240610195007,"""2024-06-13""","""city scrape""","""MARAIS - 2ROOMS APT - 2/4 PEOP…","""VERY CONVENIENT, WITH THE BEST…",,"""https://a0.muscache.com/pictur…",2626,"""https://www.airbnb.com/users/s…","""Franck""","""2008-08-30""","""Paris, France""","""I am a writer,54, author of no…","""within an hour""","""100%""","""68%""","""t""","""https://a0.muscache.com/im/pic…","""https://a0.muscache.com/im/pic…","""Le Marais""",3,9,"""['email', 'phone']""","""t""","""t""",,"""Hôtel-de-Ville""",,48.85909,2.35315,"""Entire rental unit""","""Entire home/apt""",4,1.0,"""1 bath""",2,1.0,"""[""Stove"", ""Host greets you"", ""…","""$160.00""",10,130,7,12,130,130,10.0,130.0,,"""t""",9,19,31,277,"""2024-06-13""",354,21,1,"""2011-04-08""","""2024-05-20""",4.73,4.8,4.45,4.92,4.89,4.93,4.73,"""7510400829623""","""f""",3,3,0,0,2.21


# requête n°1 : Calculer le taux de réservation moyen par mois par type de logement

# Par type de chambre

In [None]:
# Per room_type: mean of (30 - availability_30) / 30, in percent and rounded to
# two decimals. The notebook uses this ratio as the monthly booking rate.
(
    df.group_by("room_type")
    .agg(
        (
            ((30 - pl.col("availability_30").cast(pl.Float64, strict=False)) / 30).mean()
            * 100
        )
        .round(2)
        .alias("taux_moyen_mensuel_%")
    )
    .sort("taux_moyen_mensuel_%", descending=True)
)


room_type,taux_moyen_mensuel_%
str,f64
"""Entire home/apt""",71.29
"""Private room""",70.29
"""Shared room""",60.72
"""Hotel room""",53.53


# Par type de propriété

In [None]:


# Per property_type: mean of (30 - availability_30) / 30, in percent and rounded
# to two decimals. The notebook uses this ratio as the monthly booking rate.
(
    df.group_by("property_type")
    .agg(
        (
            ((30 - pl.col("availability_30").cast(pl.Float64, strict=False)) / 30).mean()
            * 100
        )
        .round(2)
        .alias("taux_moyen_mensuel_%")
    )
    .sort("taux_moyen_mensuel_%", descending=True)
)


property_type,taux_moyen_mensuel_%
str,f64
"""Entire bungalow""",100.0
"""Shared room in ice dome""",100.0
"""Cave""",100.0
"""Dome""",100.0
"""Private room in cave""",100.0
…,…
"""Shipping container""",3.33
"""Tower""",3.33
"""Entire bed and breakfast""",0.0
"""Barn""",0.0


# requête n°2 : Calculer la médiane du nombre d’avis pour tous les logements

In [None]:
# Median of number_of_reviews over the whole DataFrame, computed once and then
# printed (the original evaluated the same expression twice and discarded the
# first result).
median_reviews = df.select(
    pl.col("number_of_reviews").cast(pl.Float64, strict=False).median()
).item()
print("Médiane du nombre d'avis pour tous les logements:", median_reviews)

Médiane du nombre d'avis pour tous les logements: 3.0


# requête n°3 : Calculer la médiane du nombre d’avis par catégorie d’hôte

In [None]:
print("Médiane du nombre d'avis par catégorie d'hôte :")

# Median of number_of_reviews for each raw host_is_superhost value; the cast to
# Float64 is applied inside the aggregation.
(
    df.group_by("host_is_superhost")
    .agg(
        pl.col("number_of_reviews")
        .cast(pl.Float64, strict=False)
        .median()
        .alias("median_number_of_reviews")
    )
    .sort("median_number_of_reviews", descending=True)
)

Médiane du nombre d'avis par catégorie d'hôte :


host_is_superhost,median_number_of_reviews
str,f64
"""t""",24.0
,12.5
"""f""",2.0


# requête n°4 : Calculer la densité de logements par quartier de Paris

In [None]:
# Number of listings per neighbourhood and their share of all rows, in percent.
# The denominator is the full row count of df, taken before the null filter.
total = df.height
(
    df.filter(pl.col("neighbourhood_cleansed").is_not_null())
    .group_by("neighbourhood_cleansed")
    # pl.len() counts the rows of each group; it replaces pl.count(), which is
    # deprecated since Polars 0.20.5 and raised a warning in this cell.
    .agg(pl.len().alias("nombre_logements"))
    .with_columns((pl.col("nombre_logements") / total * 100).round(2).alias("part_%"))
    .sort("nombre_logements", descending=True)
)


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("nombre_logements"))


neighbourhood_cleansed,nombre_logements,part_%
str,u32,f64
"""Buttes-Montmartre""",10555,11.01
"""Popincourt""",8430,8.79
"""Vaugirard""",7802,8.14
"""Batignolles-Monceau""",6857,7.15
"""Entrepôt""",6558,6.84
…,…,…
"""Élysée""",2898,3.02
"""Hôtel-de-Ville""",2821,2.94
"""Palais-Bourbon""",2740,2.86
"""Luxembourg""",2701,2.82


# requête n°5 : Identifier les quartiers avec le plus fort taux de réservation par mois

In [None]:
# Per neighbourhood: mean of (30 - availability_30) / 30, in percent and rounded
# to two decimals, highest first. Rows with a null neighbourhood or a null rate
# are dropped before grouping.
(
    df.with_columns(
        ((30 - pl.col("availability_30").cast(pl.Float64, strict=False)) / 30).alias("taux")
    )
    .filter(pl.col("neighbourhood_cleansed").is_not_null())
    .filter(pl.col("taux").is_not_null())
    .group_by("neighbourhood_cleansed")
    .agg((pl.col("taux").mean() * 100).round(2).alias("taux_moyen_mensuel_%"))
    .sort("taux_moyen_mensuel_%", descending=True)
)
    

neighbourhood_cleansed,taux_moyen_mensuel_%
str,f64
"""Ménilmontant""",75.42
"""Entrepôt""",74.81
"""Popincourt""",74.78
"""Buttes-Chaumont""",74.13
"""Panthéon""",73.14
…,…
"""Palais-Bourbon""",68.87
"""Bourse""",68.69
"""Luxembourg""",66.45
"""Élysée""",62.45
