In [1]:
import json
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from area import area
from scipy.stats import rayleigh

In [2]:
# Select 'notebook_connected' as the default Plotly renderer for every fig.show() call below.
pio.renderers.default = 'notebook_connected'

The data comes from Inside Airbnb; we focus on the city of Paris (snapshot of 2022-06-06).

In [2]:
# Download the Paris snapshot (dated 2022-06-06 in the URL) once and cache both tables
# in a local pickle. The snapshot URL is dated, so the cached copy does not go stale;
# skipping the download when the cache exists keeps "Restart & Run All" fast.
# Delete data.pickle to force a fresh download.
SNAPSHOT_URL = "http://data.insideairbnb.com/france/ile-de-france/paris/2022-06-06/data"
CACHE_PATH = Path("data.pickle")

if not CACHE_PATH.exists():
    listing = pd.read_csv(f"{SNAPSHOT_URL}/listings.csv.gz")
    calendar = pd.read_csv(f"{SNAPSHOT_URL}/calendar.csv.gz")
    datasets = [listing, calendar]
    with CACHE_PATH.open("wb") as cache_file:
        pickle.dump(datasets, cache_file)

In [2]:
# Reload both tables from the local cache file.
# NOTE: pickle.load can execute arbitrary code; only load a data.pickle that this
# notebook wrote itself, never one obtained from an untrusted source.
with open("data.pickle", "rb") as cache_file:
    listing, calendar = pickle.load(cache_file)

# Neighbourhood polygons; the context manager closes the file handle, which the
# previous json.load(open(...)) form left open.
with open("neighbourhoods.geojson", mode="r", encoding="utf-8") as geojson_file:
    quartierGeo = json.load(geojson_file)

In [5]:
# Preview name, price and capacity for the first five listings with accommodates == 2.
listing.loc[listing["accommodates"] == 2, ["name", "price", "accommodates"]].head(5)

Unnamed: 0,name,price,accommodates
1,Cosy and quiet appartement at Les Lilas,$59.00,2
3,"Charming studio, Pont de Neuilly - Paris",$65.00,2
4,A cosy appartement,$51.00,2
5,"A deux pas du canal de l'Ourcq, à Pantin",$70.00,2
6,Private room in a beautiful duplex next to Paris,$18.00,2


In [4]:
# Print the (rows, columns) shape of listing; the `=` specifier also echoes the expression text.
print(f"{listing.shape = }")

listing.shape = (56739, 74)


In [3]:
# Cleaning
# Missing bedroom counts are filled with 0.
listing["bedrooms"] = listing["bedrooms"].fillna(0)
# A capacity of 0 is replaced by NaN so it is treated as missing.
listing["accommodates"] = listing["accommodates"].replace(0, np.nan)
# Prices arrive as text such as "$1,200.00": strip "$" and ",", then truncate to an int.
# Going through astype(str) keeps this cell re-runnable: once the column is already
# numeric, the previous per-value x.replace(...) raised AttributeError on a second run.
# A missing price still raises here (NaN cannot be cast to int), as it did before.
listing["price"] = (
    listing["price"]
    .astype(str)
    .str.replace(r"[$,]", "", regex=True)
    .astype(float)
    .astype(int)
)

In [28]:
# Substrings that disqualify a column from the trimmed-down frame.
banwords = ["host", "availability", "review", "night"]


def _sansMotBanni(nomColonne):
    """Return True when the column name contains none of the banned substrings."""
    return not any(mot in nomColonne for mot in banwords)


# Keep the surviving columns in their original order.
newcol = list(filter(_sansMotBanni, listing.columns))
df = listing[newcol]

In [10]:
# Display the column names of df (the listing columns without any banned substring).
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'calendar_updated', 'calendar_last_scraped', 'license',
       'instant_bookable'],
      dtype='object')

In [5]:
def select_col(colname, size=6, frame=None):
    """Count listings per distinct value of a column, keeping only the small values.

    Args:
        colname: Name of the column to group on.
        size: Inclusive upper bound; rows whose `colname` value exceeds it are dropped.
        frame: DataFrame to count from. Defaults to the notebook-level `listing`
            frame, which is what this function always used before the parameter existed.

    Returns:
        A DataFrame with two columns: `colname` and "number of listings".
    """
    # Resolve the default lazily so the global is only needed when no frame is given.
    source = listing if frame is None else frame
    counts = source.groupby([colname])[colname].count().reset_index(name="number of listings")
    return counts.loc[counts[colname] <= size]

In [30]:
# Inspect the first GeoJSON feature: its top-level keys, then its properties.
premiereFeature = quartierGeo["features"][0]
print(premiereFeature.keys())
print(premiereFeature["properties"])

dict_keys(['type', 'geometry', 'properties'])
{'neighbourhood': 'Batignolles-Monceau', 'neighbourhood_group': None}


In [6]:
# Maps each neighbourhood name to its arrondissement label.
# The two tuples are aligned position by position.
nomToNumero = dict(zip(
    (
        "Louvre", "Bourse", "Temple", "Hôtel-de-Ville", "Panthéon",
        "Luxembourg", "Palais-Bourbon", "Élysée", "Opéra", "Entrepôt",
        "Popincourt", "Reuilly", "Gobelins", "Observatoire", "Vaugirard",
        "Passy", "Batignolles-Monceau", "Buttes-Montmartre", "Buttes-Chaumont", "Ménilmontant",
    ),
    (
        "1er", "2eme", "3eme", "4eme", "5eme",
        "6eme", "7eme", "8eme", "9eme", "10eme",
        "11eme", "12eme", "13eme", "14eme", "15eme",
        "16eme", "17eme", "18eme", "19eme", "20eme",
    ),
))
def _libelleArrondissement(quartier):
    """Build the '<neighbourhood> - <arrondissement label>' string for one neighbourhood name."""
    return quartier + " - " + nomToNumero[quartier]


# Label every listing with its neighbourhood name followed by the arrondissement label.
listing["Arrondissement"] = listing["neighbourhood_cleansed"].map(_libelleArrondissement)

In [7]:
# Area of every neighbourhood polygon, rounded to a whole number
# (treated as m² below, given the division by 10000 to get hectares).
neighbourhoodArea = {
    quartier["properties"]["neighbourhood"]: round(area(quartier["geometry"]))
    for quartier in quartierGeo["features"]
}
# Remove the area of the Bois de Vincennes, which is part of the 12th arrondissement.
# NOTE(review): Passy (16eme) appears to contain the Bois de Boulogne in the same way,
# and nothing is subtracted for it — confirm whether that is intended.
neighbourhoodArea["Reuilly"] -= 9950000

# Number of listings per neighbourhood.
comptageParQuartier = listing.groupby(["neighbourhood_cleansed"])["neighbourhood_cleansed"].count()
mapData = pd.DataFrame(comptageParQuartier.reset_index(name="#logements"))
mapData["superficie"] = [neighbourhoodArea[nom] for nom in mapData["neighbourhood_cleansed"]]

# Listing density: listings per hectare, rounded to 2 decimals.
hectares = mapData["superficie"] / 10000
mapData["#logements/hectares"] = (mapData["#logements"] / hectares).round(2)

# Rename the key column so it matches the GeoJSON 'neighbourhood' property.
mapData = mapData.rename(columns={"neighbourhood_cleansed": "neighbourhood"})
mapData["Arrondissement"] = [nomToNumero[nom] for nom in mapData["neighbourhood"]]

In [33]:
# Preview the first three rows of the density table.
mapData.head(3)

Unnamed: 0,neighbourhood,#logements,superficie,#logements/hectares,Arrondissement
0,Batignolles-Monceau,4173,5664741,7.37,17eme
1,Bourse,1826,990459,18.44,2eme
2,Buttes-Chaumont,3346,6787707,4.93,19eme


In [8]:
# Choropleth of listings per hectare, one polygon per neighbourhood.
centreCarte = {"lat": 48.86, "lon": 2.35}
fig = px.choropleth_mapbox(
    mapData,
    geojson=quartierGeo,
    featureidkey="properties.neighbourhood",
    locations="neighbourhood",
    color="#logements/hectares",
    hover_data=["Arrondissement", "#logements/hectares", "#logements", "superficie"],
    mapbox_style="carto-positron",
    center=centreCarte,
    zoom=10.5,
    opacity=0.5,
)
# Remove the outer margins so the map fills the output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [17]:
# Map of the mean listing price per neighbourhood.
# This cell no longer reassigns mapData: the re-count that used to open it was never
# used here and replaced the density table with a bare count frame lacking
# 'superficie', '#logements/hectares' and 'Arrondissement', which broke any re-run
# of the density map cell.
mapPrix = (
    listing.groupby(["neighbourhood_cleansed"])["price"]
    .mean()
    .reset_index(name="prix_moyen")
    .rename(columns={"neighbourhood_cleansed": "quartier"})
)
fig = px.choropleth_mapbox(
    mapPrix,
    geojson=quartierGeo,
    color="prix_moyen",
    locations="quartier",
    featureidkey="properties.neighbourhood",
    mapbox_style="carto-positron",
    center={"lat": 48.86, "lon": 2.35},
    zoom=10.5,
    opacity=0.5,
)
# Remove the outer margins so the map fills the output area.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [7]:
# Display the properties dict of the first GeoJSON feature.
quartierGeo["features"][0]['properties']

{'neighbourhood': 'Batignolles-Monceau', 'neighbourhood_group': None}

In [9]:
# Pie chart of listings by room type, most frequent type first.
comptageParType = listing.groupby(["room_type"])["room_type"].count()
typeDeLogement = comptageParType.reset_index(name="#logements")
typeDeLogement = typeDeLogement.sort_values(by=["#logements"], ascending=False)
fig = px.pie(
    typeDeLogement,
    names="room_type",
    values="#logements",
    color_discrete_sequence=px.colors.sequential.Turbo,
)
fig.show()

Est-ce que prendre un Airbnb à plusieurs est rentable ?

In [12]:
# Mean price per bedroom, grouped by bedroom count.
# Listings with 0 bedrooms cannot be divided by their bedroom count, so their price is
# divided by 0.95 instead (about +5.3 %).
# NOTE(review): the original comment announced a +20 % uplift, which would need a
# divisor of 1/1.2 (about 0.83) — confirm which one was intended.
diviseurSansChambre = 0.95
# Substitute the divisor on a temporary Series instead of writing a 0.95 sentinel into
# listing["bedrooms"] and restoring it by float equality afterwards: the shared frame
# is never left half-modified if this cell is interrupted.
listing["prixParChambre"] = listing["price"] / listing["bedrooms"].replace(0, diviseurSansChambre)

pPC = (
    listing.groupby("bedrooms")["prixParChambre"]
    .mean()
    .reset_index(name="prixParChambre")
    .rename(columns={"bedrooms": "chambres"})
)
pPC["prixParChambre"] = pPC["prixParChambre"].round(2)
# Bedroom counts are stored as floats; show them as whole numbers on the axis.
pPC["chambres"] = pPC["chambres"].round().astype(int)
# Bar label: price rounded to a whole number of euros.
pPC["Prix par chambre"] = pPC["prixParChambre"].map(lambda prix: f"{round(prix)}€")
# Only listings with at most 4 bedrooms are charted.
pPC = pPC.loc[pPC["chambres"] <= 4]

fig = px.bar(pPC, x='chambres', y='prixParChambre', text='Prix par chambre', hover_data={'prixParChambre':False})
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(0,0,0)',
                  marker_line_width=2, opacity=0.6)
fig.show()

In [6]:
# TODO: build a per-neighbourhood price distribution in Dash, driven by a callback.

# The price distribution is too spread out to read, so only listings priced at
# prixMax or less are plotted.
prixMax = 320
prixPlot = listing.loc[listing["price"] <= prixMax]
fig = px.histogram(prixPlot, x="price", nbins=60)

# Read back the x-bin start / end / size from the fully resolved figure, then recompute
# the counts with NumPy on those same edges.
figureComplete = fig.full_figure_for_development(warn=False)
xbins = figureComplete.data[0].xbins
plotbins = list(np.arange(start=xbins["start"], stop=xbins["end"] + xbins["size"], step=xbins["size"]))
counts, bins = np.histogram(list(figureComplete.data[0].x), bins=plotbins)
print(counts, bins)
fig.show()

[  35   98  767 2009 3425 4411 4880 4754 4496 4228 3323 2265 2541 1707
 1590 1811 1096  887  981  853 1053  536  612  489  473  738  356  322
  370  332  482  195   63] [ -0.5   9.5  19.5  29.5  39.5  49.5  59.5  69.5  79.5  89.5  99.5 109.5
 119.5 129.5 139.5 149.5 159.5 169.5 179.5 189.5 199.5 209.5 219.5 229.5
 239.5 249.5 259.5 269.5 279.5 289.5 299.5 309.5 319.5 329.5]


In [23]:
# Display the (rows, columns) shape of listing.
listing.shape

(56739, 75)

In [35]:
# Share of listings whose price is 500 or more.
nbAuDessus500 = int((listing["price"] >= 500).sum())
partAuDessus500 = nbAuDessus500 / listing.shape[0]
# Scale to percent first and round last: multiplying an already-rounded ratio by 100
# reintroduces float noise (for example 0.07 * 100 == 7.000000000000001).
print(f'Pourcentage de logement au dessus de 500€ : {round(partAuDessus500 * 100, 2)}%')

Pourcentage de logement au dessus de 500€ : 3.66%


In [None]:
# TODO: plot the number of <metric still to be chosen> over time, using the calendar dataset

In [None]:
# TODO: bar chart of mean price per night / listing density, per neighbourhood

In [None]:
# TODO: map of the review scores (cleanliness, check-in, ...) per neighbourhood