# Charger les données et ajouter une distance au centreVisualisation des clusters sur une carte (Géographique)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sqrt

# Initialiser Spark
spark = SparkSession.builder.appName("AirbnbAnalysis").getOrCreate()

# Charger les données
listings = spark.read.csv("../Data/cleaned/Data_15_Decembre_2023/listings_detailed_cleaned.csv", header=True, inferSchema=True)

# Ajouter une colonne pour la distance au centre-ville (coordonnées exemple : Lyon)
center_lat, center_lon = 45.764043, 4.835659
listings = listings.withColumn(
    "distance_to_center",
    sqrt((col("latitude") - center_lat)**2 + (col("longitude") - center_lon)**2)
)

# Ajouter une classification "centre-ville" ou "périphérie"
listings = listings.withColumn(
    "location_category",
    when(col("distance_to_center") < 0.05, "Centre-Ville").otherwise("Périphérie")
)

# Clusterisation des profils de propriétaires

In [4]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Assembler les colonnes pour la clusterisation
feature_cols = ["distance_to_center", "accommodates", "bedrooms"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
listings_features = assembler.transform(listings)

# Appliquer KMeans
kmeans = KMeans(k=4, seed=42)  # 4 clusters
model = kmeans.fit(listings_features)
listings_clustered = model.transform(listings_features)

# Ajouter les clusters au DataFrame
listings_clustered.select("id", "location_category", "prediction").show()


24/12/20 15:29:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+-------------------+-----------------+----------+
|                 id|location_category|prediction|
+-------------------+-----------------+----------+
|           13652635|     Centre-Ville|         1|
|           42535147|     Centre-Ville|         3|
|           43736244|     Centre-Ville|         1|
|           30554074|     Centre-Ville|         3|
|            9474252|     Centre-Ville|         1|
|           35566339|     Centre-Ville|         3|
|           25907692|     Centre-Ville|         3|
|           22070354|     Centre-Ville|         1|
|           13368057|     Centre-Ville|         0|
| 659360798118676837|     Centre-Ville|         1|
|           21627041|     Centre-Ville|         3|
|1038321652893112316|     Centre-Ville|         1|
|           52204459|     Centre-Ville|         3|
|1028705317405886579|     Centre-Ville|         3|
|           19059910|     Centre-Ville|         1|
| 954491709455745451|     Centre-Ville|         3|
| 888929963583097864|     Centr

# Visualisation statistique des clusters

In [6]:
import folium
from pyspark.sql.functions import collect_list

# Convertir les clusters Spark en Pandas pour Folium
listings_clustered_pd = listings_clustered.select("latitude", "longitude", "prediction").toPandas()

# Créer une carte centrée sur Lyon
m = folium.Map(location=[45.764043, 4.835659], zoom_start=12)

# Ajouter des marqueurs pour chaque cluster
colors = ['red', 'blue', 'green', 'purple']  # Une couleur par cluster
for _, row in listings_clustered_pd.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color=colors[int(row["prediction"]) % len(colors)],  # Conversion en entier
        fill=True,
        fill_color=colors[int(row["prediction"]) % len(colors)],  # Conversion en entier
        fill_opacity=0.7,
        popup=f"Cluster: {int(row['prediction'])}"  # Afficher l'entier dans le popup
    ).add_to(m)

# Sauvegarder ou afficher la carte
m.save("cluster_map.html")

## Diaggramme en boîte

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# Convertir les données en Pandas pour Seaborn
listings_stats_pd = listings_clustered.select("distance_to_center", "accommodates", "price", "prediction").toPandas()

# Diagramme en boîte des prix par cluster
plt.figure(figsize=(10, 6))
sns.boxplot(data=listings_stats_pd, x="prediction", y="price", palette="Set3")
plt.title("Distribution des prix par cluster")
plt.xlabel("Cluster")
plt.ylabel("Prix")
plt.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `price` cannot be resolved. Did you mean one of the following? [`id`, `name`, `source`, `beds`, `latitude`].;
'Project [distance_to_center#53, accommodates#30, 'price, prediction#230]
+- Project [id#17L, listing_url#18, scrape_id#19L, last_scraped#20, source#21, name#22, picture_url#23, host_id#24, neighbourhood_cleansed#25, latitude#26, longitude#27, property_type#28, room_type#29, accommodates#30, bathrooms_text#31, beds#32, availability_365#33, bedrooms#34, distance_to_center#53, location_category#74, features#97, UDF(features#97) AS prediction#230]
   +- Project [id#17L, listing_url#18, scrape_id#19L, last_scraped#20, source#21, name#22, picture_url#23, host_id#24, neighbourhood_cleansed#25, latitude#26, longitude#27, property_type#28, room_type#29, accommodates#30, bathrooms_text#31, beds#32, availability_365#33, bedrooms#34, distance_to_center#53, location_category#74, UDF(struct(distance_to_center, distance_to_center#53, accommodates_double_VectorAssembler_8b099927c0e5, cast(accommodates#30 as double), bedrooms, bedrooms#34)) AS features#97]
      +- Project [id#17L, listing_url#18, scrape_id#19L, last_scraped#20, source#21, name#22, picture_url#23, host_id#24, neighbourhood_cleansed#25, latitude#26, longitude#27, property_type#28, room_type#29, accommodates#30, bathrooms_text#31, beds#32, availability_365#33, bedrooms#34, distance_to_center#53, CASE WHEN (distance_to_center#53 < 0.05) THEN Centre-Ville ELSE Périphérie END AS location_category#74]
         +- Project [id#17L, listing_url#18, scrape_id#19L, last_scraped#20, source#21, name#22, picture_url#23, host_id#24, neighbourhood_cleansed#25, latitude#26, longitude#27, property_type#28, room_type#29, accommodates#30, bathrooms_text#31, beds#32, availability_365#33, bedrooms#34, SQRT((POWER((latitude#26 - 45.764043), cast(2 as double)) + POWER((longitude#27 - 4.835659), cast(2 as double)))) AS distance_to_center#53]
            +- Relation [id#17L,listing_url#18,scrape_id#19L,last_scraped#20,source#21,name#22,picture_url#23,host_id#24,neighbourhood_cleansed#25,latitude#26,longitude#27,property_type#28,room_type#29,accommodates#30,bathrooms_text#31,beds#32,availability_365#33,bedrooms#34] csv


## Moyennes des clusters

In [None]:
cluster_stats = listings_clustered.groupBy("prediction").agg(
    avg("distance_to_center").alias("avg_distance_to_center"),
    avg("accommodates").alias("avg_accommodates"),
    avg("price").alias("avg_price")
)
cluster_stats.show()

# Cartographie avec clusters et événements


In [None]:
# Convertir les événements en Pandas
events_pd = events.select("latitude", "longitude", "name", "date").toPandas()

# Ajouter des marqueurs pour les événements
for _, event in events_pd.iterrows():
    folium.Marker(
        location=[event["latitude"], event["longitude"]],
        popup=f"{event['name']} ({event['date']})",
        icon=folium.Icon(color="orange", icon="info-sign")
    ).add_to(m)

# Sauvegarder ou afficher la carte enrichie
m.save("cluster_with_events_map.html")


# Analyse interactive (Slider pour prix ou période)

In [None]:
import plotly.express as px

# Tracer une carte interactive avec les clusters
fig = px.scatter_mapbox(
    listings_clustered_pd,
    lat="latitude",
    lon="longitude",
    color="prediction",
    size="price",
    mapbox_style="carto-positron",
    title="Clusters de logements Airbnb"
)
fig.show()