In [None]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Columns kept for the analysis: the raw price plus the location, capacity,
# bathroom-description and review fields.
cols = [
    "price",
    "latitude",
    "longitude",
    "accommodates",
    "bathrooms_text",
    "bedrooms",
    "beds",
    "number_of_reviews",
    "review_scores_rating",
]

# Read the gzip-compressed CSV and keep an independent copy restricted to
# the columns listed above.
file_path = "listings.csv.gz"
df = pd.read_csv(file_path, compression="gzip").loc[:, cols].copy()


# Parse the price column into numbers: force text, strip the "$" sign and
# the "," separators, then coerce anything that is still not numeric to NaN.
price_text = df["price"].astype(str)
for symbol in ("$", ","):
    price_text = price_text.str.replace(symbol, "", regex=False)
df["price"] = pd.to_numeric(price_text, errors="coerce")


# Bathrooms
def extract_bath(x):
    """Return the number of bathrooms described by a free-text label.

    Args:
        x: A bathroom description such as "1 bath" or "1.5 shared baths";
            may be missing (NaN/None).

    Returns:
        float: The first number found in the text; 0.5 when the text has
        no number but mentions a half bath (e.g. "Shared half-bath");
        0.0 when the value is missing or nothing can be parsed.
    """
    if pd.isna(x):
        return 0.0
    text = str(x)
    m = re.search(r"\d+(?:\.\d+)?", text)
    if m:
        return float(m.group(0))
    # Labels such as "Half-bath" carry no digit; count them as half a
    # bathroom instead of silently treating the listing as having none.
    if "half" in text.lower():
        return 0.5
    return 0.0


# Replace the free-text bathroom description with a numeric "bathrooms"
# column: take the text column out of the frame, then append the parsed values.
bath_text = df.pop("bathrooms_text")
df["bathrooms"] = bath_text.apply(extract_bath)


# Drop every row that still contains a missing value
df = df.dropna()


# Build the classification target: cut the price into four quantile-based
# bins, labelled 0 (lowest prices) to 3 (highest prices).
quartile_labels = [0, 1, 2, 3]
df["category_price"] = pd.qcut(
    df["price"],
    q=len(quartile_labels),
    labels=quartile_labels,
)

# Show how many listings fall into each class.
class_counts = df["category_price"].value_counts()
print(class_counts)



# Predictor columns fed to the models.
features = [
    "latitude",
    "longitude",
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms",
    "number_of_reviews",
    "review_scores_rating",
]

# Target vector (price class) and feature matrix.
y = df["category_price"]
X = df.loc[:, features]


# =========================
# 5. Train / Test split
# =========================

# Split the raw features first so the held-out rows never influence any
# preprocessing statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


# =========================
# 6. Standardisation
# =========================

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset scaled with the training statistics, kept for code that
# works on all rows at once (e.g. the visualisation below).
X_scaled = scaler.transform(X)


# =========================
# 7. PCA (visualisation)
# =========================

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter the listings in the PC1/PC2 plane, coloured by price class.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=y.astype(int),
    alpha=0.5,
)
ax.set_title("PCA - Projection des logements")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.colorbar(points, ax=ax, label="Classe prix")
plt.show()


# =========================
# 8. LDA
# =========================

# Train a linear discriminant classifier on the training split
# (fit() returns the estimator itself).
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred_lda = lda.predict(X_test)
acc_lda = accuracy_score(y_true=y_test, y_pred=y_pred_lda)
print("Accuracy LDA :", round(acc_lda, 3))



# Random forest with 100 trees and a fixed seed, trained on the training split
# (fit() returns the estimator itself).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Accuracy Random Forest :", round(acc_rf, 3))


# Feature importances from the fitted forest, largest first
importances = pd.Series(rf.feature_importances_, index=features)
importances = importances.sort_values(ascending=False)

# A single print call, with a newline separator, emits the header line
# followed by the importance table.
print("\nImportances :", importances, sep="\n")
