In [None]:
import pandas as pd

# Dataset preparation: read the gzip-compressed CSV into a DataFrame.
path = '../data/paris_dataset_final_ready.csv.gz'
df = pd.read_csv(path, compression='gzip')

# Report success and the (rows, columns) shape with a single print call.
print("Dataset chargé avec succès.", f"Dimensions : {df.shape}", sep="\n")


Dataset chargé avec succès.
Dimensions : (73111, 30)


# 0 — Sélection des variables numériques et standardisation

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Names of every numeric column in the loaded dataset.
num_cols = list(df.select_dtypes(include=[np.number]).columns)

# Independent copy, so the imputation below never modifies `df`.
var_num = df[num_cols].copy()

# Median imputation, applied only to the columns that contain missing values.
na_cols = var_num.columns[var_num.isna().any()]
for col in na_cols:
    series = var_num[col]
    var_num[col] = series.fillna(series.median())

# Standardization: fit the scaler on the imputed data, then transform that same data.
scaler = StandardScaler().fit(var_num)
var_num_scaled = scaler.transform(var_num)
# Wrap the scaled array back into a DataFrame aligned with the original rows.
var_num_scaled_df = pd.DataFrame(var_num_scaled, index=df.index, columns=num_cols)

# Preview of the standardized features.
print("First 5 rows:", var_num_scaled_df.head(), sep="\n")


 (5 premières lignes des variables numériques standardisées) :
   accommodates  bedrooms      beds  minimum_nights_avg_ntm  \
0     -0.814720 -0.353928 -1.220511               -0.211579   
1     -1.408411 -0.353928 -0.440149                0.515598   
2     -0.814720 -1.427401 -0.440149               -0.237549   
3      0.372662  0.719544 -0.440149               -0.003814   
4     -1.408411 -0.353928 -0.440149               -0.159638   

   number_of_reviews  price_clean  dist_to_center  bathrooms_qty    has_ac  \
0          -0.325300     0.015515        0.319210      -0.359003 -0.413491   
1           0.347356    -0.767846       -0.974245       0.574525 -0.413491   
2           6.737593    -0.619420       -1.568968      -0.359003 -0.413491   
3           5.870220    -0.355551       -1.830771      -0.359003 -0.413491   
4           4.524907    -0.817321       -0.110272      -0.359003  2.418430   

   has_elevator  ...  has_washing_machine  standing_thermal  \
0     -0.664876  ...      

# 1 — Scree Plot : Variance expliquée

Cette section affiche la proportion de variance expliquée par chaque composante principale et la variance cumulée.

*Objectif : déterminer combien de composantes garder (ex. 80% / 90% de variance).*

In [None]:
#scree plot — explained variance percentage (cumulative + per component)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Scree plot: percentage of variance explained by each principal component,
# together with the cumulative percentage.
#
# The PCA is fitted on `var_num_scaled_df`, the standardized numeric features
# built in the standardization cell. The previous version fitted on
# `X_scaled_df`, a name that no cell above defines, and raised a NameError.

# 26 components are requested, capped at what the data can support
# (PCA requires n_components <= min(n_samples, n_features)).
n_comp = min(26, *var_num_scaled_df.shape)
print(f"Scree: local PCA calculation with {n_comp} components")
_pca_scree = PCA(n_components=n_comp)
_pca_scree.fit(var_num_scaled_df)

# Express the explained variance as percentages.
explained = _pca_scree.explained_variance_ratio_ * 100
cum = explained.cumsum()
pc_numbers = range(1, len(explained) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(pc_numbers, explained, alpha=0.6, label='Percentage per PC')
ax.plot(pc_numbers, cum, marker='o', color='red', label='Cumulative variance (%)')
ax.set_xlabel('Principal component (PC)')
ax.set_ylabel('Percentage of explained variance (%)')
ax.set_title('Scree Plot — Explained variance per component (%)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Number of components needed to reach 80% and 90% cumulative variance.
# argmax() on a boolean array gives the position of the first True; when the
# threshold is never reached, every fitted component is reported instead.
reached_80 = cum >= 80
reached_90 = cum >= 90
n80 = reached_80.argmax() + 1 if reached_80.any() else len(cum)
n90 = reached_90.argmax() + 1 if reached_90.any() else len(cum)

print(f"Components needed for 80% variance: {n80}")
print(f"Components needed for 90% variance: {n90}")


Scree: calcul PCA local avec 26 composantes


NameError: name 'X_scaled_df' is not defined

Le graphique des valeurs propres montre une décroissance progressive et logarithmique de la variance expliquée, sans apparition d’un coude marqué. Cela indique que la variance est expliquée de manière diffuse entre les différentes composantes principales. Par conséquent, le choix du nombre de composantes retenues repose sur le pourcentage de variance cumulée expliquée : dans notre cas, on choisit de s’arrêter à 80 %, soit 13 PC pour 26 variables.

# 2 — Valeurs propres (Eigenvalues)

Ici on affiche les valeurs propres et un barplot des eigenvalues pour voir la distribution de la variance en valeurs (scree en valeurs).

*Objectif : identifier les composantes ayant une variance significative.*

In [None]:
#simple eigenvalue study (13 pcs)
from sklearn.decomposition import PCA
import pandas as pd

# Eigenvalue study for the retained components (13 PCs).
#
# The PCA is fitted on `var_num_scaled_df`, the standardized numeric features
# built in the standardization cell. The previous version fitted on
# `X_scaled_df`, a name that no cell above defines.

# 13 components are requested, capped at what the data can support
# (PCA requires n_components <= min(n_samples, n_features)).
n_pc = min(13, *var_num_scaled_df.shape)
# random_state only matters if scikit-learn selects a randomized SVD solver;
# pinning it keeps reruns reproducible in that case.
pca13 = PCA(n_components=n_pc, random_state=0)
pca13.fit(var_num_scaled_df)

eigenvalues = pca13.explained_variance_
ratio_pct = pca13.explained_variance_ratio_ * 100
explained_pct = ratio_pct.round(3)
# Accumulate the unrounded percentages and round once at the end, so that
# per-component rounding errors do not add up in the cumulative column.
cumulative_pct = ratio_pct.cumsum().round(3)

# One row per component: eigenvalue, explained % and cumulative %.
ev13 = pd.DataFrame({
    'eigenvalue': eigenvalues.round(4),
    'explained_pct': explained_pct,
    'cumulative_pct': cumulative_pct
}, index=[f'PC{i+1}' for i in range(n_pc)])

print(f'\n--- Eigenvalues ({n_pc} PCs) ---')
print(ev13)

# Simple save of the table next to the input data.
ev13_path = '../data/pca_ev_13_simple.csv'
ev13.to_csv(ev13_path)
print(f'\nSaved: {ev13_path}')


In [None]:
#loadings for pc1 — barplot of signed contributions and top features (over 13 pcs)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Signed PC1 loadings: bar plot and ranking of the features that contribute most.
#
# Reuse the components of `pca13` (the 13-PC model from the eigenvalue cell)
# when it is available and fitted. The previous version read `_pca`, a name
# that no cell above defines, and hid the resulting error behind a blanket
# `except Exception`; its fallback also used the undefined `X_scaled_df`.
try:
    components = pca13.components_
except (NameError, AttributeError):
    # `pca13` is missing (NameError) or was created but never fitted
    # (AttributeError): recompute a 13-PC model on the standardized features.
    _pca_fallback = PCA(n_components=min(13, *var_num_scaled_df.shape), random_state=0)
    _pca_fallback.fit(var_num_scaled_df)
    components = _pca_fallback.components_

# Loadings table: one row per feature, one column per principal component.
pc_cols = [f'PC{i+1}' for i in range(components.shape[0])]
loadings_df = pd.DataFrame(components.T, index=var_num_scaled_df.columns, columns=pc_cols)

# PC1 loadings.
pc1 = loadings_df['PC1']

# Rank the features by absolute loading and keep the top n.
top_n = 25
pc1_abs_top = pc1.abs().sort_values(ascending=False).head(top_n)
top_feats = pc1_abs_top.index
# Signed values of those features, sorted ascending for the horizontal bars.
vals = pc1.loc[top_feats].sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(8, 10))
ax.barh(vals.index, vals.values, color='C1', alpha=0.8)
ax.set_xlabel('Loading (PC1) — signed contribution')
ax.set_title(f'Signed loadings for PC1 — top {top_n} features')
fig.tight_layout()
plt.show()

print(f"\nTop {top_n} features by absolute contribution to PC1:")
print(pc1_abs_top)
