In [None]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report

# VII Clean for ML on apartment rent

Features used to predict apartment rent:
- Number of bedrooms
- "McMeal at McDonalds (or Equivalent Combo Meal) ": "McDonalds",
- "Water (1.5 liter bottle) ": "Water",
- "Fitness Club, Monthly Fee for 1 Adult ": "Fitness_Club",
- "Average Monthly Net Salary (After Tax) ": "Salary"

In [None]:
# Load the cleaned cost-of-living table; this cell uses its Country, Item and Price columns.
df = pd.read_csv("clean_cost_of_living.csv")

# Coerce prices to numbers BEFORE aggregating: a mean over a non-numeric
# (object) column raises or gives meaningless results, so coercing afterwards
# comes too late. Unparseable prices become NaN and are ignored by the mean.
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")
df = df.groupby(["Country", "Item"], as_index=False).agg({"Price": "mean"})

# Wide table: one row per country, one column per item.
# The pivoted columns are already floats (they come from the mean above),
# so no per-column re-coercion is needed.
extras_pivot = df.pivot(index="Country", columns="Item", values="Price").reset_index()

# Keep only the numeric columns (this leaves out the "Country" label column).
numeric_df = extras_pivot.select_dtypes(include=["number"])

# 1. Pairwise correlation between items.
corr_matrix = numeric_df.corr()

# 2. Blank out (NaN) every coefficient below 0.4.
#    NOTE(review): this also hides strong negative correlations — confirm that is intended.
filtered_corr = corr_matrix.where(corr_matrix >= 0.4)

# 3. Plot. np.triu marks the upper triangle *including the diagonal*, so only
#    the lower triangle (each pair once, no self-correlation) is drawn.
plt.figure(figsize=(25, 25))
mask = np.triu(np.ones_like(filtered_corr, dtype=bool))
sns.heatmap(
    filtered_corr,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    mask=mask,
)
plt.title("Matrice de corrélation (>= 0.4 uniquement)")
plt.show()

In [None]:
# Count the missing values in each column of the pivoted table.
missing_per_column = extras_pivot.isna().sum()
print(missing_per_column)

In [None]:
# Reload the cleaned table and keep only the items used for the rent model.
df = pd.read_csv("clean_cost_of_living.csv")

# A single alternation replaces five chained str.contains calls.
# The dot in "1.5" is escaped so it matches a literal "." rather than any character.
item_pattern = "|".join([
    r"Apartment \(",
    r"Water \(1\.5 liter bottle\)",
    r"McDonalds",
    r"Average Monthly Net Salary \(After Tax\)",
    r"Fitness Club, Monthly Fee for 1 Adult",
])

# na=False: a row with a missing Item counts as "no match" instead of putting
# NaN in the boolean mask (such rows would be removed by dropna() anyway).
df = df[df["Item"].str.contains(item_pattern, na=False)]

# Drop every remaining row that has a missing value in any column.
df = df.dropna()


In [None]:
# Show the filtered DataFrame as the cell's output (bare last expression).
df

In [None]:
# Save the filtered rows; QUOTE_NONNUMERIC puts quotes around every non-numeric field.
df.to_csv(
    "clean_apartment.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

In [None]:
# Quick inspection of the filtered data: first rows, schema, summary statistics.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print(df.describe())

In [None]:
# Step 2: isolate the non-rent items (McMeal, water, fitness club, average salary).
extras = df[df["Item"].str.contains("McMeal|Water|Fitness|Average")].copy()

# Step 3: reshape to one row per country and one column per item.
# pivot_table averages repeated (Country, Item) pairs, whereas a plain pivot
# raises "Index contains duplicate entries" on them; when every pair is unique
# the resulting values are the same.
extras_pivot = extras.pivot_table(
    index="Country", columns="Item", values="Price", aggfunc="mean"
).reset_index()

# Give the pivoted columns short names (the keys below end with a trailing
# space, exactly as written in the original mapping).
extras_pivot = extras_pivot.rename(columns={
    "McMeal at McDonalds (or Equivalent Combo Meal) ": "McDonalds",
    "Water (1.5 liter bottle) ": "Water",
    "Fitness Club, Monthly Fee for 1 Adult ": "Fitness_Club",
    "Average Monthly Net Salary (After Tax) ": "Salary"
})

# Step 4: keep the apartment rows only.
apartments = df[df["Item"].str.contains("Apartment")].copy()

# City_Centre: 1 when the item label contains "in City Centre", else 0.
# Bedrooms: first run of digits found in the item label.
# Both are stored as fixed-width int64 so the dtype does not depend on the
# platform's default int size (the later outlier step selects float64/int64 only).
apartments["City_Centre"] = (
    apartments["Item"].str.contains("in City Centre", regex=False).astype("int64")
)
apartments["Bedrooms"] = (
    apartments["Item"].str.extract(r"(\d+)", expand=False).astype("int64")
)

# Step 5: keep the main columns for the housing rows.
apartments = apartments[["Country", "City_Centre", "Bedrooms", "Price"]]

# Step 6: attach the per-country extras to every apartment row.
# how="left" keeps apartment rows whose country has no extras (they get NaN).
final_result = apartments.merge(extras_pivot, on="Country", how="left")

In [None]:
# Drop merged rows with any missing value, then save the modelling table.
df = final_result.dropna()
df.to_csv("data_ml.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)
# print(df.head())
# print(df.info())
# print(df.describe())

In [None]:
# Price distribution split by the City_Centre flag.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="City_Centre", y="Price", ax=ax)
ax.set_title("Prix en fonction de la localisation (centre-ville ou non)", fontsize=16)
ax.set_xlabel("Centre-ville (0 = Non, 1 = Oui)", fontsize=12)
ax.set_ylabel("Prix")
plt.show()

In [None]:
# Columns eligible for the IQR screen: float64 / int64 dtypes only.
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns

# Per-column summary (IQR, bounds, offending values), keyed by column name.
outliers_info = {}

# "Price" is left out of the screen.
for column in (name for name in numeric_columns if name != "Price"):
    values = df[column]

    # Quartiles and interquartile range of the column.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    # A value counts as an outlier when it lies more than 3 * IQR outside the quartiles.
    low = q1 - 3 * spread
    high = q3 + 3 * spread
    outside = (values < low) | (values > high)

    outliers_info[column] = {
        "IQR": spread,
        "Lower Bound": low,
        "Upper Bound": high,
        "Outliers": values[outside].tolist(),
    }

# Report: one four-line section per screened column.
for column, info in outliers_info.items():
    report_lines = [
        f"\n**Colonne : {column}**",
        f"IQR : {info['IQR']}",
        f"Limite inférieure : {info['Lower Bound']} / Limite supérieure : {info['Upper Bound']}",
        f"Valeurs aberrantes : {info['Outliers']}",
    ]
    print("\n".join(report_lines))


In [None]:
print("Nombre de lignes avant nettoyage :", len(df))

# Build one combined row mask: a row is kept only if every screened column
# lies within its [lower, upper] bounds (both ends inclusive).
within_bounds = pd.Series(True, index=df.index)
for name, bounds in outliers_info.items():
    within_bounds &= df[name].between(bounds['Lower Bound'], bounds['Upper Bound'])
df = df[within_bounds]

# Row count after the removal, then save the cleaned table.
print("Nombre de lignes après nettoyage :", len(df))
df.to_csv("data_ml_clean.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)


In [None]:
# Same price-by-location boxplot, drawn from the current contents of df.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="City_Centre", y="Price", ax=ax)
ax.set_title("Prix en fonction de la localisation (centre-ville ou non)", fontsize=16)
ax.set_xlabel("Centre-ville (0 = Non, 1 = Oui)", fontsize=12)
ax.set_ylabel("Prix")
plt.show()