In [9]:
# ======================================================================
# 1. IMPORTATION DES LIBRAIRIES
# ======================================================================
import pandas as pd
import numpy as np


# ======================================================================
# 2. CHARGEMENT DU JEU DE DONNÉES
# ======================================================================
DATA_PATH = 'Data/row/housing_sales_ma_.csv'
print('rarr')
df = pd.read_csv(DATA_PATH)
df_clean = df.copy()


# ======================================================================
# 3. NETTOYAGE INITIAL
# ======================================================================

# Suppression des espaces inutiles dans l’adresse
df_clean['address'] = df_clean['address'].astype(str).str.strip()

# Conversion ft² → m²
df_clean['surface'] = (df_clean['surface'] / 10.7639).round(2)

print("Valeurs manquantes avant nettoyage :")
print(df_clean.isnull().sum(), "\n")


# ======================================================================
# 4. CORRECTION DES VILLES 'Unknown'
# ======================================================================

def remplacer_city(row):
    if row['city'] == 'Unknown' and pd.notna(row['address']):
        return row['address'].split(',')[0].strip()
    return row['city']

df_clean['city'] = df_clean.apply(remplacer_city, axis=1)

print("Valeurs manquantes après correction des villes :")
print(df_clean.isnull().sum(), "\n")

# Normalisation des noms de villes
df_clean['city'] = (
    df_clean['city']
    .astype(str)
    .str.replace("Marrakesh", "Marrakech", case=False)
    .str.replace("MARRAKECH", "Marrakech", case=False)
    .str.replace("Tangier", "Tanger", case=False)
    .str.strip()
)

df_clean['principale'] = df_clean['principale'].replace({
    'Asilah': 'Tanger-Tétouan-Al Hoceïma',
    'Marrakesh-Tensift-El Haouz': 'Marrakech-Safi'
})

df_clean['address'] = df_clean['address'].replace({
    'Tangier, Tanger, Province de': 'Tanger, Tanger-Tétouan-Al Hoceïma',
    'Marrakech, Marrakech, Province de':'Marrakech-Safi'   
})
df_clean['address'] = df_clean['address'].str.replace(
    "Marrakesh-Tensift-El Haouz",
    "Marrakech-Safi",
    case=False,
    regex=True
)


# ======================================================================
# 5. CREATION DES REGIONS A PARTIR DES VILLES
# ======================================================================

city_to_region = {
    "Casablanca": "Casablanca-Settat",
    "Mohammedia": "Casablanca-Settat",
    "Rabat": "Rabat-Salé-Kénitra",
    "Kénitra": "Rabat-Salé-Kénitra",
    "Fès": "Fès-Meknès",
    "Meknès": "Fès-Meknès",
    "Tanger": "Tanger-Tétouan-Al Hoceïma",
    "Tétouan": "Tanger-Tétouan-Al Hoceïma",
    "Asilah": "Tanger-Tétouan-Al Hoceïma",
    "Agadir": "Souss-Massa",
    "Safi": "Marrakech-Safi",
    "Marrakech": "Marrakech-Safi",
    "Ouarzazate": "Drâa-Tafilalet",
    "Zagora": "Drâa-Tafilalet",
    "Oujda": "L’Oriental",
    "Béni Mellal": "Béni Mellal-Khénifra",
    "Khénifra": "opyu",
    "Benslimane": "Casablanca-Settat",
    "Skhirat": "Rabat-Salé-Kénitra",
    "Salé": "Rabat-Salé-Kénitra"
}

df_clean['principale'] = df_clean['principale'].fillna(df_clean['city'].map(city_to_region))
df_clean['principale'] = df_clean.apply(
    lambda row: city_to_region.get(row['city'], row['principale']),
    axis=1
)




# ======================================================================
# 6. SUPPRESSION DES DOUBLONS + NORMALISATION TYPE LOGEMENT
# ======================================================================
df_clean = df_clean.drop_duplicates()

df_clean['proprety type'] = df_clean['proprety type'].map({
    'Rural': 'house'
}).fillna(df_clean['proprety type'])


# ======================================================================
# 7. NORMALISATION DES NOMS DE COLONNES
# ======================================================================
df_clean.rename(columns=lambda x: x.strip().replace(" ", "_"), inplace=True)


# ======================================================================
# 8. DETECTION DES SURFACES ANORMALES
# ======================================================================

plage_surface = {
    'apartment': (30, 200),
    'house': (80, 600)
}

def est_anormale(row):
    t = str(row['proprety_type']).lower()
    s = row['surface']
    if pd.isna(s) or t not in plage_surface:
        return True
    mini, maxi = plage_surface[t]
    return s < mini or s > maxi

df_clean['anormale'] = df_clean.apply(est_anormale, axis=1)

df_clean_normales = df_clean[df_clean['anormale'] == False]
df_clean_anormales = df_clean[df_clean['anormale'] == True]

print("Données normales :", len(df_clean_normales))
print("Données anormales :", len(df_clean_anormales))


# ======================================================================
# 9. REMPLACEMENT DES SURFACES ANORMALES
# ======================================================================

def surface_aleatoire(type_logement):
    type_logement = str(type_logement).lower()
    if type_logement in plage_surface:
        mini, maxi = plage_surface[type_logement]
        return np.random.randint(mini, maxi)
    return np.nan

df_clean_anormales['surface'] = df_clean_anormales.apply(
    lambda row: surface_aleatoire(row['proprety_type']),
    axis=1
)

# Fusion
df_clean_final = pd.concat([df_clean_normales, df_clean_anormales], ignore_index=True)


# ======================================================================
# 10. NORMALISATION FINALE
# ======================================================================
df_clean_final["proprety_type"] = (
    df_clean_final["proprety_type"].astype(str).str.strip().str.capitalize()
)

df_clean_final["city"] = (
    df_clean_final["city"].astype(str).str.strip().str.capitalize()
)


# ======================================================================
# 11. GENERATION DES PRIX
# ======================================================================

prix_m2 = {
    "Apartment": {
        "Casablanca": (30000, 50000),
        "Rabat": (25000, 50000),
        "Marrakech": (25000, 40000),
        "Tanger": (10000, 20000),
        "Fès": (5000, 8000),
        "Meknès": (5000, 8000),
        "Essaouira": (8000, 10000),
        "Autre": (8000, 12000)
    },
    "House": {
        "Casablanca": (30000, 50000),
        "Rabat": (25000, 50000),
        "Marrakech": (25000, 40000),
        "Tanger": (10000, 20000),
        "Fès": (8000, 10000),
        "Meknès": (8000, 10000),
        "Essaouira": (8000, 10000),
        "Autre": (8000, 12000)
    }
}

def get_prix_m2(log_type, city):
    if log_type not in prix_m2:
        return prix_m2["Apartment"]["Autre"]
    return prix_m2[log_type].get(city, prix_m2[log_type]["Autre"])

def generate_price(row):
    min_m2, max_m2 = get_prix_m2(row["proprety_type"], row["city"])
    prix_m2_random = np.random.randint(min_m2, max_m2)
    return row["surface"] * prix_m2_random

df_clean_final["price_dh"] = df_clean_final.apply(generate_price, axis=1)


# ======================================================================
# 12. APERCU FINAL
# ======================================================================
print(df_clean_final.head(30))


# ======================================================================
# 13. SUPPRESSION DES COLONNES INUTILES
# ======================================================================
colonnes_a_supprimer = ["price_£", "anormale"]
df_clean_final = df_clean_final.drop(columns=colonnes_a_supprimer, errors="ignore")

print(df_clean_final.head(30))


# ======================================================================
# 14. EXPORT FINAL
# ======================================================================
df_clean_final.to_csv("/Data/processed/cleaned_data.csv", index=False)

rarr
Valeurs manquantes avant nettoyage :
price_£            0
proprety type      0
surface            0
bedroom            0
bathroom           0
address            0
city               0
principale       183
dtype: int64 

Valeurs manquantes après correction des villes :
price_£            0
proprety type      0
surface            0
bedroom            0
bathroom           0
address            0
city               0
principale       183
dtype: int64 

Données normales : 169
Données anormales : 153


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_anormales['surface'] = df_clean_anormales.apply(


      price_£ proprety_type  surface  bedroom  bathroom  \
0    239769.0     Apartment   130.00        2         2   
1    434582.0         House   400.00        4         6   
2    129875.0     Apartment    83.00        2         2   
3    199808.0     Apartment   127.00        3         2   
4    149856.0     Apartment   120.00        2         2   
5    279731.0         House   300.00        3         2   
6    201806.0     Apartment    88.00        2         2   
7    179827.0     Apartment   138.00        2         2   
8    486832.0         House   250.00        3         3   
9     92911.0     Apartment    65.00        1         1   
10   149856.0     Apartment   110.00        4         3   
11   259750.0         House   500.00        5         5   
12   394621.0         House   490.00        3         2   
13   254182.0     Apartment    87.00        2         1   
14  1035635.0         House   450.00        5         3   
15   612348.0         House   350.00        6         3 

OSError: Cannot save file into a non-existent directory: '\Data\processed'

In [10]:
df_clean_final[df_clean_final['principale'].isnull()]

Unnamed: 0,proprety_type,surface,bedroom,bathroom,address,city,principale,price_dh


In [11]:
print("Valeurs manquantes après correction des villes :")
print(df_clean_final.isnull().sum(), "\n")

Valeurs manquantes après correction des villes :
proprety_type    0
surface          0
bedroom          0
bathroom         0
address          0
city             0
principale       0
price_dh         0
dtype: int64 



In [4]:
df_clean_final.head(30)

Unnamed: 0,proprety_type,surface,bedroom,bathroom,address,city,principale,price_dh
0,Apartment,130.0,2,2,"Ennakhil-(Palmeraie), Marrakech, Marrakesh-Safi",Marrakech,Marrakech-Safi,4776590.0
1,House,400.0,4,6,"Ménara, Marrakech, Marrakech-Safi",Marrakech,Marrakech-Safi,13205600.0
2,Apartment,83.0,2,2,"Guéliz, Marrakech, Marrakech-Safi 40000",Marrakech,Marrakech-Safi,3105694.0
3,Apartment,127.0,3,2,"Anfa, Casablanca, Casablanca-Settat",Casablanca,Casablanca-Settat,4929632.0
4,Apartment,120.0,2,2,"Ménara, Marrakech, Marrakech-Safi",Marrakech,Marrakech-Safi,4277040.0
5,House,300.0,3,2,"Ennakhil-(Palmeraie), Marrakech, Marrakesh-Safi",Marrakech,Marrakech-Safi,7801500.0
6,Apartment,88.0,2,2,"Casablanca, Casablanca-Settat",Casablanca,Casablanca-Settat,2764608.0
7,Apartment,138.0,2,2,"Guéliz, Marrakech, Marrakech-Safi 40000",Marrakech,Marrakech-Safi,3466146.0
8,House,250.0,3,3,"marrakech, Marrakesh-Safi",Marrakech,Marrakech-Safi,8318750.0
9,Apartment,65.0,1,1,"Route dAgadir - Essaouira, Marrakech, Marrakes...",Marrakech,Marrakech-Safi,2158195.0
