In [2]:
import pandas as pd
import numpy as np
import os
from load import load_file
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor
import joblib


In [4]:
df = load_file("../data/final_dataset.json")
if df is False:
    print("Failed to load file.")

In [5]:
def remove_rent(df, sale_type='residential_sale'):
    return df[df['TypeOfSale'] == sale_type]

print("remove_rent")
df = remove_rent(df)
print(df.shape[0])

remove_rente
104948


In [6]:
def strip_data(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)
    return df

print("strip_data")
df = strip_data(df)
print(df.shape[0])

strip_data
104948


In [None]:
def no_duplicates(df): 
    df_unique = df.drop(columns=['Url', 'PropertyId', 'SubtypeOfProperty']).drop_duplicates()
    return df.loc[df_unique.index]

print("no_duplicates")
df = no_duplicates(df)
print(df.shape[0])

no_duplicates
98189


In [7]:
def remove_outliers(df, column, multiplier=2):
    Q1 = df[column].quantile(0.20)
    Q3 = df[column].quantile(0.80)
    IQR = Q3 - Q1
    
    print(f"Q1 (15th percentile): {Q1} Q3 (85th percentile): {Q3} IQR: {IQR}")
    
    before_count = df.shape[0]
    
    if IQR == 0:
        lower_bound = Q1
        upper_bound = Q3 
    else:
        lower_bound = max(Q1 - multiplier * IQR, 0)
        upper_bound = Q3 + multiplier * IQR

    df = df[~((df[column] < lower_bound) | (df[column] > upper_bound))]

    print(f"Lower bound: {lower_bound} Upper bound: {upper_bound}")

    after_count = df.shape[0]
    print(f"{column}: Removed {before_count - after_count} outliers")
    return df

def select_features(df):
    df = df[
        ((df["PEB"].isin(["A++", "A+", "A", "B", "C", "D", "E", "F"])) | (df['PEB'].isna())) &
        (((df['PostalCode'] >= 1000) & (df['PostalCode'] <= 9999)) | (df['PostalCode'].isna())) &
        ((df['ConstructionYear'] <= datetime.datetime.now().year + 20) | (df['ConstructionYear'].isna()))
    ][[
        "Price", "PostalCode", "BathroomCount", "BedroomCount", "ConstructionYear", "NumberOfFacades", "PEB", 
        "SurfaceOfPlot", "LivingArea", "GardenArea", "StateOfBuilding", "SwimmingPool", "Terrace", "ToiletCount", "RoomCount"
    ]]

    df['LivingArea_per_Bedroom'] = df['LivingArea'] / (df['BedroomCount'] + 1)
    df['GardenArea_per_Bedroom'] = df['GardenArea'] / (df['BedroomCount'] + 1)
    df['PropertyAge'] = datetime.datetime.now().year - df['ConstructionYear']
    df['LivingArea_to_TotalArea'] = df['LivingArea'] / (df['SurfaceOfPlot'] + 1)
    df['Bedroom_to_Facades'] = df['BedroomCount'] / (df['NumberOfFacades'] + 1)

    print(f"Rows after initial selection: {df.shape[0]}")

    multi_dico = {
        "Price": 5,
        "BedroomCount": 3,
        "BathroomCount": 3,
        "NumberOfFacades": 2,
        "SurfaceOfPlot": 4,
        "LivingArea": 4,
        "GardenArea": 4,
        "ToiletCount": 2,
    }

    print("===============")
    for k, i in multi_dico.items():
        df = remove_outliers(df, k, multiplier=i)
    print("===============")

    return df

print("select_features")
df = select_features(df)
print(df.shape[0])

select_features
Rows after initial selection: 100211
Q1 (15th percentile): 233600.0 Q3 (85th percentile): 515000.0 IQR: 281400.0
Lower bound: 0 Upper bound: 1922000.0
Price: Removed 1393 outliers
Q1 (15th percentile): 2.0 Q3 (85th percentile): 4.0 IQR: 2.0
Lower bound: 0 Upper bound: 10.0
BedroomCount: Removed 252 outliers
Q1 (15th percentile): 1.0 Q3 (85th percentile): 2.0 IQR: 1.0
Lower bound: 0 Upper bound: 5.0
BathroomCount: Removed 270 outliers
Q1 (15th percentile): 2.0 Q3 (85th percentile): 4.0 IQR: 2.0
Lower bound: 0 Upper bound: 8.0
NumberOfFacades: Removed 9 outliers
Q1 (15th percentile): 127.0 Q3 (85th percentile): 955.0 IQR: 828.0
Lower bound: 0 Upper bound: 4267.0
SurfaceOfPlot: Removed 1522 outliers
Q1 (15th percentile): 85.0 Q3 (85th percentile): 195.0 IQR: 110.0
Lower bound: 0 Upper bound: 635.0
LivingArea: Removed 390 outliers
Q1 (15th percentile): 36.0 Q3 (85th percentile): 500.0 IQR: 464.0
Lower bound: 0 Upper bound: 2356.0
GardenArea: Removed 357 outliers
Q1 (15th pe

In [9]:
def clean_nan(df, n_neighbors=5):
    df["SwimmingPool"].fillna(0, inplace=True)
    df["Terrace"].fillna(0, inplace=True)
    df["GardenArea"].fillna(0, inplace=True)

    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le
    
    numeric_df = df.select_dtypes(include=['number'])
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = imputer.fit_transform(numeric_df)
    numeric_df_imputed = pd.DataFrame(df_imputed, columns=numeric_df.columns, index=numeric_df.index)
    df.update(numeric_df_imputed)
    return df

print("clean_nan")
df = clean_nan(df)
print(df.shape[0])
print(df)

output_path = '../data/cleaned_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"DataFrame modifié enregistré sous '{output_path}'")

clean_nan


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["SwimmingPool"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Terrace"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav

95453
         Price  PostalCode  BathroomCount  BedroomCount  ConstructionYear  \
2        99000        8380            1.0             1            1969.0   
8       399000        8370            2.0             4            2008.0   
10      230000        8660            1.0             4            1989.2   
11      198000        3500            0.0             2            1972.0   
14      215000        1030            1.0             1            1994.0   
...        ...         ...            ...           ...               ...   
181786  219000        3830            1.0             1            2017.0   
181788  409000        9880            1.0             3            2024.0   
181789  599000        2020            4.0             4            1955.2   
181790  245000        2140            1.0             2            1983.2   
181791  310000        8000            1.0             2            2017.2   

        NumberOfFacades  PEB  SurfaceOfPlot  LivingArea  GardenArea  

In [10]:
print("Missing values percentage per column:")
print(df.isnull().mean() * 100)
print(df.shape[0])

Missing values percentage per column:
Price                      0.0
PostalCode                 0.0
BathroomCount              0.0
BedroomCount               0.0
ConstructionYear           0.0
NumberOfFacades            0.0
PEB                        0.0
SurfaceOfPlot              0.0
LivingArea                 0.0
GardenArea                 0.0
StateOfBuilding            0.0
SwimmingPool               0.0
Terrace                    0.0
ToiletCount                0.0
RoomCount                  0.0
LivingArea_per_Bedroom     0.0
GardenArea_per_Bedroom     0.0
PropertyAge                0.0
LivingArea_to_TotalArea    0.0
Bedroom_to_Facades         0.0
dtype: float64
95453


In [13]:
df = load_file("../data/cleaned_data.csv")

def handle_categorical_data(df): 
    df = pd.get_dummies(df, drop_first=True)
    return df

y = df['Price']
X = df.drop(columns=['Price'])
X = handle_categorical_data(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

def find_latest_model_path(base_dir):
    segments = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    segments.sort(reverse=True) 
    for segment in segments:
        model_path = os.path.join(base_dir, segment, 'model.pkl')
        if os.path.exists(model_path):
            return model_path
    return None

base_dir = "../data/models"

latest_model_path = find_latest_model_path(base_dir)

if latest_model_path:
    model = joblib.load(latest_model_path)
    print(f"Modèle existant chargé depuis {latest_model_path}.")
else:
    model = XGBRegressor(tree_method='hist', device='cuda', random_state=69, early_stopping_rounds=10)

param_grid = {
    'n_estimators': [100, 300, 500, 700, 900],
    'learning_rate': [0.01, 0.03, 0.1, 0.15, 0.2],
    'max_depth': [3, 5, 7, 10, 12],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

random_search = RandomizedSearchCV(
    model, 
    param_distributions=param_grid, 
    n_iter=400, 
    cv=5, 
    scoring='neg_mean_absolute_error', 
    n_jobs=-1, 
    random_state=69
)

random_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

best_model = random_search.best_estimator_

new_segment_dir = os.path.join(base_dir, f"model_segment_{len(os.listdir(base_dir)) + 1}")
os.makedirs(new_segment_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(new_segment_dir, 'model.pkl'))
print(f"Modèle sauvegardé dans {new_segment_dir}.")
info_path = os.path.join(new_segment_dir, 'info.txt')
with open(info_path, 'w') as f:
    f.write(f"Model segment: {len(os.listdir(base_dir)) + 1}\n")
    f.write(f"Date: {datetime.datetime.now()}\n")
    f.write(f"Mean Absolute Error: {mean_absolute_error(y_test, model.predict(X_test)):.2f}\n")
    f.write(f"R2 Score: {r2_score(y_test, model.predict(X_test)):.2f}\n")
    f.write(f"Best Params: {model.get_params()}\n")
print(f"Informations du modèle enregistrées sous '{info_path}'")


Modèle existant chargé depuis ../data/models\model_segment_1\model.pkl.


  _data = np.array(data, dtype=dtype, copy=copy,


Modèle sauvegardé dans ../data/models\model_segment2.


In [21]:
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"XGBoost Mean Absolute Error après continuation: {mae:.2f}")
print(f"XGBoost Score après continuation: {r2:.2f}")

XGBoost Mean Absolute Error après continuation: 44186.14
XGBoost Score après continuation: 0.87


In [11]:
def load_file(file_path):
    return pd.read_csv(file_path)

def handle_categorical_data(df):
    return pd.get_dummies(df, drop_first=True)

def prepare_data(input_data, scaler, categorical_columns):
    df = pd.DataFrame([input_data])
    df = handle_categorical_data(df)
    
    for col in categorical_columns:
        if col not in df.columns:
            df[col] = 0

    df = df.reindex(columns=categorical_columns, fill_value=0)

    return scaler.transform(df)

def test_model(input_data, model_path, scaler, categorical_columns):
    model = joblib.load(model_path)

    X_test = prepare_data(input_data, scaler, categorical_columns)

    y_pred = model.predict(X_test)

    return y_pred

input_data = {
    "PostalCode": 4651,
    "BathroomCount": 2,
    "BedroomCount": 3,
    "ConstructionYear": 2011,
    "NumberOfFacades": 4,
    "PEB": 'B',
    "SurfaceOfPlot": 1044,
    "LivingArea": 200,
    "GardenArea": 948,
    "StateOfBuilding": 'Excellent',
    "SwimmingPool": 1,
    "Terrace": 1,
    "ToiletCount": 2,
    "RoomCount": 13
}

df = load_file("../data/cleaned_data.csv")
X = df.drop(columns=['Price'])
X = handle_categorical_data(X)
categorical_columns = X.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)

model_path = "../data/models/model_segment_3/model.pkl"
y_pred = test_model(input_data, model_path, scaler, categorical_columns)
print(f"Predicted Price: {y_pred[0]:.2f}")




Predicted Price for new input: 567198.56
Mean Absolute Error: 16147.67
R² Score: 0.97
