In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('cleaned_dataset.csv')

In [3]:
print(df.shape)
print(df.info())
print(df.describe())

(94891, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathroomcount      94891 non-null  int64  
 1   bedroomcount       94891 non-null  int64  
 2   constructionyear   94891 non-null  int64  
 3   country            94891 non-null  object 
 4   district           94891 non-null  object 
 5   fireplace          94891 non-null  int64  
 6   floodingzone       94891 non-null  object 
 7   furnished          94891 non-null  int64  
 8   garden             94891 non-null  int64  
 9   kitchen            94891 non-null  int64  
 10  livingarea         94891 non-null  float64
 11  locality           94888 non-null  object 
 12  monthlycharges     94891 non-null  float64
 13  numberoffacades    94891 non-null  int64  
 14  peb                94891 non-null  object 
 15  postalcode         94891 non-null  int64  
 16  price     

In [4]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'country',
       'district', 'fireplace', 'floodingzone', 'furnished', 'garden',
       'kitchen', 'livingarea', 'locality', 'monthlycharges',
       'numberoffacades', 'peb', 'postalcode', 'price', 'propertyid',
       'province', 'region', 'roomcount', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')

In [5]:
df

Unnamed: 0,bathroomcount,bedroomcount,constructionyear,country,district,fireplace,floodingzone,furnished,garden,kitchen,...,roomcount,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,1969,Belgium,Brugge,0,NON_FLOOD_ZONE,0,0,1,...,1,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,1920,Belgium,Tournai,0,NON_FLOOD_ZONE,0,0,2,...,31,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,2008,Belgium,Brugge,0,NON_FLOOD_ZONE,1,0,1,...,3,0,4,house,0,0,0,2,1,residential_sale
3,1,4,1979,Belgium,Veurne,0,NON_FLOOD_ZONE,0,1,1,...,9,1,2,house,170,0,1,2,1,residential_sale
4,0,2,1972,Belgium,Hasselt,0,NON_FLOOD_ZONE,0,0,1,...,1,0,5,apartment,400,0,1,1,2,residential_sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94886,1,1,2017,Belgium,Tongeren,0,NON_FLOOD_ZONE,0,0,2,...,5,0,4,service_flat,286,0,1,1,2,residential_sale
94887,1,3,2024,Belgium,Gent,0,NON_FLOOD_ZONE,0,1,1,...,4,1,4,house,234,0,0,0,1,residential_sale
94888,4,4,2020,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,2,...,8,1,3,apartment_block,202,0,0,1,1,residential_sale
94889,1,2,2014,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,3,...,8,1,4,apartment,606,0,1,1,2,residential_sale


refining cleaned data

In [6]:
def clean_data(df):
    # Drop column: 'country'
    df = df.drop(columns=['country'])
    # Drop column: 'fireplace'
    df = df.drop(columns=['fireplace'])
    # Drop column: 'monthlycharges'
    df = df.drop(columns=['monthlycharges'])
    # Drop column: 'locality'
    df = df.drop(columns=['locality'])
    # Drop column: 'propertyid'
    df = df.drop(columns=['propertyid'])
    # Drop column: 'constructionyear'
    df = df.drop(columns=['constructionyear'])
    #df = df.drop(columns=['furnished'])
    df = df.drop(columns=['roomcount'])
    df = df.drop(columns=['postalcode'])
    df = df.drop(columns=['floodingzone'])
   
    return df


df_clean = clean_data(df.copy())
df_clean.head()


Unnamed: 0,bathroomcount,bedroomcount,district,furnished,garden,kitchen,livingarea,numberoffacades,peb,price,...,region,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,Brugge,0,0,1,29.0,2,B,99000,...,Flanders,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,Tournai,0,0,2,391.0,3,D,765000,...,Wallonie,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,Brugge,1,0,1,111.0,2,B,399000,...,Flanders,0,4,house,0,0,0,2,1,residential_sale
3,1,4,Veurne,0,1,1,113.6,2,F,230000,...,Flanders,1,2,house,170,0,1,2,1,residential_sale
4,0,2,Hasselt,0,0,1,92.0,2,B,198000,...,Flanders,0,5,apartment,400,0,1,1,2,residential_sale


In [7]:
def clean_data(df_clean):
    # Filter rows based on column: 'bathroomcount'
    df_clean = df_clean[df_clean['bathroomcount'] <= 8]
    # Filter rows based on column: 'bedroomcount'
    df_clean = df_clean[df_clean['bedroomcount'] <= 6]
    # Filter rows based on column: 'livingarea'
    df_clean = df_clean[df_clean['livingarea'] < 350]
    # Filter rows based on column: 'numberoffacades'
    df_clean = df_clean[df_clean['numberoffacades'] <= 4]
    # Filter rows based on column: 'showercount'
    df_clean = df_clean[df_clean['showercount'] <= 3]
    # Filter rows based on column: 'surfaceofplot'
    df_clean = df_clean[df_clean['surfaceofplot'] <= 1000]
    # Filter rows based on column: 'toiletcount'
    df_clean = df_clean[df_clean['toiletcount'] <= 4]
    # Filter rows based on column: 'bathroomcount'
    df_clean = df_clean[df_clean['bathroomcount'] <= 3]
    return df_clean

df_clean_1 = clean_data(df_clean.copy())
df_clean_1.head()

Unnamed: 0,bathroomcount,bedroomcount,district,furnished,garden,kitchen,livingarea,numberoffacades,peb,price,...,region,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,Brugge,0,0,1,29.0,2,B,99000,...,Flanders,0,4,flat_studio,203,0,1,1,2,residential_sale
2,2,4,Brugge,1,0,1,111.0,2,B,399000,...,Flanders,0,4,house,0,0,0,2,1,residential_sale
3,1,4,Veurne,0,1,1,113.6,2,F,230000,...,Flanders,1,2,house,170,0,1,2,1,residential_sale
4,0,2,Hasselt,0,0,1,92.0,2,B,198000,...,Flanders,0,5,apartment,400,0,1,1,2,residential_sale
5,1,1,Brussels,1,0,3,50.0,2,E,215000,...,Brussels,1,5,apartment,143,0,1,1,2,residential_sale


In [8]:
print(df_clean_1.columns)


Index(['bathroomcount', 'bedroomcount', 'district', 'furnished', 'garden',
       'kitchen', 'livingarea', 'numberoffacades', 'peb', 'price', 'province',
       'region', 'showercount', 'stateofbuilding', 'subtypeofproperty',
       'surfaceofplot', 'swimmingpool', 'terrace', 'toiletcount',
       'typeofproperty', 'typeofsale'],
      dtype='object')


Categorize str values
constructionyear, district, floodingzone, subtypeofproperty, typeofsale

In [9]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = ['furnished','district', 'subtypeofproperty', 'typeofsale', 'peb','province', 'region']

data_to_encode = df_clean_1[columns_to_encode]

one = OneHotEncoder()

encoded_data = one.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=one.get_feature_names_out(columns_to_encode))

df_final = pd.concat([df_clean_1.drop(columns=columns_to_encode), encoded_df], axis=1)

print(df_final.shape)
print(df_final.info())
df_final.to_csv('final_dataset.csv', index=False)
df_final.head()


(93332, 113)
<class 'pandas.core.frame.DataFrame'>
Index: 93332 entries, 0 to 81968
Columns: 113 entries, bathroomcount to region_Wallonie
dtypes: float64(113)
memory usage: 81.2 MB
None


Unnamed: 0,bathroomcount,bedroomcount,garden,kitchen,livingarea,numberoffacades,price,showercount,stateofbuilding,surfaceofplot,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1.0,1.0,0.0,1.0,29.0,2.0,99000.0,0.0,4.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2.0,4.0,0.0,1.0,111.0,2.0,399000.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,4.0,1.0,1.0,113.6,2.0,230000.0,1.0,2.0,170.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2.0,0.0,1.0,92.0,2.0,198000.0,0.0,5.0,400.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,1.0,0.0,3.0,50.0,2.0,215000.0,1.0,5.0,143.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Machine learning process

In [10]:
correlations = df_final.drop(columns=['price']).corrwith(df_final['price'])

print(correlations)



bathroomcount               0.259586
bedroomcount                0.375350
garden                      0.075140
kitchen                     0.314474
livingarea                  0.495621
                              ...   
province_Walloon Brabant    0.008028
province_West Flanders      0.005201
region_Brussels            -0.002545
region_Flanders             0.002449
region_Wallonie            -0.000996
Length: 112, dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


In [11]:
df_final

Unnamed: 0,bathroomcount,bedroomcount,garden,kitchen,livingarea,numberoffacades,price,showercount,stateofbuilding,surfaceofplot,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1.0,1.0,0.0,1.0,29.0,2.0,99000.0,0.0,4.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2.0,4.0,0.0,1.0,111.0,2.0,399000.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,4.0,1.0,1.0,113.6,2.0,230000.0,1.0,2.0,170.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2.0,0.0,1.0,92.0,2.0,198000.0,0.0,5.0,400.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,1.0,0.0,3.0,50.0,2.0,215000.0,1.0,5.0,143.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81948,,,,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
81956,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81960,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81966,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [12]:
from sklearn.model_selection import train_test_split

df = df_final

y = np.array(df ['price'])
X = np.array(df.drop(columns=['price']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (74665, 112)
X_test shape: (18667, 112)
y_train shape: (74665,)
y_test shape: (18667,)


Cleaning X train, test


In [13]:
def clean_data(X_train_df):
    for col in X_train_df.columns:
        X_train_df[col] = X_train_df[col].fillna(X_train_df[col].median())
    return X_train_df

X_train_df = pd.DataFrame(X_train.tolist() if len(X_train.shape) > 2 else X_train)
X_train_df_clean = clean_data(X_train_df.copy())
print(X_train_df_clean.head())

   0    1    2    3      4    5    6    7      8    9    ...  102  103  104  \
0  1.0  1.0  1.0  0.0   61.0  2.0  1.0  4.0  651.0  0.0  ...  0.0  0.0  0.0   
1  2.0  4.0  0.0  1.0  192.0  2.0  1.0  4.0    0.0  0.0  ...  0.0  0.0  0.0   
2  1.0  4.0  0.0  2.0  193.0  4.0  1.0  4.0  379.0  0.0  ...  1.0  0.0  0.0   
3  1.0  2.0  0.0  1.0  108.0  3.0  0.0  2.0  536.0  0.0  ...  0.0  0.0  0.0   
4  1.0  3.0  0.0  2.0  126.0  3.0  1.0  3.0  400.0  0.0  ...  0.0  0.0  0.0   

   105  106  107  108  109  110  111  
0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
3  0.0  0.0  0.0  1.0  0.0  1.0  0.0  
4  0.0  0.0  0.0  1.0  0.0  1.0  0.0  

[5 rows x 112 columns]


In [14]:
def clean_data(X_test_df):
    for col in X_test_df.columns:
        if X_test_df[col].isnull().any():
            X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
    return X_test_df

X_test_df = pd.DataFrame(X_test.tolist() if len(X_test.shape) > 2 else X_test)
X_test_df_clean = clean_data(X_test_df.copy())
print(X_test_df_clean.head())




   0    1    2    3      4    5    6    7      8    9    ...  102  103  104  \
0  0.0  1.0  0.0  1.0  111.0  2.0  1.0  2.0   50.0  0.0  ...  0.0  1.0  0.0   
1  1.0  2.0  0.0  1.0   99.0  4.0  1.0  5.0  387.0  0.0  ...  0.0  0.0  0.0   
2  1.0  2.0  0.0  1.0  113.0  2.0  1.0  4.0  275.0  0.0  ...  0.0  0.0  0.0   
3  2.0  3.0  0.0  3.0  190.0  3.0  1.0  4.0  408.0  1.0  ...  0.0  1.0  0.0   
4  1.0  2.0  0.0  3.0   87.0  3.0  1.0  4.0  101.0  0.0  ...  0.0  0.0  0.0   

   105  106  107  108  109  110  111  
0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  1.0  0.0  

[5 rows x 112 columns]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

Cleaning Y train and Y test

In [15]:
def clean_data(y_train_df):
    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})
    return y_train_df

y_train_df = pd.DataFrame(y_train.tolist() if len(y_train.shape) > 2 else y_train)
y_train_df_clean = clean_data(y_train_df.copy())
y_train_df_clean.head()



Unnamed: 0,0
0,320000.0
1,475000.0
2,420000.0
3,239000.0
4,579000.0


In [16]:
def clean_data(y_test_df):
    y_test_df = y_test_df.fillna({0: 0})
    return y_test_df

y_test_df = pd.DataFrame(y_test.tolist() if len(y_test.shape) > 2 else y_test)
y_test_df_clean = clean_data(y_test_df.copy())
print(y_test_df_clean.shape)

(18667, 1)


Utiliser des transformers pour choisir quelle variable est plus intérressante pour le modèle
https://www.youtube.com/watch?v=T4nZDuakYlU&list=PLO_fdPEVlfKoHQ3Ua2NtDL4nmynQC8YiS&index=9

Model training


In [17]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(random_state=42)
model.fit(X_train_df_clean, y_train_df_clean)
y_pred = model.predict(X_test_df_clean)

Learning rate set to 0.080934
0:	learn: 118265.2936762	total: 145ms	remaining: 2m 24s
1:	learn: 114650.6798653	total: 155ms	remaining: 1m 17s
2:	learn: 111518.7284962	total: 163ms	remaining: 54.2s
3:	learn: 108750.6504866	total: 173ms	remaining: 43.1s
4:	learn: 106227.3059220	total: 182ms	remaining: 36.2s
5:	learn: 104071.3380964	total: 191ms	remaining: 31.6s
6:	learn: 102149.3908479	total: 203ms	remaining: 28.8s
7:	learn: 100440.4787139	total: 215ms	remaining: 26.7s
8:	learn: 98904.2362476	total: 223ms	remaining: 24.5s
9:	learn: 97523.8460017	total: 236ms	remaining: 23.3s
10:	learn: 96224.5183187	total: 253ms	remaining: 22.7s
11:	learn: 95167.2686903	total: 263ms	remaining: 21.7s
12:	learn: 94190.9760610	total: 272ms	remaining: 20.7s
13:	learn: 93324.2406367	total: 281ms	remaining: 19.8s
14:	learn: 92468.4714681	total: 289ms	remaining: 19s
15:	learn: 91757.3621169	total: 296ms	remaining: 18.2s
16:	learn: 91153.0305946	total: 306ms	remaining: 17.7s
17:	learn: 90546.6472266	total: 315ms

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")

model.save_model('catboost_model.cbm')

MAE: 92543.59713197256
MSE: 17130947822.952677
RMSE: 130885.24677347206
R^2 Score: 0.33289742950159973


In [19]:
df_final.head(1)


Unnamed: 0,bathroomcount,bedroomcount,garden,kitchen,livingarea,numberoffacades,price,showercount,stateofbuilding,surfaceofplot,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1.0,1.0,0.0,1.0,29.0,2.0,99000.0,0.0,4.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
import streamlit as st
import pandas as pd
from catboost import CatBoostRegressor

# Function to load the model
def load_model():
    model = CatBoostRegressor()
    model.load_model('catboost_model.cbm')
    return model

# Function to preprocess user input
def preprocess_input(user_input):
    categorical_features = ['furnished', 'district', 'subtypeofproperty', 'typeofsale', 'peb', 'province', 'region']
    user_input_df = pd.DataFrame([user_input])
    dummies = pd.get_dummies(user_input_df[categorical_features])
    user_input_df = pd.concat([user_input_df, dummies], axis=1)
    user_input_df = user_input_df.drop(categorical_features, axis=1)
    return user_input_df

# Main function to run the Streamlit app
def main():
    st.title("Real Estate Price Prediction")

    # Predefined options for each feature
    furnished_options = ["Yes", "No"]

    district_options = ["district_Aalst", "district_Antwerp","district_Arlon" , "district_Ath",
                        "district_Bastogne","district_Brugge","district_Brussels","district_Charleroi",
                        "district_Dendermonde","district_Diksmuide","district_Dinant","district_Eeklo",
                        "district_Gent","district_Halle-Vilvoorde","district_Hasselt","district_Huy",
                        "district_Ieper" ,"district_Kortrijk","district_Leuven","district_Liège",
                        "district_Maaseik","district_Marche-en-Famenne","district_Mechelen","district_Mons",
                        "district_Mouscron","district_Namur","district_Neufchâteau","district_Nivelles",
                        "district_Oostend","district_Oudenaarde","district_Philippeville","district_Sint-Niklaas",
                        "district_Roeselare","district_Soignies","district_Thuin","district_Tielt",
                        "district_Tongeren","district_Tournai","district_Turnhout","district_Verviers",
                        "district_Veurne","district_Virton","district_Waremme"]  

    subtypeofproperty_options = ["subtypeofproperty_apartment", "subtypeofproperty_apartement_block","subtypeofproperty_bungalow","subtypeofproperty_castle",
                                 "subtypeofproperty_chalet","subtypeofproperty_country_cottage","subtypeofproperty_duplex","subtypeofproperty_exeptional_property",
                                 "subtypeofproperty_farmhouse","subtypeofproperty_flat_studio","subtypeofproperty_ground_floor","subtypeofproperty_house",
                                 "subtypeofproperty_kot","subtypeofproperty_loft","subtypeofproperty_mansion","subtypeofproperty_manor_house",
                                 "subtypeofproperty_mixed_use_building","subtypeofproperty_other_property","subtypeofproperty_penthouse","subtypeofproperty_service_flat",
                                 "subtypeofproperty_pavilion","subtypeofproperty", "subtypeofproperty_town_house","subtypeofproperty_triplex","subtypeofproperty_villa",]  

    peb_options = ["peb_A", "peb_A+", "peb_A++", 
                   "peb_A_A+", "peb_B", "peb_B_A", 
                   "peb_C","peb_D", "peb_E","peb_E_D","peb_F","peb_F_C",
                   "peb_F_D","peb_F_E","peb_G"] 
    
    province_options = ["province_Antwerp", "province_Brussels", "province_East Flanders",
                        "province_Flemish Brabant","province_Hainaut", "province_Limburg",
                        "province_Liège","province_Luxembourg","province_Namur",
                        "province_Walloon Brabant", "province_West Flanders" ] 
    
    region_options = ["region_Brussels", "region_Flanders", "region_Wallonie"] 

    # User input fields
    furnished = st.selectbox("Furnished", furnished_options)
    district = st.selectbox("District", district_options)
    subtypeofproperty = st.selectbox("Subtype of Property", subtypeofproperty_options)
    peb = st.selectbox("PEB", peb_options)
    province = st.selectbox("Province", province_options)
    region = st.selectbox("Region", region_options)

    # Collect user input into a dictionary
    user_input = {
        'furnished': furnished,
        'district': district,
        'subtypeofproperty': subtypeofproperty,
        'peb': peb,
        'province': province,
        'region': region
    }

    # Preprocess the user input
    user_input_df = preprocess_input(user_input)

    # Load the model
    model = load_model()

    # Predict button
    if st.button("Predict"):
        prediction = model.predict(user_input_df)
        st.write(f"Predicted Price: {prediction[0]}")

if __name__ == '__main__':
    main()
    

2024-07-31 09:36:47.529 
  command:

    streamlit run C:\Users\pieta\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-07-31 09:36:47.530 Session state does not function when running a script without `streamlit run`


KeyError: "['typeofsale'] not in index"

In [None]:
'''from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor
import pandas as pd

# Définir le modèle de base
model = CatBoostRegressor(random_state=42)

param_grid = {
    'iterations': [100, 300, 500, 700],
    'learning_rate': [0.001, 0.01, 0.05],
    'depth': [3, 5, 7, 10, 12],
    'l2_leaf_reg': [0.1, 1, 3, 5, 10]
}


# Configurer la recherche en grille
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Exécuter la recherche en grille
grid_search.fit(X_train_df_clean, y_train_df_clean)

# Afficher les meilleurs hyperparamètres
print("Best parameters found: ", grid_search.best_params_)

# Évaluer les performances du modèle optimisé
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring='neg_mean_absolute_error')

# Afficher les scores de validation croisée
print("Cross-validated scores (MAE): ", -cv_scores)
print("Mean CV score (MAE): ", -cv_scores.mean())

# Faire des prédictions et évaluer le modèle sur les données de test
y_pred = best_model.predict(X_test_df_clean)

# Calculer les métriques de performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")'''




'from sklearn.model_selection import GridSearchCV, cross_val_score\nfrom catboost import CatBoostRegressor\nimport pandas as pd\n\n# Définir le modèle de base\nmodel = CatBoostRegressor(random_state=42)\n\nparam_grid = {\n    \'iterations\': [100, 300, 500, 700],\n    \'learning_rate\': [0.001, 0.01, 0.05],\n    \'depth\': [3, 5, 7, 10, 12],\n    \'l2_leaf_reg\': [0.1, 1, 3, 5, 10]\n}\n\n\n# Configurer la recherche en grille\ngrid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=\'neg_mean_absolute_error\', n_jobs=-1)\n\n# Exécuter la recherche en grille\ngrid_search.fit(X_train_df_clean, y_train_df_clean)\n\n# Afficher les meilleurs hyperparamètres\nprint("Best parameters found: ", grid_search.best_params_)\n\n# Évaluer les performances du modèle optimisé\nbest_model = grid_search.best_estimator_\ncv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring=\'neg_mean_absolute_error\')\n\n# Afficher les scores de validation crois