In [1]:
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder

df_sales = pd.read_csv('df_cleaned_sale.csv')

df_sales


Unnamed: 0,BathroomCount,BedroomCount,ConstructionYear,District,Fireplace,Furnished,Garden,GardenArea,Kitchen,LivingArea,...,StateOfBuilding,SubtypeOfProperty,SurfaceOfPlot,SwimmingPool,Terrace,ToiletCount,TypeOfSale,Condition_Rank,PEB_Rank,Kitchen_Rank
0,1,1,1969,Brugge,0,0,0,127,INSTALLED,29,...,GOOD,flat_studio,333,0,1,1,residential_sale,4,6,3
1,2,4,2008,Brugge,0,1,0,127,INSTALLED,111,...,GOOD,house,0,0,0,2,residential_sale,4,6,3
2,1,4,2002,Veurne,0,0,1,1,INSTALLED,120,...,TO_BE_DONE_UP,house,170,0,1,2,residential_sale,3,2,3
3,0,2,1972,Hasselt,0,0,0,127,INSTALLED,92,...,AS_NEW,apartment,333,0,1,1,residential_sale,6,6,3
4,1,1,1994,Brussels,0,1,0,127,HYPER_EQUIPPED,50,...,AS_NEW,apartment,333,0,1,1,residential_sale,6,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96141,1,1,2017,Tongeren,0,0,0,127,SEMI_EQUIPPED,83,...,GOOD,service_flat,333,0,1,1,residential_sale,4,7,2
96142,1,3,2024,Gent,0,0,1,168,INSTALLED,129,...,GOOD,house,234,0,0,0,residential_sale,4,6,3
96143,4,4,2002,Antwerp,0,0,0,127,INSTALLED,318,...,JUST_RENOVATED,apartment_block,202,0,0,1,residential_sale,5,5,3
96144,1,2,2002,Antwerp,0,0,0,127,HYPER_EQUIPPED,85,...,GOOD,apartment,333,0,1,1,residential_sale,4,4,4


replacing qualitative variables by numbers


In [2]:

categorical_columns = [
    'District', 'Fireplace', 'Furnished', 'Garden', 'Kitchen', 'Locality', 
    'PEB', 'Province', 'Region', 'StateOfBuilding', 'SubtypeOfProperty', 
    'SwimmingPool', 'Terrace', 'TypeOfSale'
]

label_encoder = LabelEncoder()


for column in categorical_columns:
    df_sales[column] = label_encoder.fit_transform(df_sales[column])


mappings = {}
for column in categorical_columns:
    mappings[column] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))


for column, mapping in mappings.items():
    print(f"Mapping for {column}: {mapping}")


Mapping for District: {'residential_sale': 0}
Mapping for Fireplace: {'residential_sale': 0}
Mapping for Furnished: {'residential_sale': 0}
Mapping for Garden: {'residential_sale': 0}
Mapping for Kitchen: {'residential_sale': 0}
Mapping for Locality: {'residential_sale': 0}
Mapping for PEB: {'residential_sale': 0}
Mapping for Province: {'residential_sale': 0}
Mapping for Region: {'residential_sale': 0}
Mapping for StateOfBuilding: {'residential_sale': 0}
Mapping for SubtypeOfProperty: {'residential_sale': 0}
Mapping for SwimmingPool: {'residential_sale': 0}
Mapping for Terrace: {'residential_sale': 0}
Mapping for TypeOfSale: {'residential_sale': 0}


In [3]:
df_sales.head()

Unnamed: 0,BathroomCount,BedroomCount,ConstructionYear,District,Fireplace,Furnished,Garden,GardenArea,Kitchen,LivingArea,...,StateOfBuilding,SubtypeOfProperty,SurfaceOfPlot,SwimmingPool,Terrace,ToiletCount,TypeOfSale,Condition_Rank,PEB_Rank,Kitchen_Rank
0,1,1,1969,5,0,0,0,127,1,29,...,1,9,333,0,1,1,0,4,6,3
1,2,4,2008,5,0,1,0,127,1,111,...,1,11,0,0,0,2,0,4,6,3
2,1,4,2002,40,0,0,1,1,1,120,...,3,11,170,0,1,2,0,3,2,3
3,0,2,1972,14,0,0,0,127,1,92,...,0,0,333,0,1,1,0,6,6,3
4,1,1,1994,6,0,1,0,127,0,50,...,0,0,333,0,1,1,0,6,3,4


model import and settings

In [8]:
import pickle
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Assuming df_sales is your DataFrame
# Define the target variable and features
X = df_sales.drop(columns=['Price'])
y = df_sales['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_model = CatBoostRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")

rf_model.save_model('catboost_model.cbm')


Learning rate set to 0.5
0:	learn: 169788.3809996	total: 17.5ms	remaining: 1.73s
1:	learn: 152970.8529236	total: 34.6ms	remaining: 1.69s
2:	learn: 141920.5153290	total: 52.6ms	remaining: 1.7s
3:	learn: 136513.4607333	total: 68.4ms	remaining: 1.64s
4:	learn: 130402.0788911	total: 83.6ms	remaining: 1.59s
5:	learn: 128122.8367780	total: 98ms	remaining: 1.53s
6:	learn: 126672.5943206	total: 110ms	remaining: 1.46s
7:	learn: 123806.2679650	total: 124ms	remaining: 1.42s
8:	learn: 122058.0409650	total: 136ms	remaining: 1.38s
9:	learn: 120708.4163665	total: 150ms	remaining: 1.35s
10:	learn: 119672.1388169	total: 162ms	remaining: 1.31s
11:	learn: 118917.0223346	total: 176ms	remaining: 1.29s
12:	learn: 118180.0536467	total: 198ms	remaining: 1.32s
13:	learn: 117289.0466394	total: 230ms	remaining: 1.41s
14:	learn: 116838.4837793	total: 247ms	remaining: 1.4s
15:	learn: 115830.6192113	total: 259ms	remaining: 1.36s
16:	learn: 114965.5593611	total: 271ms	remaining: 1.32s
17:	learn: 114431.1459540	total