In [None]:
# import necessary libraries:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import joblib

In [None]:
# Load the dataset and report which columns contain missing values.
#
# Jupyter only renders a bare expression when it is the last statement of the
# cell, so the null check is printed explicitly; otherwise only the shape would
# be shown and the missing-data check would be silently discarded.

df = pd.read_csv('input.csv')
print(df.isnull().any())
df.shape

In [None]:
# Build the preprocessing + model pipeline.
#
# Numeric columns: median imputation followed by standardisation.
# Categorical columns: one-hot encoding; with handle_unknown="ignore" a category
# that was not seen during fit is encoded as all zeros instead of raising.
# Columns that appear in neither list are dropped (ColumnTransformer's default
# remainder="drop").

num_features = ['bedroom','construction_Year','land_surface']
num_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

cat_features = ['zip','province','building_type','heating_type']
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[("num", num_transformer, num_features),("cat", cat_transformer, cat_features)])

# The final step keeps its original name "classifier" (even though the estimator
# is a regressor) so that anything addressing the step by name keeps working.
# random_state pins the forest's bootstrap/feature sampling so that the fitted
# model and its score are reproducible from run to run.
pred_pipeline = Pipeline([("preprocessor", preprocessor),("classifier",RandomForestRegressor(random_state=41))])


In [None]:
# Separate the target column from the predictors:

y = df['price']                   # target to predict
x = df.drop(columns=['price'])    # every remaining column

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=41,
)

In [None]:
# Fit the preprocessing steps and the random forest on the training split:

pred_pipeline.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split:

test_score = pred_pipeline.score(x_test, y_test)
print("model score: %.3f" % test_score)

In [None]:
# Persist the fitted pipeline (preprocessing and model together) with joblib:

filename = 'price_prediction_model.sav'
joblib.dump(value=pred_pipeline, filename=filename)