# **Model**

#### **Imports**

In [1]:
import pandas as pd
import numpy as np
import statistics

import seaborn as sns
from matplotlib import pyplot as plt

from scipy import stats

import operator

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the structured property listings; the path is relative to the notebook's working directory.
df = pd.read_csv('model/Property_structured_data.csv')

In [3]:
df_shape_initial = df.shape

#### **Clean data**

In [4]:
# Clean the raw listings and encode every column the model uses.
# The raw data appears to use -1 as its "missing" placeholder (it is either filtered out or
# replaced column by column below). Statement order matters: later steps read columns that
# earlier steps have rewritten.

# Price outliers: keep only rows whose price z-score is below 1 in absolute value.
# NOTE(review): the z-scores are computed while the -1 placeholder prices are still present
# (they are only dropped two statements later) — confirm that is intended.
z_scores_price = stats.zscore(df['price'])
abs_z_scores_price = np.abs(z_scores_price) 
filtered_entries_price = (abs_z_scores_price < 1)
df = df[filtered_entries_price]
#Price (float): drop listings without a price
df= df.loc[df['price'] != -1]
#Surface (float): keep surfaces between 35 and 800 inclusive
df = df.loc[df['surface'] <= 800]
df = df.loc[df['surface'] >= 35]
# Already implied by the >= 35 filter above.
df = df.loc[df['surface'] != -1]
#Sub type of property (str): drop two subtypes, relabel unusual ones as "OTHER"
df= df.loc[df['subtype_of_property'] != 'APARTMENT_BLOCK']
df= df.loc[df['subtype_of_property'] != 'MIXED_USE_BUILDING']
others = ["CHALET", "MANOR_HOUSE", "OTHER_PROPERTY", "CASTLE", "PAVILION"]
# Rows relabelled "OTHER" are removed in the "Type of property" step further down.
df.loc[df["subtype_of_property"].isin(others), "type_of_property"] = "OTHER"
#Number of bedrooms (int)
df = df.loc[df['number_of_bedrooms'] <20]
# NOTE(review): after the < 20 filter above no row can satisfy > 20, so this never assigns "OTHER".
df['type_of_property'] = np.where((df['number_of_bedrooms'] > 20), "OTHER", df['type_of_property'])
df['number_of_bedrooms'] = df['number_of_bedrooms'].replace(-1,0)
#Zip Code (category): 'be_zip_' followed by the first three characters of the postal code
# NOTE(review): transform_elment() looks ratios up by the first TWO characters — confirm which
# prefix length is intended, otherwise training and serving use different ratios.
df['zip_code_xx'] = df['postal_code'].apply(lambda x : 'be_zip_'+str(x)[:3])
#Land surface (float): when land surface is -1, fall back to a positive garden surface; any remaining -1 becomes 0
df['land_surface'] = [land_surface if land_surface != -1 else garden_surface if garden_surface > 0 else land_surface for garden_surface,land_surface in zip(df['garden_surface'],df['land_surface'])]
df['land_surface'] = df['land_surface'].replace(-1,0)
#Garden (0,1)
df['garden'] = df['garden'].replace(-1,0)
#Garden surface (float): both -1 and 1 are turned into 0
# NOTE(review): presumably a surface of 1 is a yes/no flag rather than a real area — confirm.
df['garden_surface'] = df['garden_surface'].replace(-1,0)
df['garden_surface'] = df['garden_surface'].replace(1,0)
# Apartments with a garden above 2000 are relabelled "OTHER" (and therefore dropped further down).
df.loc[(df["garden_surface"] > 2000) & (df['type_of_property'] == 'APARTMENT'),'type_of_property'] = "OTHER"
#Fully equipped kitchen (float score): TODO from the author — change later, calculate each value y/n
# Series.map turns any value that is not a key of this dict into NaN.
df["fully_equipped_kitchen"] = df["fully_equipped_kitchen"].map({"-1.0": 0.5,
                                                                 "1.0": 1,
                                                                 "-1": 0.5, 
                                                                 "1": 1, 
                                                                 "INSTALLED": 0.75, 
                                                                 "SEMI_EQUIPPED": 0.60, 
                                                                 "NOT_INSTALLED": 0.57, 
                                                                 "USA_INSTALLED": 0.85, 
                                                                 "USA_SEMI_EQUIPPED": 0.80, 
                                                                 "USA_UNINSTALLED": 0.75})
#Swimming pool (0,1)
df['swimming_pool'] = df['swimming_pool'].replace(-1,0)
#Furnished (0,1)
df['furnished'] = df['furnished'].replace(-1,0)
#Open fire (0,1)
df['open_fire'] = df['open_fire'].replace(-1,0)
#Terrace (0,1)
df['terrace'] = df['terrace'].replace(-1,0)
#Terrace surface (float): drop terraces of 500 or more, then treat -1 as 0
df = df.loc[df['terrace_surface'] < 500]
df['terrace_surface'] = df['terrace_surface'].replace(-1,0)
#Facades (int): keep fewer than 9 facades; a missing count defaults to 1 for apartments and 2 for houses
df = df.loc[df["number_of_facades"] < 9]
df["number_of_facades"] = np.where((df["number_of_facades"] == -1) & (df["type_of_property"] == "APARTMENT"), 1, df["number_of_facades"])
df["number_of_facades"] = np.where((df["number_of_facades"] == -1) & (df["type_of_property"] == "HOUSE"), 2, df["number_of_facades"])
# Rows still at -1 (neither APARTMENT nor HOUSE) are dropped.
df = df.loc[df["number_of_facades"] != -1]
#State of the building (float score); values that are not a key of this dict become NaN
# NOTE(review): the trailing quoted names look like the API-side category each state corresponds to — confirm.
df["state_of_the_building"] = df["state_of_the_building"].map({
    "NO_INFO": 0.87252, #"TO_RENOVATE"
    "TO_BE_DONE_UP": 0.65376, #"JUST_RENOVATED"
    "TO_RENOVATE": 0.56664, #"TO_RENOVATE"
    "TO_RESTORE": 0.46920, #"TO_REBUILD"
    "JUST_RENOVATED": 0.93115, #"JUST_RENOVATED"
    "GOOD": 0.79285, #"GOOD"
    "AS_NEW": 1.0 #"NEW"
})
#Type of property (category): drop "OTHER", then one-hot encode into one column per remaining category
df= df.loc[df["type_of_property"] != "OTHER"]
ohe = OneHotEncoder()
transformed_df = ohe.fit_transform(df[['type_of_property']])
# The new columns are named after the encoder's categories.
df[ohe.categories_[0]] = transformed_df.toarray()
#price/m² calculate(float), rounded to 2 decimals
df['price_m2'] = round(df['price']/df['surface'],2)
df_zip_list = ['price_m2','zip_code_xx']
df_zips = df[df_zip_list]
xxx_zip = df_zips.groupby('zip_code_xx')
xxx_zip_list = []  #stores the name of each zipcode from the data base
for key, values in xxx_zip:
    xxx_zip_list.append(key)
df_zips_mean = round(df_zips.groupby('zip_code_xx').mean(),5)
df_zips_mean_values = df_zips_mean.values  # calculates mean for each zipxx
zip_mean = [] # stores the values as a list of mean for each zipxx
for x in df_zips_mean_values:
    for i in x:
        zip_mean.append(i)
global_mean = statistics.median(zip_mean)  #calculate a global mean
xxx = [] #list of the ponderated means 
for y,i in enumerate(zip_mean):
    xxx.append(round(i/global_mean,2)) #calculates the relation of mean/zip code and the global mean
dic_zip_value = dict()  #creates a dictionay for zipcodes and values
for i,x in enumerate(xxx_zip_list):
    dic_zip_value[x] = xxx[i]
df['zip_code_ratio'] = df['zip_code_xx']
df['zip_code_ratio'] = df['zip_code_ratio'].map(dic_zip_value)

In [5]:
df_test_api_x = df

In [6]:
# Column subset of the cleaned frame, taken before the feature selection below.
# NOTE(review): presumably used to build test inputs for the API — confirm.
df_test_api = df_test_api_x[['id',
                 'price',
                 'surface',
                 'type_of_property',
                 'number_of_bedrooms',
                 'postal_code',
                 'land_surface',
                 'garden',
                 'garden_surface',
                 'fully_equipped_kitchen',
                 'swimming_pool',
                 'furnished',
                 'open_fire',
                 'terrace',
                 'terrace_surface',
                 'number_of_facades',
                 'state_of_the_building'                  
                 ]]

In [7]:
df_test_api

Unnamed: 0,id,price,surface,type_of_property,number_of_bedrooms,postal_code,land_surface,garden,garden_surface,fully_equipped_kitchen,swimming_pool,furnished,open_fire,terrace,terrace_surface,number_of_facades,state_of_the_building
0,10131114,100000,150,HOUSE,2,5377,198,0,0,0.57,0,0,0,1,0,3,0.56664
1,10150865,219000,100,APARTMENT,2,5500,0,0,0,1.00,0,0,0,1,0,3,0.79285
3,10022778,285000,81,APARTMENT,2,5030,0,0,0,1.00,0,0,0,1,6,2,1.00000
4,9989192,284000,150,APARTMENT,2,5060,0,0,0,0.75,0,0,0,1,16,3,1.00000
5,9951165,179000,150,HOUSE,2,5170,1013,1,0,0.57,0,0,0,0,0,4,0.56664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62422,10121854,439900,258,HOUSE,4,9160,969,1,0,0.50,0,0,0,1,0,4,0.65376
62424,10117346,425000,135,HOUSE,4,9040,80,1,12,0.75,0,0,0,1,7,2,1.00000
62427,10103172,235000,170,HOUSE,1,9470,170,1,55,0.75,0,0,0,1,30,2,1.00000
62428,9617011,316200,102,APARTMENT,2,9300,0,0,0,0.50,0,0,0,1,0,1,1.00000


In [8]:
# Columns kept for modelling. Order is load-bearing:
#   * 'price' must stay first — the model cell takes column 0 as the target and every
#     following column as a feature;
#   * transform_elment() builds its feature list in this same order.
# Commented-out names are columns currently excluded from the model.
filtered_atributes = [
    'price',
    #'id',
 #'locality',
 #'postal_code',
 # 'region',
 # 'province',
 # 'type_of_property',
 #'subtype_of_property',
 #'type_of_sale',
 'number_of_bedrooms',
 'surface',
 #'kitchen_type',
 'fully_equipped_kitchen',
 #'furnished',
 'open_fire',
 #'terrace',
 'terrace_surface',
 'garden',
 #'garden_surface',
 #'land_surface',
 'number_of_facades',
 'swimming_pool',
 'state_of_the_building',
 #'zip_code_xx',
 #'price_m2',
 'zip_code_ratio',
'HOUSE',
'APARTMENT']

In [9]:
df = df[filtered_atributes]

In [10]:
df_shape_final = df.shape

In [11]:
print(f'initial_df {df_shape_initial} claned_df {df_shape_final}')

initial_df (62430, 23) claned_df (43122, 13)


In [12]:
df.corr()['price']

price                     1.000000
number_of_bedrooms        0.409657
surface                   0.524249
fully_equipped_kitchen    0.212025
open_fire                 0.143235
terrace_surface           0.173810
garden                    0.136044
number_of_facades         0.205666
swimming_pool             0.138481
state_of_the_building     0.218567
zip_code_ratio            0.376468
HOUSE                     0.152233
APARTMENT                -0.152233
Name: price, dtype: float64

In [13]:
df

Unnamed: 0,price,number_of_bedrooms,surface,fully_equipped_kitchen,open_fire,terrace_surface,garden,number_of_facades,swimming_pool,state_of_the_building,zip_code_ratio,HOUSE,APARTMENT
0,100000,2,150,0.57,0,0,0,3,0,0.56664,0.75,1.0,0.0
1,219000,2,100,1.00,0,0,0,3,0,0.79285,0.74,0.0,1.0
3,285000,2,81,1.00,0,6,0,2,0,1.00000,1.07,0.0,1.0
4,284000,2,150,0.75,0,16,0,3,0,1.00000,0.82,0.0,1.0
5,179000,2,150,0.57,0,0,1,4,0,0.56664,0.95,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62422,439900,4,258,0.50,0,0,1,4,0,0.65376,1.16,1.0,0.0
62424,425000,4,135,0.75,0,7,1,2,0,1.00000,1.28,1.0,0.0
62427,235000,1,170,0.75,0,30,1,2,0,1.00000,0.95,1.0,0.0
62428,316200,2,102,0.50,0,0,0,1,0,1.00000,1.03,0.0,1.0


Initial correlation with price
price                     1.000000
number_of_bedrooms        0.409657
surface                   0.524249
fully_equipped_kitchen    0.156355
furnished                -0.003122
open_fire                 0.143235
terrace                   0.119998
terrace_surface           0.173810
garden                    0.136044
garden_surface            0.098383
land_surface              0.056684
number_of_facades         0.205666
swimming_pool             0.138481
state_of_the_building     0.218567
zip_code_ratio            0.376310

#### **House model**

In [14]:
# Positional split of the frame: column 0 ('price') is the target, every later column is a feature.
X = df.iloc[:,1:].values  #features
Y = df.iloc[:,0].values  #target : price

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(34497, 12)
(8625, 12)
(34497,)
(8625,)


In [16]:
# Standardize the features. The scaler is fitted on the training split only and then applied
# to both splits, so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Polynomial regression: expand the standardized features into all terms up to degree 3,
# then fit an ordinary least-squares model on the expanded matrix.
poly_features = PolynomialFeatures(degree=3)
# Fit the expansion on the training split only ...
X_train_poly = poly_features.fit_transform(X_train)
# ... and reuse the fitted transformer on the test split; a fitted transformer must never be
# re-fitted on evaluation data.
X_test_poly = poly_features.transform(X_test)

# Fit the linear regression on the expanded training features.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, Y_train)

# Predictions on the training split (to measure fit) and the held-out split (to measure generalization).
y_train_predicted = poly_model.predict(X_train_poly)
y_test_predict = poly_model.predict(X_test_poly)

In [18]:
y_train_predicted[1]

124909.25

In [19]:
Y_test[4]

291725

In [20]:
y_test_predict[4]

271750.75

In [21]:
# evaluating the model on training dataset
rmse_train = np.sqrt(mean_squared_error(Y_train, y_train_predicted))
r2_train = r2_score(Y_train, y_train_predicted)

In [22]:
# evaluating the model on test dataset
rmse_test = np.sqrt(mean_squared_error(Y_test, y_test_predict))
r2_test = r2_score(Y_test, y_test_predict)

In [23]:
result = {'rmse_train':round(rmse_train,2),'r2_train':round(r2_train,2),'rmse_test':round(rmse_test,2),'r2_test':round(r2_test,2)}

In [24]:
result

{'rmse_train': 81196.42,
 'r2_train': 0.72,
 'rmse_test': 85317.24,
 'r2_test': 0.7}

## Save model

Save the model using pickle

In [25]:
import pickle

In [26]:
# Persist the fitted regression so it can be loaded without retraining.
# NOTE(review): only the LinearRegression is saved. It was fitted on standardized, degree-3
# polynomial features, so whoever loads this file also needs the fitted `scaler` and
# `poly_features` to prepare inputs — consider persisting them as well.
filename = 'immo_house_model.pkl'
# The context manager closes the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(poly_model, model_file)

# **Preprocessing**

Clean the incoming JSON payload and transform it into a feature list,
in the same column order that was used to train the ML model.

In [27]:
import json

In [28]:
# Sample request payload used to exercise transform_elment(); None marks a field left unspecified.
input_property_json = {
  "data": {
    "area": 150,
    "property_type": "APARTMENT",
    "rooms_number": 2,
    "zip_code": 5377,
    "land_area": 198,
    "garden": False,
    "garden_area": 0,
    "equipped_kitchen": None,
    "full_address": "Heure 134 Montgomery strt",
    "swimming_pool": False,
    "furnished": False,
    "open_fire": False,
    "terrace": False,
    "terrace_area": 0,
    "facades_number": None,
    "building_state": None 
  }
}

In [29]:
with open("preprocessing/test_property.json", "w") as outfile:
    json.dump(input_property_json, outfile)

In [30]:
with open("preprocessing/test_property.json", 'r') as openfile:
    json_object = json.load(openfile)

In [31]:
json_object

{'data': {'area': 150,
  'property_type': 'APARTMENT',
  'rooms_number': 2,
  'zip_code': 5377,
  'land_area': 198,
  'garden': False,
  'garden_area': 0,
  'equipped_kitchen': None,
  'full_address': 'Heure 134 Montgomery strt',
  'swimming_pool': False,
  'furnished': False,
  'open_fire': False,
  'terrace': False,
  'terrace_area': 0,
  'facades_number': None,
  'building_state': None}}

In [32]:
# Price-per-m² ratio for each two-character zip prefix.
# NOTE(review): the training cell keys its ratios on the first THREE characters of the postal
# code, while this table is keyed on two — confirm which granularity the model should see.
ZIP_CODE_RATIO_BY_PREFIX = {
    'be_zip_10': 1.53,
    'be_zip_11': 1.68,
    'be_zip_12': 1.66,
    'be_zip_13': 1.29,
    'be_zip_14': 1.18,
    'be_zip_15': 1.24,
    'be_zip_16': 1.31,
    'be_zip_17': 1.23,
    'be_zip_18': 1.22,
    'be_zip_19': 1.5,
    'be_zip_20': 1.53,
    'be_zip_21': 1.17,
    'be_zip_22': 1.13,
    'be_zip_23': 1.12,
    'be_zip_24': 1.03,
    'be_zip_25': 1.24,
    'be_zip_26': 1.27,
    'be_zip_27': 1.11,
    'be_zip_28': 1.22,
    'be_zip_29': 1.3,
    'be_zip_30': 1.58,
    'be_zip_31': 1.18,
    'be_zip_32': 1.1,
    'be_zip_33': 1.07,
    'be_zip_34': 0.87,
    'be_zip_35': 1.13,
    'be_zip_36': 1.0,
    'be_zip_37': 0.9,
    'be_zip_38': 0.94,
    'be_zip_39': 1.0,
    'be_zip_40': 0.93,
    'be_zip_41': 0.85,
    'be_zip_42': 0.86,
    'be_zip_43': 0.87,
    'be_zip_44': 0.81,
    'be_zip_45': 0.76,
    'be_zip_46': 0.95,
    'be_zip_47': 0.98,
    'be_zip_48': 0.85,
    'be_zip_49': 0.94,
    'be_zip_50': 0.97,
    'be_zip_51': 1.0,
    'be_zip_52': 0.77,
    'be_zip_53': 0.87,
    'be_zip_54': 0.77,
    'be_zip_55': 0.76,
    'be_zip_56': 0.67,
    'be_zip_57': 0.77,
    'be_zip_58': 0.77,
    'be_zip_59': 0.77,
    'be_zip_60': 0.64,
    'be_zip_61': 0.74,
    'be_zip_62': 0.78,
    'be_zip_63': 0.69,
    'be_zip_64': 0.66,
    'be_zip_65': 0.67,
    'be_zip_66': 0.91,
    'be_zip_67': 0.97,
    'be_zip_68': 0.84,
    'be_zip_69': 0.83,
    'be_zip_70': 0.8,
    'be_zip_71': 0.69,
    'be_zip_72': 0.67,
    'be_zip_73': 0.58,
    'be_zip_75': 0.86,
    'be_zip_76': 0.66,
    'be_zip_77': 0.79,
    'be_zip_78': 0.91,
    'be_zip_79': 0.66,
    'be_zip_80': 1.34,
    'be_zip_81': 1.25,
    'be_zip_82': 1.32,
    'be_zip_83': 2.12,
    'be_zip_84': 1.43,
    'be_zip_85': 1.06,
    'be_zip_86': 1.61,
    'be_zip_87': 1.16,
    'be_zip_88': 0.98,
    'be_zip_89': 0.95,
    'be_zip_90': 1.46,
    'be_zip_91': 1.13,
    'be_zip_92': 1.11,
    'be_zip_93': 1.03,
    'be_zip_94': 1.0,
    'be_zip_95': 0.96,
    'be_zip_96': 0.94,
    'be_zip_97': 1.11,
    'be_zip_98': 1.27,
    'be_zip_99': 1.16,
}

# Score of each building state accepted in the payload.
BUILDING_STATE_SCORE = {
    "NEW": 1.0,
    "GOOD": 0.79285,
    "TO RENOVATE": 0.56664,
    "JUST RENOVATED": 0.93115,
    "TO REBUILD": 0.46920,
}

# Score used when the payload gives no building state (same value the training data maps "NO_INFO" to).
BUILDING_STATE_UNKNOWN_SCORE = 0.87252


def transform_elment(element):
    """Turn one property payload into the model's feature list.

    Args:
        element: dict with a "data" dict inside. "area", "rooms_number", "zip_code" and
            "property_type" are required. "equipped_kitchen", "open_fire", "garden",
            "swimming_pool", "terrace_area", "facades_number" and "building_state" are
            optional: a missing key is treated exactly like an explicit None.

    Returns:
        list of 12 values, in this order: number_of_bedrooms, surface, fully_equipped_kitchen,
        open_fire, terrace_surface, garden, number_of_facades, swimming_pool,
        state_of_the_building, zip_code_ratio, HOUSE, APARTMENT.

    Raises:
        KeyError: if a required field is missing, if the first two characters of the zip code
            have no entry in ZIP_CODE_RATIO_BY_PREFIX, or if "building_state" is a string
            that is not a key of BUILDING_STATE_SCORE.
    """
    data = element["data"]

    def as_flag(value):
        # Only True (or a value equal to it, such as 1) counts as set; None and False give 0.
        return 1 if value == True else 0

    property_type = data["property_type"]

    # Missing facade count: default to 1 for apartments and 2 for anything else.
    facades = data.get("facades_number")
    if facades is None:
        facades = 1 if property_type == 'APARTMENT' else 2

    terrace_area = data.get("terrace_area")
    if terrace_area is None:
        terrace_area = 0

    building_state = data.get("building_state")
    if building_state is None:
        state_score = BUILDING_STATE_UNKNOWN_SCORE
    else:
        state_score = BUILDING_STATE_SCORE[building_state]

    transformed_elment = {
        'number_of_bedrooms': data["rooms_number"],
        'surface': data["area"],
        # NOTE(review): the training data encodes an unknown kitchen as 0.5, whereas an
        # unknown (None) kitchen becomes 0 here — confirm which encoding the model expects.
        'fully_equipped_kitchen': as_flag(data.get("equipped_kitchen")),
        'open_fire': as_flag(data.get("open_fire")),
        'terrace_surface': terrace_area,
        'garden': as_flag(data.get("garden")),
        'number_of_facades': facades,
        'swimming_pool': as_flag(data.get("swimming_pool")),
        'state_of_the_building': state_score,
        'zip_code_ratio': ZIP_CODE_RATIO_BY_PREFIX['be_zip_' + str(data["zip_code"])[:2]],
        'HOUSE': 1 if property_type == 'HOUSE' else 0,
        'APARTMENT': 1 if property_type == 'APARTMENT' else 0,
    }
    # Dicts preserve insertion order, so the values come out in the order listed above.
    return list(transformed_elment.values())

In [33]:
x = transform_elment(json_object)

In [34]:
x

[2, 150, 0, 0, 0, 0, 1, 0, 0.87252, 0.87, 0, 1]

# **Predict**

In [35]:
from fastapi import FastAPI
from pydantic import BaseModel
import pickle
import pandas as pd

# Load the regression pickled earlier in this notebook.
# NOTE: unpickling can execute arbitrary code — only load pickle files you produced yourself.
with open('immo_house_model.pkl', 'rb') as f: 
    house_model = pickle.load(f)

In [36]:
app = FastAPI()

In [37]:
@app.get('/')
async def root_endpoint():
    """GET / — fixed greeting, usable as a liveness check.

    Named differently from the POST handler so the two definitions no longer shadow each other.
    """
    return {"hello":"world"}

# NOTE(review): `ScoringItem` is not defined anywhere in this notebook; until a pydantic model
# with that name exists, this cell fails with a NameError when the function is defined.
# NOTE(review): `house_model` was fitted on standardized, degree-3 polynomial features; the raw
# item fields are passed to it unchanged here, so the fitted scaler and polynomial transformer
# still have to be applied before predicting.
@app.post('/')
async def scoring_endpoint(item:ScoringItem):
    """POST / — predict a price for one item.

    Args:
        item: request body; its fields become the columns of a one-row DataFrame.

    Returns:
        dict with "prediction" (the prediction truncated to int) and "status_code"
        (currently the placeholder string "?").
    """
    features = item.dict()
    # One row whose columns are the item's field names.
    df = pd.DataFrame([features])
    yhat = house_model.predict(df)
    status_code = "?"
    # predict() returns an array with one entry per row; take the single entry explicitly
    # instead of converting the whole array to int.
    return {"prediction": int(yhat[0]), "status_code": str(status_code)}

In [38]:
def predict(clean_data):

SyntaxError: incomplete input (14203027.py, line 1)

# **App**