# Data Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

We'll use the data cleaned by all the nan values in the target 'price'

In [2]:
data = pd.read_csv('property24_data.csv')
# Removing the price set as nan
data = data[data['Price'].notna()]
data

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garages,Listing Date,Listing Number,Pet Friendly,Price,Town,...,Secure Parking,Parking,No Pets Allowed,Furnished,Flatlet,Fibre Internet,Furnished (Optional),ADSL Internet,Satellite Internet,Fixed WiMax Internet
0,3.0,3.0,304 m²,326 m²,2.0,10 July 2020,108859219,1.0,6480000.0,"Tokai, Cape Town",...,,,,,,,,,,
1,3.0,3.0,,140 m²,2.0,23 September 2020,109153971,1.0,6250000.0,"14 Graham Road, Sea Point, Cape Town",...,,,,,,,,,,
2,3.5,4.0,659 m²,300 m²,2.0,23 September 2020,109154367,,13950000.0,"Camps Bay, Cape Town",...,2.0,,,,,,,,,
3,1.0,2.0,,,,23 September 2020,109154455,,1050000.0,"Ottery, Cape Town",...,,1.0,,,,,,,,
4,1.0,1.0,,61 m²,,23 July 2020,108907258,,1265000.0,"Muizenberg, Cape Town",...,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7896,2.0,2.0,140 m²,,1.0,26 September 2019,107980205,,3500000.0,"Wynberg, Cape Town",...,,,1.0,,,,,,,
7897,2.0,4.0,1 100 m²,120 m²,,19 August 2019,107846534,1.0,2100000.0,"Grassy Park, Cape Town",...,,12.0,,,,,,,,
7898,2.0,2.0,,225 m²,,15 May 2018,106337080,,8500000.0,"Cape Town City Centre, Cape Town",...,,2.0,1.0,,,,,,,
7899,2.0,3.0,116 m²,126 m²,,15 September 2020,109120907,1.0,2799000.0,"19 Coventry Road, Woodstock, Cape Town",...,,,,,,,,,,


## Split data

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train_set,test_set=train_test_split(data,test_size=0.2,random_state=26) #We choose a 80% training set and 20% test set
test_set

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garages,Listing Date,Listing Number,Pet Friendly,Price,Town,...,Secure Parking,Parking,No Pets Allowed,Furnished,Flatlet,Fibre Internet,Furnished (Optional),ADSL Internet,Satellite Internet,Fixed WiMax Internet
5678,2.0,1.0,,61 m²,1.0,12 March 2020,108563374,,1995000.0,"17 Cambridge Street, Walmer Estate, Cape Town",...,,1.0,,,,1.0,,,,
2645,2.5,4.0,,,2.0,02 September 2020,109057291,,14950000.0,"Camps Bay, Cape Town",...,,,,,,,,,,
4553,2.0,2.0,,198 m²,,24 February 2020,108480262,,36900000.0,"30 Victoria Road, Clifton, Cape Town",...,2.0,,,,,,,,,
2220,2.0,3.0,250 m²,,1.0,23 August 2020,109020308,,1575000.0,"Muizenberg, Cape Town",...,,1.0,,,,,,,,
965,2.0,4.0,,261 m²,2.0,30 July 2020,108932680,1.0,45000000.0,"42 Beach Road, Mouille Point, Cape Town",...,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7154,1.0,1.0,41 m²,41 m²,1.0,22 January 2020,108348257,1.0,2495000.0,"104 The Vera, 19 Davenport Road, Vredehoek, Ca...",...,,,,,,1.0,,,,
7618,4.0,5.0,2 782 m²,,2.0,16 May 2018,106341069,,11900000.0,"Bishopscourt, Cape Town",...,,2.0,,,,,,,,
3237,3.0,3.0,446 m²,,2.0,23 July 2020,108904885,1.0,7600000.0,"Newlands, Cape Town",...,,,,,,,,,,
2325,1.0,1.0,,40 m²,,19 August 2020,107382641,,1385000.0,"Observatory, Cape Town",...,,1.0,,,,,,,,


## Cleaning missing values and useless features

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline

In [6]:
class Cleaning(BaseEstimator,TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,df,y=None):
        return self
    
    def transform(self,df, y=None):
        
        # Droping columns with high number of NAN
        df = df[df.columns[df.isna().sum()/df.shape[0] < 0.8]]
        df.drop(['Listing Date', 'Listing Number', 'Town'], axis = 1, inplace = True)
        
        # Changing the type to float
        df['Erf Size'] = df['Erf Size'].str.strip('m²').str.replace(' ','').str.replace('nan','np.nan')
        df['Floor Size'] = df['Floor Size'].str.strip('m²').str.replace(' ','')
        # Select lines with ha
        replace_dict = {'ha': '*(1e4)'}
        df.loc[df['Erf Size'].notna(), 'Erf Size'] = df.loc[df['Erf Size'].notna(), 'Erf Size'].replace(replace_dict, regex = True).map(pd.eval)
        df['Floor Size'] = df['Floor Size'].astype(float)
        df['Erf Size'] = df['Erf Size'].astype(float)
        
        # We can see that NAN with binar features must be 0
        for col in ['Garages','Pet Friendly','Garden','Pool', 'Parking']:
            df[col] = df[col].fillna(0)
       
        return df
        
    def fit_transform(self,df, y=None ):
        self.fit(df, y)
        return self.transform(df, y) 

In [7]:
class Size(BaseEstimator,TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X, y=None):
        
        Both_null = X.index[(X['Erf Size'].isna()) & (X['Floor Size'].isna())]
        for indice in Both_null:
            X = X.drop(indice)
        
        
        #Erf_null_apartment = X.index[(X['Type of Property'] == 'Apartment / Flat') & (X['Erf Size'].isna())]
        
        #for indice in Erf_null_apartment:
        #    X.loc[indice, 'Erf Size'] = X.loc[indice, 'Floor Size']
        
        Erf_null = X.index[(X['Erf Size'].notna()) &  (X['Erf Size']>=0)]
        #X.loc[Erf_null,:]
        #print(Erf_null)
        #print(X.loc[7714,:])
        
       
        return X
        
    def fit_transform(self,X, y=None ):
        self.fit(X, y)
        return self.transform(X, y)

## Outliers

In [8]:
class Outliers_removal(BaseEstimator,TransformerMixin):
    
    def __init__(self, method = 'IQR', q1=0, q3=0):
        self.method=method
        self.q1 = q1
        self.q3 = q3
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X, y=None):
        
        # Price outliers
        if self.method == 'IQR':
            self.q1 = 0.01
            self.q3 = 0.99
        
        if self.method == 'Big':
            self.q1 = 0
            self.q3 = 0.99
            
        q_i = X['Price'].quantile(self.q1)
        q_f = X['Price'].quantile(self.q3)
        mask_price = X['Price'].between(q_i, q_f, inclusive = True) 
            
        X = X.loc[mask_price, :]
        
        # Bathrooms outliers
        outlier = X['Bathrooms'].quantile(1 - 1/(X.shape[0]))
        mask_bathrooms = X['Bathrooms'].between(0, outlier, inclusive = True)
        
        X = X.loc[mask_bathrooms, :]
        
        # Floor Size outliers
        outlier = X['Floor Size'].quantile(1 - 0.1/(X.shape[0]))
        mask_floor_size = X['Floor Size'].between(0, outlier, inclusive = True)
        
        X = X.loc[mask_floor_size, :]
        
        # Erf Size outliers
        outlier = X['Erf Size'].quantile(1 - 1/(X.shape[0]))
        mask_erf_size = X['Erf Size'].between(0, outlier, inclusive = True)
        
        #X = X.loc[mask_erf_size, :]
        
        # Bathrooms outliers
        outlier = X['Garages'].quantile(1 - 1/(X.shape[0]))
        mask_garages = X['Garages'].between(0, outlier, inclusive = True)
        
        #X = X.loc[mask_garages, :]
        
        # Bathrooms outliers
        outlier = X['Parking'].quantile(1 - 2/(X.shape[0]))
        mask_parking = X['Parking'].between(0, outlier, inclusive = True)
        
        #X = X.loc[mask_parking, :]
        
        return X  
            

# Creating the pipeline

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

All together in a pipeline

In [10]:
pre_pipeline = make_pipeline(Cleaning(), Size() , Outliers_removal())
pre_pipeline

Pipeline(steps=[('cleaning', Cleaning()), ('size', Size()),
                ('outliers_removal', Outliers_removal())])

In [11]:
df_cle = pre_pipeline.fit_transform(train_set)
df_cle

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garages,Pet Friendly,Price,Type of Property,Garden,Pool,Parking
2485,2.0,4.0,448.0,185.0,2.0,0.0,2550000.0,House,0.0,0.0,0.0
982,1.0,1.0,,61.0,0.0,0.0,3191550.0,Apartment / Flat,0.0,1.0,1.0
7176,2.0,2.0,,138.0,1.0,0.0,4800000.0,Apartment / Flat,0.0,1.0,0.0
852,1.0,0.5,,39.0,1.0,0.0,2175000.0,Apartment / Flat,0.0,1.0,1.0
7803,1.0,2.0,,59.0,0.0,0.0,2200000.0,Apartment / Flat,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
7293,1.0,1.0,94.0,94.0,0.0,0.0,3500000.0,Apartment / Flat,0.0,1.0,0.0
2005,2.0,4.0,1182.0,171.0,2.0,0.0,15500000.0,House,0.0,0.0,0.0
1471,2.0,3.0,1020.0,350.0,1.0,0.0,4395000.0,House,1.0,1.0,4.0
5479,1.0,1.0,,64.0,1.0,0.0,3750000.0,Apartment / Flat,1.0,0.0,1.0


In [12]:
target = df_cle["Price"].copy()
df_cle.drop("Price", axis=1, inplace = True)
df_cle

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garages,Pet Friendly,Type of Property,Garden,Pool,Parking
2485,2.0,4.0,448.0,185.0,2.0,0.0,House,0.0,0.0,0.0
982,1.0,1.0,,61.0,0.0,0.0,Apartment / Flat,0.0,1.0,1.0
7176,2.0,2.0,,138.0,1.0,0.0,Apartment / Flat,0.0,1.0,0.0
852,1.0,0.5,,39.0,1.0,0.0,Apartment / Flat,0.0,1.0,1.0
7803,1.0,2.0,,59.0,0.0,0.0,Apartment / Flat,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
7293,1.0,1.0,94.0,94.0,0.0,0.0,Apartment / Flat,0.0,1.0,0.0
2005,2.0,4.0,1182.0,171.0,2.0,0.0,House,0.0,0.0,0.0
1471,2.0,3.0,1020.0,350.0,1.0,0.0,House,1.0,1.0,4.0
5479,1.0,1.0,,64.0,1.0,0.0,Apartment / Flat,1.0,0.0,1.0


## Scaling and Encoding

In [13]:
num_pipeline=Pipeline([
                        ('std_scaler',StandardScaler()),
                        ('imputer',KNNImputer(n_neighbors=13)),
                        #('polynomial',PolynomialFeatures())
                      ])
num_pipeline

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('imputer', KNNImputer(n_neighbors=13))])

In [14]:
df_num = df_cle.drop('Type of Property', axis = 1)
num_attribs=list(df_num)
cat_attrib=['Type of Property']
impute_pipeline=ColumnTransformer([('num',num_pipeline,num_attribs),
                                   ('cat',OneHotEncoder(),cat_attrib)
                                ])

impute_pipeline = make_pipeline(impute_pipeline)
impute_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('std_scaler',
                                                                   StandardScaler()),
                                                                  ('imputer',
                                                                   KNNImputer(n_neighbors=13))]),
                                                  ['Bathrooms', 'Bedrooms',
                                                   'Erf Size', 'Floor Size',
                                                   'Garages', 'Pet Friendly',
                                                   'Garden', 'Pool',
                                                   'Parking']),
                                                 ('cat', OneHotEncoder(),
                                                  ['Type of Property'])]))])

In [15]:
df_pre = impute_pipeline.fit_transform(df_cle)
df_pre

array([[ 0.1064312 ,  1.25050598, -0.04130842, ...,  0.        ,
         1.        ,  0.        ],
       [-0.74781432, -0.97985672, -0.0124706 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.1064312 , -0.23640249, -0.03592825, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.1064312 ,  0.50705175, -0.0339972 , ...,  0.        ,
         1.        ,  0.        ],
       [-0.74781432, -0.97985672,  0.77978098, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.1064312 ,  0.50705175, -0.04343021, ...,  0.        ,
         1.        ,  0.        ]])

## Model Selecting 

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error

In [17]:
Forest = RandomForestRegressor()

In [18]:
def evaluate(model,CV=10):
    neg_scores=cross_val_score(model,df_pre,target,scoring="neg_mean_squared_error",cv=CV)
    scores=np.sqrt(-neg_scores)
    y_pred = model.predict(df_pre)
    R2_score = r2_score(target,y_pred)
    RMSLE = mean_squared_log_error(target,y_pred)
    print('r2 socre = ',R2_score)
    print('root_mean_squared_log_error =', np.sqrt(RMSLE))
    print("\nTrain error : ",np.sqrt(mean_squared_error(target,y_pred)))
    print("\nValidation score : ",scores.mean())
    print("\nStandard deviation : ",scores.std())

In [27]:
# try 12 (3×4) combinations of hyperparameters
param_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}

grid = GridSearchCV(Forest, param_grid, cv=5,
                    scoring='neg_mean_squared_error',
                    return_train_score=True)

grid.fit(df_pre, target)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': [2, 4, 6, 8],
                         'n_estimators': [3, 10, 30]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [28]:
final_model = grid.best_estimator_
final_model

RandomForestRegressor(max_features=6, n_estimators=30)

In [30]:
pipeline = make_pipeline(impute_pipeline, final_model)

In [31]:
pipeline.fit(df_cle, target)

Pipeline(steps=[('pipeline',
                 Pipeline(steps=[('columntransformer',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('std_scaler',
                                                                                    StandardScaler()),
                                                                                   ('imputer',
                                                                                    KNNImputer(n_neighbors=13))]),
                                                                   ['Bathrooms',
                                                                    'Bedrooms',
                                                                    'Erf Size',
                                                                    'Floor '
                                                                    'Size',
                                     

In [32]:
pipeline.score(df_cle, target)

0.9493364963578402

In [33]:
test_cle = pre_pipeline.transform(test_set)

In [34]:
y_test = test_cle["Price"].copy()
test_cle.drop("Price", axis=1, inplace = True)
test_cle

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garages,Pet Friendly,Type of Property,Garden,Pool,Parking
5678,2.0,1.0,,61.0,1.0,0.0,Apartment / Flat,0.0,0.0,1.0
6471,1.0,2.0,78.0,74.0,0.0,1.0,Apartment / Flat,0.0,0.0,0.0
379,2.5,3.0,,114.0,1.0,1.0,Apartment / Flat,0.0,0.0,0.0
6609,1.0,1.0,,48.0,2.0,0.0,Apartment / Flat,0.0,1.0,0.0
5964,1.0,2.0,,81.0,1.0,0.0,Apartment / Flat,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5591,3.5,3.0,,267.0,0.0,0.0,Apartment / Flat,0.0,1.0,0.0
790,1.5,2.0,,109.0,1.0,0.0,Apartment / Flat,0.0,0.0,0.0
7154,1.0,1.0,41.0,41.0,1.0,1.0,Apartment / Flat,0.0,0.0,0.0
2325,1.0,1.0,,40.0,0.0,0.0,Apartment / Flat,0.0,0.0,1.0


In [35]:
pipeline.score(test_cle,y_test)

0.7437381173621294

In [36]:
pipeline.predict(test_cle)

array([2361597.22222222, 2139466.66666667, 3590658.33333333, ...,
       2306888.88888889, 1655788.88888889, 3984366.66666667])

# Exporting model

In [37]:
import pickle

In [38]:
pickle.dump(pipeline, open('model.pkl', 'wb'))