# **Machine Learning notebook for Saudi Real Estate Classifier**

# Import Libraries

In [1]:
# import library cell
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import scipy.stats as stats

Here, we Import all libraries that we need to complete our ML model in all different steps.

---



# Import our Dataset

In [5]:
Q2Frame = pd.read_csv('./docrealestatesale_2023_q2.csv')
Q3Frame = pd.read_csv('./docrealestatesale_2023_q3.csv')
result = [Q2Frame, Q3Frame]
frame = pd.concat(result)
frame['عدد العقارات'].max()


36

We import our Dataset as CSV then fit it to Pandas read function to convert it to Dataframe.

---



# Exploratory Data Analysis [EDA]

## 1. The number of records and features (The Shape) of the dataframe

In [272]:
records, features = frame.shape
print(f"Number of records: {records}")
print(f"Number of features: {features}")

Number of records: 87356
Number of features: 8


## 2. The datatypes of the dataframe

In [273]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87356 entries, 0 to 48544
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   المنطقة         87356 non-null  object 
 1   المدينة         87356 non-null  object 
 2   المدينة / الحي  87356 non-null  object 
 3   تصنيف العقار    87356 non-null  object 
 4   نوع العقار      87356 non-null  object 
 5   عدد العقارات    87356 non-null  int64  
 6   السعر           87356 non-null  float64
 7   المساحة         87356 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 6.0+ MB


## 3. The statistical description of the dataframe

In [274]:
frame.describe()

Unnamed: 0,عدد العقارات,السعر,المساحة
count,87356.0,87356.0,87356.0
mean,1.029557,1145419.0,4705.424
std,0.490254,17198960.0,68488.59
min,1.0,5000.0,0.0
25%,1.0,120900.0,300.0
50%,1.0,399000.0,500.0
75%,1.0,815000.0,705.15
max,36.0,2657190000.0,4100323.0


## 4. Checking for null values in the dataframe

In [275]:
nullCount=frame.isnull().sum()
print(f"Total Null Values:\n{nullCount}")

Total Null Values:
المنطقة           0
المدينة           0
المدينة / الحي    0
تصنيف العقار      0
نوع العقار        0
عدد العقارات      0
السعر             0
المساحة           0
dtype: int64


## 5. Correlation matrix

In [18]:
#corr=frame.corr()
#print(f"Correlation Matrix: \n{corr}")

## 6. Visualizing of Correlation Matrix

In [19]:
#rs = np.random.RandomState(33)
#mask = np.triu(np.ones_like(corr, dtype=bool))
#f, ax = plt.subplots(figsize=(11, 9))
#cmap = sns.diverging_palette(230, 20, as_cmap=True)
#sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Data preprocessing and Feature engineering


## Feature Extraction

### Because we already did the Correlation matrix we figure out the rainful feature is the least related to the other features.

## Cleaning outliers and imputations data

### Becuase we already did Exploratory Data Analysis we figure out that we did not have any missing or outliers in our dataset that need to be filled or fixed.

## Tranformation

In [276]:
encoder = LabelEncoder()
frame["المدينة"] = encoder.fit_transform(frame["المدينة"])
frame["المدينة / الحي"] = encoder.fit_transform(frame["المدينة / الحي"])
frame["المنطقة"] = encoder.fit_transform(frame["المنطقة"])
frame["تصنيف العقار"] = encoder.fit_transform(frame["تصنيف العقار"])
frame["نوع العقار"] = encoder.fit_transform(frame["نوع العقار"])
bins = [4999, 10000,50000, 100000,200000,300000,400000,500000,600000,700000,800000,900000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000, 4500000, 5000000, 5500000, 6000000,6500000,7000000,7500000,8000000,8500000,9000000,9500000, 10000000,frame["السعر"].max() ]

# Create a new column with interval labels
frame['price_interval'] = pd.cut(frame['السعر'], bins)
frame['price_interval']=encoder.fit_transform(frame['price_interval'])
frame.where(frame['السعر']>10000000).dropna()


Unnamed: 0,المنطقة,المدينة,المدينة / الحي,تصنيف العقار,نوع العقار,عدد العقارات,السعر,المساحة,price_interval
9,3.0,36.0,1080.0,1.0,7.0,1.0,18300000.0,1190.00,30.0
91,3.0,36.0,1002.0,1.0,7.0,13.0,111000000.0,12150.00,30.0
390,3.0,30.0,794.0,3.0,7.0,1.0,16000000.0,10000.00,30.0
412,3.0,36.0,1108.0,1.0,7.0,1.0,12648204.0,843.21,30.0
751,4.0,23.0,594.0,1.0,7.0,4.0,11500000.0,10000.00,30.0
...,...,...,...,...,...,...,...,...,...
47998,11.0,101.0,2709.0,3.0,7.0,1.0,12300000.0,2050.00,30.0
48059,3.0,106.0,2965.0,2.0,0.0,1.0,15000000.0,50000.00,30.0
48125,11.0,101.0,2739.0,3.0,7.0,1.0,12353659.5,1680.77,30.0
48155,6.0,73.0,1607.0,1.0,7.0,1.0,14000000.0,5533.22,30.0


## Normalization

### Because our data do not have wide range and our work flow need use the data in similar, we do not need to perform normalization

# Split data into Training and Testing

## Define X values

In [278]:
X =frame.drop(columns=['السعر','price_interval'])



We will dropped the result because it belong to Y (The target) and Rainful feature because the result of the corrlation matrix and Season after testing that show in feature importans it effect the result negativly


---



## Define y value

In [279]:
y= frame['price_interval']

## Split and Assign Training and Testing

In [280]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1);

# Define algorithms

## Assign each algorithm to variable name

In [281]:
DT= DecisionTreeClassifier()
KNN= KNeighborsClassifier()
NB = GaussianNB()
SVMC =svm.SVC()
RFC = RandomForestClassifier()
ANN=MLPClassifier()

## Fitting X and y to each algorithm

In [282]:
DT.fit(X_train.values,y_train.values)
KNN.fit(X_train.values,y_train.values)
NB.fit(X_train.values,y_train.values)
SVMC.fit(X_train.values,y_train.values)
RFC.fit(X_train.values,y_train.values)
ANN.fit(X_train.values,y_train.values)

# Feature Selection
Only tree-based algorithms mainly focus on the feature importance technique to give a visualization of the tree structure.


---



## Decision Tree Feature Importance

In [283]:
DT.feature_importances_

array([0.04198201, 0.03825549, 0.35322782, 0.03286398, 0.01493581,
       0.00454879, 0.5141861 ])

## Random Forest Feature Importance

In [284]:
RFC.feature_importances_

array([0.03581902, 0.05818723, 0.36174335, 0.01170726, 0.01494943,
       0.00438064, 0.51321308])

# Model Development
In this section we will experiment different models and report observations in terms of predictions and training time


---



## Decision Tree

In [None]:
start = time.time()
DTRes=DT.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",DTRes)

Run time [s]:  0.0007200241088867188
predictions:  [1]


## K-nearest Neighbors

In [None]:
start = time.time()
KNNRes=KNN.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",KNNRes)

Run time [s]:  0.002438068389892578
predictions:  [1]


## Naive Bayes

In [None]:
start = time.time()
NBRes=NB.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",NBRes)

Run time [s]:  0.0007560253143310547
predictions:  [1]


## Random Forest

In [None]:
start = time.time()
RFCRes=RFC.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",RFCRes)

Run time [s]:  0.004540920257568359
predictions:  [1]


## Support Vector Machine

In [None]:
start = time.time()
SVMRes=SVMC.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",SVMRes)

Run time [s]:  0.011744976043701172
predictions:  [3]


## Artificial Neural Networks

In [None]:
start = time.time()
ANNRes=ANN.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",ANNRes)

Run time [s]:  0.0002837181091308594
predictions:  [1]


# Analysis of Decision Tree Before Hyper-parameter Tuning

## The depth of Decision Tree

In [None]:
print(DT.get_depth())

41


## The number of tree leaves

In [None]:
print(DT.get_n_leaves())

32124


## Average Accuracy

### %**79**

## Average Training Time

### **0.00096s**

# Hyper-parameter Tuning for Decision Tree

## Redefine Decision Tree

In [None]:
DT=DecisionTreeClassifier(criterion="entropy", splitter='best')

## Fitting X and y in Decision Tree

In [None]:
DT.fit(X_train.values,y_train.values)

## Predict from Decision Tree after Hyper-parameter Tuning

In [None]:
start = time.time()
DTRes=DT.predict([[4,144,3803,3,7,1,920.00]])
end = time.time()
print("Run time [s]: ",end-start)
print("predictions: ",DTRes)

# Analysis of Decision Tree After Hyper-parameter Tuning

## The depth of Decision Tree

In [None]:
print(DT.get_depth())

34


## The number of tree leaves

In [None]:
print(DT.get_n_leaves())

32117


## Average Accuracy

### **%80**

## Average Training Time

### **0.00085**

# Saving The model

In this section we will save the model that we wish to use per concluding results. When dealing with Scikit learn two popular approaches for saving the models are Joblib and Pickle. It is good to note that pickle sufferes from security issues. Here is an article to show the difference between the two:
https://mljar.com/blog/save-load-scikit-learn-model/

In [None]:
# library for save and load scikit-learn models
#import pickle

In [None]:
# file name, I'm using *.pickle as a file extension
filename = "Saudi_RealEstate_Classifier.pkl"

In [None]:
# save model
pickle.dump(DT, open(filename, "wb"))

In [None]:
# load model
loaded_model = pickle.load(open(filename, "rb"))

In [None]:
# you can use loaded model to compute predictions
y_predicted = loaded_model.predict([[4,144,3803,3,7,1,920.00]])
y_predicted[0]