# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("land_riyadh.csv")
df.shape

(7824, 8)

# Concatenating the two data

In [3]:
df2 = pd.read_csv("aqardata_2.csv")
df2 = df2.rename({'Pricepm':'pricepermeter'} ,axis=1)
df_concat = pd.concat([df,df2])
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10775 entries, 0 to 2950
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   mainlocation   10775 non-null  object
 1   sublocation    10100 non-null  object
 2   neighborhood   10775 non-null  object
 3   frontage       10775 non-null  object
 4   purpose        10428 non-null  object
 5   size           10775 non-null  int64 
 6   streetwidth    10772 non-null  object
 7   pricepermeter  10775 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 757.6+ KB


# Cleaning data

In [10]:
df_concat = df_concat.dropna()
df_no_dup = df_concat
numeric_columns = [ "size","streetwidth" ,"pricepermeter"]
nominal_columns = ["mainlocation" , "sublocation" , "neighborhood","frontage","purpose"]
df_no_dup = df_no_dup.query("streetwidth != 'غير محدد'")
df_no_dup = df_no_dup.query("purpose != 'غير محدد'")
df_no_dup = df_no_dup.drop_duplicates(keep='first',ignore_index =True)

In [11]:
df_no_dup['streetwidth'] = df_no_dup['streetwidth'].astype('int')
df_no_dup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   mainlocation   5852 non-null   object
 1   sublocation    5852 non-null   object
 2   neighborhood   5852 non-null   object
 3   frontage       5852 non-null   object
 4   purpose        5852 non-null   object
 5   size           5852 non-null   int64 
 6   streetwidth    5852 non-null   int64 
 7   pricepermeter  5852 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 365.9+ KB


In [12]:
df_no_dup = df_no_dup.reset_index(drop=True)
df_no_dup.head(1)

Unnamed: 0,mainlocation,sublocation,neighborhood,frontage,purpose,size,streetwidth,pricepermeter
0,الرياض,غرب الرياض,حي عرقة,3 شوارع,تجاري,2220,20,3550


# Preprocessing data

In [13]:
from sklearn.preprocessing import OneHotEncoder
onehotEncoder = OneHotEncoder()
onehotEncoder = onehotEncoder.fit(df_no_dup[nominal_columns])
onehot_result = onehotEncoder.transform(df_no_dup[nominal_columns]).toarray()

df_dummy = pd.get_dummies(df_no_dup[nominal_columns])
df_onehot = pd.DataFrame(onehot_result ,columns=df_dummy.columns)
df_onehot[numeric_columns] = df_no_dup[numeric_columns]
df_onehot.head(1)

Unnamed: 0,mainlocation_الرياض,mainlocation_جدة,sublocation_جنوب الرياض,sublocation_جنوب جدة,sublocation_شرق الرياض,sublocation_شمال الرياض,sublocation_شمال جدة,sublocation_غرب الرياض,sublocation_وسط الرياض,neighborhood_حي أحد,neighborhood_حي ابحر الجنوبية,neighborhood_حي ابحر الشمالية,neighborhood_حي اشبيلية,neighborhood_حي الازدهار,neighborhood_حي الاصالة,neighborhood_حي الامواج,neighborhood_حي الاندلس,neighborhood_حي البديعة,neighborhood_حي البساتين,neighborhood_حي البشائر,neighborhood_حي التعاون,neighborhood_حي الجزيرة,neighborhood_حي الجنادرية,neighborhood_حي الحائر,neighborhood_حي الحزم,neighborhood_حي الحمدانية,neighborhood_حي الحمراء,neighborhood_حي الخزامى,neighborhood_حي الخليج,neighborhood_حي الدار البيضاء,neighborhood_حي الدريهمية,neighborhood_حي الدفاع,neighborhood_حي الرائد,neighborhood_حي الربوة,neighborhood_حي الربيع,neighborhood_حي الرحمانية,neighborhood_حي الرفيعة,neighborhood_حي الرمال,neighborhood_حي الروابي,neighborhood_حي الروضة,...,neighborhood_حي بنبان,neighborhood_حي بني مالك,neighborhood_حي جرير,neighborhood_حي حطين,neighborhood_حي صلاح الدين,neighborhood_حي ضاحية نمار,neighborhood_حي طويق,neighborhood_حي طيبة,neighborhood_حي ظهرة البديعة,neighborhood_حي ظهرة لبن,neighborhood_حي ظهرة نمار,neighborhood_حي عرقة,neighborhood_حي عريض,neighborhood_حي عكاظ,neighborhood_حي عليشة,neighborhood_حي غرناطة,neighborhood_حي قرطبة,neighborhood_حي لبن,neighborhood_حي مركز الملك عبدالله للدراسات والبحوث,neighborhood_حي مريخ,neighborhood_حي مشرفة,neighborhood_حي منفوحة الجديدة,neighborhood_حي نمار,neighborhood_حي هيت,frontage_3 شوارع,frontage_4 شوارع,frontage_جنوب,frontage_جنوب شرقي,frontage_جنوب غربي,frontage_شرق,frontage_شمال,frontage_شمال شرقي,frontage_شمال غربي,frontage_غرب,purpose_تجاري,purpose_سكني,purpose_سكني أو تجاري,size,streetwidth,pricepermeter
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2220,20,3550


In [14]:
X = df_onehot.drop(["pricepermeter"], axis=1)
y = df_onehot["pricepermeter"]

print(X.shape)
print(y.shape)

(5852, 178)
(5852,)


# Splitting the data

In [15]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, 
                                                    random_state = 5)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 4681 samples.
Testing set has 1171 samples.


## Function to predict the new untrained data

In [21]:
def predict(main,sub,neig,front,purpose,size,street,model):
  er = onehotEncoder.transform([[main,sub,neig,front,purpose]]).toarray()
  return model.predict(np.concatenate((er,[[size,street]]), axis=1))

In [22]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

# Building Neural Network, and trining

In [None]:
model = tf.keras.Sequential([
       tf.keras.Input(shape=(178,)),
        tf.keras.layers.Dense(256, activation = 'relu'),
        tf.keras.layers.Dense(128, activation = 'relu'),
        tf.keras.layers.Dense(1)
])


model.compile(loss="mean_absolute_error" , optimizer="adam")

batch_size = 64
epochs = 600

his = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=epochs,
);

# Evaluating the model

In [19]:
model.evaluate(X_test, y_test)



473.50115966796875

In [23]:
prediction_train = model.predict(X_train)
prediction_test = model.predict(X_test)
print("Mean Absolute Error:" , 'train: ', mean_absolute_error(y_train,prediction_train),
          '| test: ', mean_absolute_error(y_test,prediction_test))
print("Root Mean Square Error:" , 'train: ',np.sqrt(mean_squared_error(y_train ,prediction_train)),
      '| test: ',np.sqrt(mean_squared_error(y_test,prediction_test)))
print("R2:" , 'train: ', r2_score(y_train,prediction_train),'| test: ', r2_score(y_test,prediction_test))

Mean Absolute Error: train:  626.2271945736801 | test:  473.5011431114171
Root Mean Square Error: train:  7722.515326856429 | test:  962.4643781679956
R2: train:  0.017999271122793092 | test:  0.6476962880285435


In [24]:
# main,
# sub,
# neig,
# front,
# purpose,
# size,
# street
predict(
    'الرياض',
    'شمال الرياض',
    'حي الملقا',
    '3 شوارع',
    'سكني',
    3816 ,
    25  ,
    model
)

array([[3992.248]], dtype=float32)

# Function to predict on the new dataset

In [118]:
def predictOnNewDataSet(df,model):
  er = onehotEncoder.transform(df[nominal_columns]).toarray()
  X = np.concatenate((er,df[numeric_columns[:2]]), axis=1)
  X = np.asarray(X).astype('float32')
  y = df[numeric_columns[2]]
  pred = model.predict(X)
  diff = np.array(pred[0] - y)
  print("Mean Absolute Error:" , mean_absolute_error(y,pred))
  print("Root Mean Square Error:" ,np.sqrt(mean_squared_error(y ,pred)))
  print("R2:" , r2_score(y,pred))
  return diff

# Loading the new dataset

In [119]:
df_n = pd.read_csv('land_north_riyadh.csv')

In [125]:
df_n = pd.concat([df_n,df])
df_n.drop_duplicates(keep='first',ignore_index=True)
df_n.shape

(18309, 8)

# Cleaning Data

In [126]:
df_n = df_n.query("streetwidth != 'غير محدد'")
df_n = df_n.query("purpose != 'غير محدد'")
df_n['streetwidth'] = df_n['streetwidth'].astype('int')
df_n.shape

(17189, 8)

In [127]:
df_n = df_n.query('neighborhood != "حي مطار الملك خالد الدولي"')
df_n = df_n.query('neighborhood != "حي جامعة الملك سعود"')

In [128]:
diff = predictOnNewDataSet(df_n,model)

Mean Absolute Error: 1130.4205849940909
Root Mean Square Error: 35506.582848121936
R2: -0.0004545386451817457


In [129]:
diff.max() # means that the model predicted the price per meter more with 3553 than the real price per meter

3553.5823

In [130]:
diff.min() # means that the model predicted the price per meter less with 2996445 than the real price per meter

-2996445.5