In [7]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
#from xgboost import XGBRegressor


# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings('ignore')

In [39]:
df = pd.read_csv("newDataFrame.csv", error_bad_lines=False)
df

Unnamed: 0,date,houseId,city,address,houseType,status,numberRooms,numberBathrooms1,numberBathrooms,numberKitchens1,...,landArea,elevator1,elevator,carParking1,carParking,heating1,heating,price,priceType,priceUS
0,2020-01-29,240003,RB,- بيتونيا,apartment,A,1,0,,0,...,,1,نعم,1,نعم,0,,1300,شيكل,390
1,2020-01-29,240005,RB,- مدينة رام الله,apartment,A,1,1,1,1,...,,1,نعم,1,نعم,0,,550,دولار,550
2,2020-01-29,240006,RB,- مدينة رام الله,apartment,A,3,3,3,0,...,100.0,1,نعم,1,نعم,1,نعم,1000,دولار,1000
3,2020-01-29,240007,RB,- مدينة رام الله,apartment,A,3,3,3,1,...,,1,نعم,1,نعم,1,نعم,950,دولار,950
4,2020-01-29,240008,RB,- مدينة رام الله,apartment,A,3,3,3,0,...,,1,نعم,1,نعم,0,,800,دولار,800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8156,2021-03-03,260069,RB,- مدينة البيرة,apartment,A,2,2,2,1,...,,1,نعم,1,نعم,0,,600,دولار,600
8157,2021-03-03,260076,RB,- مدينة البيرة,apartment,B,3,2,2,1,...,,1,نعم,1,نعم,0,,500,دولار,500
8158,2021-03-03,260079,H,- مدينة الخليل,apartment,B,3,2,2,1,...,,0,,0,,0,,1600,شيكل,480
8159,2021-03-03,260080,N,- مدينة نابلس,apartment,A,1,1,1,1,...,3500.0,1,نعم,1,نعم,1,نعم,1000,شيكل,300


In [3]:
df.info

<bound method DataFrame.info of             date  houseId city            address  houseType status  \
0     2020-01-29   240003   RB          - بيتونيا  apartment      A   
1     2020-01-29   240005   RB  -  مدينة رام الله  apartment      A   
2     2020-01-29   240006   RB  -  مدينة رام الله  apartment      A   
3     2020-01-29   240007   RB  -  مدينة رام الله  apartment      A   
4     2020-01-29   240008   RB  -  مدينة رام الله  apartment      A   
...          ...      ...  ...                ...        ...    ...   
8156  2021-03-03   260069   RB    -  مدينة البيرة  apartment      A   
8157  2021-03-03   260076   RB    -  مدينة البيرة  apartment      B   
8158  2021-03-03   260079    H    -  مدينة الخليل  apartment      B   
8159  2021-03-03   260080    N     -  مدينة نابلس  apartment      A   
8160  2021-03-03   260085   RB    -  مدينة البيرة  apartment      A   

      numberRooms numberBathrooms1 numberBathrooms numberKitchens1  ...  \
0               1                0      

In [40]:
#Handling Missing Values
# get the number of missing data points per column
missing_values = df.isnull().sum()
# look at the # of missing points in the first 25 columns
missing_values[0:25]

date                   0
houseId                0
city                   0
address               55
houseType             17
status                 0
numberRooms            0
numberBathrooms1       0
numberBathrooms      272
numberKitchens1        0
numberKitchens       364
numberBalconies1       0
numberBalconies     1322
loungeType           851
houseSpace          2085
landArea            5709
elevator1              0
elevator            3273
carParking1            0
carParking          2828
heating1               0
heating             6350
price                  0
priceType              1
priceUS                0
dtype: int64

In [41]:
#total missing values do we have
total_cells = np.product(df.shape)
total_missing = missing_values.sum()
# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

11.335375566719765


In [42]:
total_missing

23127

In [43]:
total_cells

204025

In [44]:
# replace all NA's the value that comes directly after it in the same column,
numberBathroomsNew = df.numberBathrooms
df.numberBathrooms = numberBathroomsNew.fillna(method='bfill', axis=0)
df.numberBathrooms.isnull().sum()


0

In [45]:
houseTypeNew = df.houseType
df.houseType = houseTypeNew.fillna(method='bfill')
df.houseType.isnull().sum()

0

In [46]:
addressNew = df.address
df.address = addressNew.fillna(method='bfill')
df.address.isnull().sum()

0

In [47]:
numberKitchensNew = df.numberKitchens
df.numberKitchens = numberKitchensNew.fillna(method='bfill')
df.numberKitchens.isnull().sum()

0

In [48]:
# replace all NA's with 0
numberBalconiesNew = df.numberBalconies
df.numberBalconies = numberBalconiesNew.fillna(0)
df.numberBalconies.isnull().sum()

0

In [50]:
df.elevator = df.elevator1
df.carParking = df.carParking1
df.heating = df.heating1


In [51]:
missing_values = df.isnull().sum()
missing_values[0:25]

date                   0
houseId                0
city                   0
address                0
houseType              0
status                 0
numberRooms            0
numberBathrooms1       0
numberBathrooms        0
numberKitchens1        0
numberKitchens         0
numberBalconies1       0
numberBalconies        0
loungeType           851
houseSpace          2085
landArea            5709
elevator1              0
elevator               0
carParking1            0
carParking             0
heating1               0
heating                0
price                  0
priceType              1
priceUS                0
dtype: int64

In [55]:
# get all the unique values in the 'loungeType , city ' column
lounge = df['loungeType'].unique()


In [60]:
#city
city = df['priceUS'].unique()


In [61]:
#drop extra columns
newdf = df.drop(['houseId', 'address', 'numberBathrooms1','numberKitchens1','numberBalconies1','loungeType','landArea','elevator1','carParking1','heating1','priceType','price'], axis=1)
newdf

Unnamed: 0,date,city,houseType,status,numberRooms,numberBathrooms,numberKitchens,numberBalconies,houseSpace,elevator,carParking,heating,priceUS
0,2020-01-29,RB,apartment,A,1,1,1,0,,1,1,0,390
1,2020-01-29,RB,apartment,A,1,1,1,1,80,1,1,0,550
2,2020-01-29,RB,apartment,A,3,3,1,2,190,1,1,1,1000
3,2020-01-29,RB,apartment,A,3,3,1,1,190,1,1,1,950
4,2020-01-29,RB,apartment,A,3,3,1,3,160,1,1,0,800
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8156,2021-03-03,RB,apartment,A,2,2,1,1,120,1,1,0,600
8157,2021-03-03,RB,apartment,B,3,2,1,1,140,1,1,0,500
8158,2021-03-03,H,apartment,B,3,2,1,2,150,0,0,0,480
8159,2021-03-03,N,apartment,A,1,1,1,1,70,1,1,1,300


In [63]:
missing_values = newdf.isnull().sum()
missing_values[0:13]

date                  0
city                  0
houseType             0
status                0
numberRooms           0
numberBathrooms       0
numberKitchens        0
numberBalconies       0
houseSpace         2085
elevator              0
carParking            0
heating               0
priceUS               0
dtype: int64

In [65]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8161 entries, 0 to 8160
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             8161 non-null   object
 1   city             8161 non-null   object
 2   houseType        8161 non-null   object
 3   status           8161 non-null   object
 4   numberRooms      8161 non-null   int64 
 5   numberBathrooms  8161 non-null   object
 6   numberKitchens   8161 non-null   object
 7   numberBalconies  8161 non-null   object
 8   houseSpace       6076 non-null   object
 9   elevator         8161 non-null   int64 
 10  carParking       8161 non-null   int64 
 11  heating          8161 non-null   int64 
 12  priceUS          8161 non-null   object
dtypes: int64(4), object(9)
memory usage: 829.0+ KB


In [77]:
#Handling unique data
uniqueBathrooms = newdf['numberBathrooms'].unique()
uniqueBathrooms

array(['1', '3', '2', '-2', '-1', '-3', '-4', '2 حمام', '4', '0', '5',
       '6'], dtype=object)

In [89]:
#Handling unique data
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['حمامين'],'2')
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['-1'],'1')
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['-2'],'2')
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['-3'],'3')
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['-4'],'4')
newdf['numberBathrooms'] = newdf['numberBathrooms'].replace(['حمام'],'1')

In [108]:
uniqueBalconies = newdf['numberBalconies'].unique()
uniqueBalconies

array([0, '1', '2', '3', '4', '0', '5', '2 برنده', '120', '100', '12',
       '21', '150', '180', '6', '14'], dtype=object)

In [92]:
newdf['numberKitchens'] = newdf['numberKitchens'].replace(['واجد'],'1')

In [95]:
newdf['numberBalconies'] = newdf['numberBalconies'].replace(['واجده'],'1')

In [101]:
newdf['numberBalconies'] = newdf['numberBalconies'].replace(['-2'],'2')

In [103]:
newdf['numberBalconies'] = newdf['numberBalconies'].replace(['-1'],'1')

In [105]:
newdf['numberBalconies'] = newdf['numberBalconies'].replace(['-3'],'3')

In [113]:
uniqueSpace = newdf['houseSpace'].unique()
uniqueSpace

array([nan, '80', '190', '160', '150', '200', '130', '175', '135', '210',
       '120', '220', '90', '119', '140', '125', '174', '145', '50', '70',
       '195', '116', '-110', '165', '180', '170', '155', '20', '185',
       '100', '110', '25', '115', '300', '-150', '0', '280', '45', '124',
       '127', '144', '162', '123', '16', '172', '148', '122', '88', '118',
       '60', '138', '1', '85', '40', '26', '215', '166', '168', '1000',
       '132', '260', '270', '30', '75', '139', '169', '250', '-140',
       '350', '187', '1150', '105', '-115', '117', '230', '21', '32',
       '95', '420', '225', '107', '65', '89', '136', '500', '240', '153',
       '149', '28', '450', '-60', '35', '82', '86', '133', '57', '205',
       '147', '126', '167', '109', '2', '163', '183', '192', '178', '15',
       '-155', '3', '55', '176', '154', '235', '75متر', '157', '156',
       '128', '184', '750', '193', '152', '159', '142', '158', '54', '37',
       '131', '1100', '84', '400', '6', '216', '266', '19

In [112]:
newdf['houseSpace'] = newdf['houseSpace'].replace(['متر'],'1')

In [None]:
houseSpaceNew = df.houseSpace
df.houseSpace = houseSpaceNew.fillna(0)
df.houseSpace.isnull().sum()

In [111]:
uniquehouseType = newdf['houseType'].unique()
uniquehouseType

array(['apartment', 'Roof', 'villa'], dtype=object)

In [114]:
newdf.to_csv (r'df.csv', index = False, header=True)

In [3]:
df = pd.read_csv("df.csv", error_bad_lines=False)
df

Unnamed: 0,date,city,houseType,status,numberRooms,numberBathrooms,numberKitchens,numberBalconies,houseSpace,elevator,carParking,heating,priceUS
0,2020-01-29,RB,apartment,A,1,1,1,0,,1,1.0,0.0,390.0
1,2020-01-29,RB,apartment,A,1,1,1,1,80.0,1,1.0,0.0,550.0
2,2020-01-29,RB,apartment,A,3,3,1,2,190.0,1,1.0,1.0,1000.0
3,2020-01-29,RB,apartment,A,3,3,1,1,190.0,1,1.0,1.0,950.0
4,2020-01-29,RB,apartment,A,3,3,1,3,160.0,1,1.0,0.0,800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8140,2021-03-03,RB,apartment,A,2,2,1,1,120.0,1,1.0,0.0,600.0
8141,2021-03-03,RB,apartment,B,3,2,1,1,140.0,1,1.0,0.0,500.0
8142,2021-03-03,H,apartment,B,3,2,1,2,150.0,0,0.0,0.0,480.0
8143,2021-03-03,N,apartment,A,1,1,1,1,70.0,1,1.0,1.0,300.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8145 entries, 0 to 8144
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             8145 non-null   object 
 1   city             8145 non-null   object 
 2   houseType        8145 non-null   object 
 3   status           8145 non-null   object 
 4   numberRooms      8145 non-null   int64  
 5   numberBathrooms  8145 non-null   int64  
 6   numberKitchens   8145 non-null   int64  
 7   numberBalconies  8145 non-null   int64  
 8   houseSpace       6063 non-null   float64
 9   elevator         8145 non-null   int64  
 10  carParking       8145 non-null   float64
 11  heating          8145 non-null   float64
 12  priceUS          8145 non-null   float64
dtypes: float64(4), int64(5), object(4)
memory usage: 827.4+ KB


In [5]:
df['heating'] = df['heating'].astype(int) 
df['carParking'] = df['carParking'].astype(int)  
# displaying the datatypes 
display(df.dtypes) 

date                object
city                object
houseType           object
status              object
numberRooms          int64
numberBathrooms      int64
numberKitchens       int64
numberBalconies      int64
houseSpace         float64
elevator             int64
carParking           int32
heating              int32
priceUS            float64
dtype: object

In [6]:
df.describe()

Unnamed: 0,numberRooms,numberBathrooms,numberKitchens,numberBalconies,houseSpace,elevator,carParking,heating,priceUS
count,8145.0,8145.0,8145.0,8145.0,6063.0,8145.0,8145.0,8145.0,8145.0
mean,2.495028,1.864702,1.014979,1.095519,135.996042,0.599386,0.654266,0.221977,1958.141217
std,0.86999,0.747368,0.254182,3.198064,54.645959,0.490053,0.475636,0.415601,13681.847786
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
25%,2.0,1.0,1.0,1.0,110.0,0.0,0.0,0.0,390.0
50%,3.0,2.0,1.0,1.0,140.0,1.0,1.0,0.0,500.0
75%,3.0,2.0,1.0,1.0,160.0,1.0,1.0,0.0,700.0
max,6.0,6.0,11.0,180.0,1200.0,1.0,1.0,1.0,450000.0


In [None]:
index_names = df[ df['numberBalconies'] > 5 ].index 
index_names
# drop these row indexes 
# from dataFrame 
df.drop(index_names, inplace = True) 


In [None]:
index_names = df[ df['numberBathrooms'] > 4 ].index 
index_names
# drop these row indexes 
# from dataFrame 
df.drop(index_names, inplace = True) 


In [None]:
index_names = df[ df['numberKitchens'] > 3 ].index 
index_names
# drop these row indexes 
# from dataFrame 
df.drop(index_names, inplace = True) 

In [None]:
df.to_csv (r'dfC.csv', index = False, header=True)