### Import the csv file, check for null values

In [3]:
import pandas as pd
import numpy as np

# Load the raw CSV into a DataFrame. pd.read_csv opens and closes the file on
# its own, so no surrounding open() is needed (the old wrapper opened the file
# a second time and never used the handle).
new_dict = pd.read_csv('/home/becode/Downloads/page_2.csv')
# Count the missing values in every column.
print(new_dict.isna().sum())

ID                          0
Locality                    0
PricePr                     0
Tenement building         161
Venue of the sale        9972
Bedrooms                  428
Living area              1592
Kitchen type             3668
Furnished                2989
How many fireplaces?     9878
Terrace                  7750
Terrace surface          6861
Garden                   9325
Garden surface           8701
Surface of the plot      6588
Number of frontages      3252
Swimming pool           10127
Building condition       2962
Type                        0
Subtype                     0
dtype: int64


# Observation: many columns have null values

In [4]:
# Give the 'PricePr' column the clearer name 'Price'.
# errors="raise" makes the cell fail loudly if 'PricePr' is not present.
new_dict.rename({"PricePr": "Price"}, axis="columns", errors="raise", inplace=True)

In [5]:
# Check the values of the 2 columns which need to be merged together:
# 'Tenement building' & 'Venue of the sale'.
# Every recorded 'Venue of the sale' is collapsed to the single label 'new';
# missing entries stay missing. A boolean mask does this in one assignment
# instead of visiting the rows one at a time.
venue_recorded = new_dict['Venue of the sale'].notna()
new_dict.loc[venue_recorded, 'Venue of the sale'] = 'new'
# Distribution of the labels in both columns ...
print(new_dict['Tenement building'].value_counts())
print(new_dict['Venue of the sale'].value_counts())
# ... and how many entries are missing in each.
print(new_dict['Tenement building'].isna().sum())
print(new_dict['Venue of the sale'].isna().sum())

No     9291
Yes     675
Name: Tenement building, dtype: int64
new    155
Name: Venue of the sale, dtype: int64
161
9972


In [6]:
# Creation of the new column 'Type of sale', which will replace the 2 previous ones.
# Both source columns are rendered as text (a missing value becomes 'nan') and
# concatenated; each resulting combination is then translated to its final label.
combined_flags = new_dict['Tenement building'].astype(str) + new_dict['Venue of the sale'].astype(str)
sale_type_labels = {
    'Nonan': 'none',
    'nannan': 'none',
    'Yesnan': 'Tenement',
    'Yesnew': 'TenementNew',
    'Nonew': 'new',
    # A missing 'Tenement building' is treated like 'No' (as for 'nannan'),
    # so a missing flag combined with a recorded venue is labelled 'new'
    # rather than being left as the raw text 'nannew'.
    'nannew': 'new',
}
# Combinations without an entry in the table keep their concatenated text.
new_dict['Type of sale'] = combined_flags.map(sale_type_labels).fillna(combined_flags)
print(new_dict['Type of sale'].value_counts())


none           9312
Tenement        660
new             140
TenementNew      15
Name: Type of sale, dtype: int64


# Remove unnecessary columns from the dataset
'Swimming pool' contains only null values, so it cannot influence the result of the model.

In [7]:
# Columns that are no longer needed: the two sources of 'Type of sale' and
# 'Swimming pool'.
obsolete_columns = ['Tenement building', 'Venue of the sale', 'Swimming pool']
new_dict.drop(columns=obsolete_columns, inplace=True)

### Display the distinct values of 'Type of sale' (from the subset 'ID', 'Type of sale', 'Building condition')

In [8]:
# Narrow view holding the identifier and two categorical columns.
dict2 = new_dict.loc[:, ['ID', 'Type of sale', 'Building condition']]
# How often each 'Type of sale' label occurs.
sale_type_counts = dict2['Type of sale'].value_counts()
print(sale_type_counts)

none           9312
Tenement        660
new             140
TenementNew      15
Name: Type of sale, dtype: int64


### Check in small chunks for null values in columns

In [9]:
# Subset with the garden and terrace columns (plus the identifier).
dict3 = new_dict.loc[:, ['ID', 'Garden surface', 'Garden', 'Terrace surface', 'Terrace']]
garden_columns = dict3[['Garden', 'Garden surface']]
# number of rows per column that are NOT null
print(garden_columns.notnull().sum())
# number of null values per column in the 'Garden' / 'Garden surface' subset
print(garden_columns.isnull().sum())
# all distinct values in 'Garden' with their counts
print(dict3[['Garden']].value_counts())

Garden             802
Garden surface    1426
dtype: int64
Garden            9325
Garden surface    8701
dtype: int64
Garden
Yes       802
dtype: int64


### Fill the null values in all the remaining columns so that none are left for model processing

In [10]:
# Replace every remaining null with a usable value, column by column, using
# vectorised pandas operations instead of a per-row loop.

# Yes/No flags: keep any value that is already recorded and infer only the
# missing ones -- 'Yes' when the companion column holds a value, 'No' otherwise.
# The previous per-row test `not(a) & (b)` parsed as `not (a & b)`, and together
# with its `else` branch it overwrote every recorded flag with 'Yes'.
# This must run BEFORE the companion columns are filled below, because the
# inference relies on their nulls.
# NOTE(review): deriving 'Furnished' from 'Kitchen type' is the notebook's
# original rule -- confirm it is the intended one.
flag_evidence = {
    "Garden": "Garden surface",
    "Terrace": "Terrace surface",
    "Furnished": "Kitchen type",
}
for flag_col, evidence_col in flag_evidence.items():
    recorded = new_dict[flag_col]
    inferred = new_dict[evidence_col].notna().map({True: "Yes", False: "No"})
    # Take the inferred label only where nothing was recorded.
    new_dict[flag_col] = inferred.where(recorded.isna(), recorded)

# Missing surfaces and counts are filled with 0.
zero_fill_columns = [
    "Garden surface",
    "Terrace surface",
    "Number of frontages",
    "How many fireplaces?",
]
new_dict[zero_fill_columns] = new_dict[zero_fill_columns].fillna(0)

# Mean imputation. The mean (nulls skipped) is computed once per column rather
# than once per row.
for mean_col in ["Bedrooms", "Living area"]:
    new_dict[mean_col] = new_dict[mean_col].fillna(new_dict[mean_col].mean())

# A missing plot surface falls back to the row's living area, which has
# already been imputed above.
new_dict["Surface of the plot"] = new_dict["Surface of the plot"].fillna(new_dict["Living area"])

# Default labels for the remaining categorical columns.
new_dict["Kitchen type"] = new_dict["Kitchen type"].fillna("Not installed")
new_dict["Building condition"] = new_dict["Building condition"].fillna("To restore")

In [11]:
# Check again for remaining null values (np.NaN): count the nulls left in each column.
remaining_nulls = new_dict.isna().sum()
print(remaining_nulls)

ID                      0
Locality                0
Price                   0
Bedrooms                0
Living area             0
Kitchen type            0
Furnished               0
How many fireplaces?    0
Terrace                 0
Terrace surface         0
Garden                  0
Garden surface          0
Surface of the plot     0
Number of frontages     0
Building condition      0
Type                    0
Subtype                 0
Type of sale            0
dtype: int64


### Save the clean data to a CSV file and check the number of rows

In [12]:
# Write the cleaned frame to disk without the index column. to_csv opens,
# writes and closes the destination itself, so no surrounding open() is needed
# (the old wrapper held a second, unused write handle on the same path).
new_dict.to_csv('/home/becode/Downloads/page_1.csv', index=False)

# Number of rows in the cleaned data.
print(len(new_dict))

10127
