In [42]:
# imports and general settings

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [43]:
# Read the property listings CSV into a pandas DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.

PROPERTY_CSV_PATH = "data/property-data.csv"
property_data = pd.read_csv(PROPERTY_CSV_PATH)

In [44]:
# Preview the first five rows to sanity-check the columns and their values.

property_data.head(n=5)

Unnamed: 0,PROPERTY_REFERENCE,PRICE,BEDROOMS,BATHROOMS,HOUSE_NUMBER,ADDRESS,REGION,POSTCODE,PROPERTY_TYPE
0,1,1000000,7,2.0,12,Richard Lane,London,W1F 3FT,Mansion
1,2,100000,2,1.0,22,Brick Road,Sheffield,SH1 1AW,Terraced
2,3,225000,5,,40,Yellow Lane,Manchester,MA12 3ZY,Detached
3,4,150000,1,1.0,3B,Red Admiral Court,Essex,RM2 6ET,Flat
4,5,222250,3,1.0,36,Bear Road,Winchester,WI3 9TT,Detached


In [45]:
# Summary statistics for the numeric columns: count, mean, std, min,
# the three quartiles (spelled out explicitly below) and max.

property_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PROPERTY_REFERENCE,PRICE,BEDROOMS,BATHROOMS
count,24.0,24.0,24.0,22.0
mean,12.5,805794.8,3.958333,2.0
std,7.071068,1518746.0,2.678078,1.480026
min,1.0,100000.0,1.0,1.0
25%,6.75,155750.0,2.0,1.0
50%,12.5,262500.0,3.0,1.0
75%,18.25,755000.0,5.0,2.0
max,24.0,7500000.0,11.0,6.0


In [46]:
# Find the mean price in the postcode outward 'W1F'
#
# Step 1: build a boolean mask that is True where POSTCODE starts with "W1F".
#         na=False makes rows with a missing postcode count as non-matching.
# Step 2: select the PRICE column for the matching rows and take its mean.
# The recorded run of this cell gives 1158750.0.

in_w1f = property_data["POSTCODE"].str.startswith("W1F", na=False)
property_data.loc[in_w1f, "PRICE"].mean()

1158750.0

In [47]:
# Find the difference in average property prices between detached houses and flats
#
# "Average" usually means the mean, so the mean is reported first. The median is
# reported as well because it is far less sensitive to a few extreme prices.

# Select each property type's PRICE column once and reuse it for both statistics,
# instead of repeating the same filter expression for every number.
detached_prices = property_data.loc[property_data["PROPERTY_TYPE"] == "Detached", "PRICE"]
flat_prices = property_data.loc[property_data["PROPERTY_TYPE"] == "Flat", "PRICE"]

# Mean comparison.
# In the recorded run the mean for flats is higher than for detached houses.
detached_mean = detached_prices.mean()
print("Detached houses mean is {}".format(detached_mean))
flats_mean = flat_prices.mean()
print("Flats mean is {}".format(flats_mean))
difference_mean = detached_mean - flats_mean
print("Difference mean is {}".format(difference_mean))

# Median comparison.
# In the recorded run the median for detached houses is higher than for flats,
# i.e. the ordering flips. The flat mean sits well above the flat median, which
# is consistent with a very expensive outlier flat pulling the mean up.
detached_median = detached_prices.median()
print("Detached houses median is {}".format(detached_median))
flats_median = flat_prices.median()
print("Flats median is {}".format(flats_median))
difference_median = detached_median - flats_median
print("Difference median is {}".format(difference_median))

Detached houses mean is 320579.125
Flats mean is 363999.75
Difference mean is -43420.625
Detached houses median is 247500.0
Flats median is 153000.0
Difference median is 94500.0


In [48]:
# Find the top 10% most expensive properties
#
# Rank every property by PRICE from highest to lowest, then keep the first
# tenth of the rows. The row count is floor-divided by 10, so the cut-off is a
# whole number rounded down.
# NOTE: with fewer than 10 rows the floor division gives 0 and nothing is shown.
# The recorded run of this cell returns PROPERTY_REFERENCE 12 and 13.

top_tenth_count = len(property_data) // 10
price_ranked = property_data.sort_values(by="PRICE", ascending=False)
price_ranked.head(top_tenth_count)




Unnamed: 0,PROPERTY_REFERENCE,PRICE,BEDROOMS,BATHROOMS,HOUSE_NUMBER,ADDRESS,REGION,POSTCODE,PROPERTY_TYPE
11,12,7500000,11,4.0,,Brighton Road,Surrey,GU13 4DD,Mansion
12,13,2500000,7,2.0,1.0,Station Road,London,W1F 3UT,Mansion
