In [1]:
import pandas as pd

In [2]:
# Load the house-price dataset from a path relative to the notebook.
# NOTE(review): the preview below shows a column named 'Unnamed: 0' that mirrors
# the row index; presumably the CSV was saved with its index. Consider
# index_col=0 if that column is not wanted.
df = pd.read_csv('./data/house_price.csv')

In [3]:
# Preview the first rows (head() defaults to five) to check what was loaded.
df.head()

Unnamed: 0,bedrooms,sq_meter,neighborhood,sales_price
0,3,200,Normalville,250000
1,2,80,Cool Town,300000
2,2,85,Normalville,150000
3,1,55,Normalville,78000
4,4,200,Skid Row,150000


In [4]:
# Same preview, limited to the first three rows.
df.head(3)

Unnamed: 0,bedrooms,sq_meter,neighborhood,sales_price
0,3,200,Normalville,250000
1,2,80,Cool Town,300000
2,2,85,Normalville,150000


First attempt

In [5]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood):
    """Estimate a sale price from hand-written rules.

    The price is the floor area multiplied by a neighborhood-dependent rate
    per square meter, then adjusted for the number of bedrooms.

    Args:
        num_of_bedrooms: Number of bedrooms; exactly 0 triggers a flat deduction.
        sqm: Floor area the per-square-meter rate is applied to.
        neighborhood: Neighborhood name. Only 'Cool Town' and 'Skid Row' have
            their own rate; any other value uses the default rate.

    Returns:
        The estimated price.
    """
    rate_per_sqm = (
        400 if neighborhood == 'Cool Town'
        else 100 if neighborhood == 'Skid Row'
        else 200
    )

    # No bedrooms costs a flat 20000; otherwise every bedroom adds 100000.
    if num_of_bedrooms == 0:
        bedroom_adjustment = -20000
    else:
        bedroom_adjustment = num_of_bedrooms * 100000

    return rate_per_sqm * sqm + bedroom_adjustment

In [6]:
# Spot check: 175 sqm at the 'Skid Row' rate (100) plus 4 bedrooms at 100000 each.
estimate_house_sales_price(4, 175, 'Skid Row')

417500

Make it more flexible

In [7]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood):
    """Estimate a sale price as a weighted sum of the inputs plus a constant.

    Every input is multiplied by a fixed coefficient, and the products are
    added together with a constant offset.

    Args:
        num_of_bedrooms: Numeric bedroom count.
        sqm: Numeric floor area.
        neighborhood: Numeric code for the neighborhood. A string (such as
            the raw neighborhood name) raises TypeError, because a string
            cannot be multiplied by the float coefficient.

    Returns:
        The estimated price.
    """
    bedroom_weight = 0.841232951398213
    sqm_weight = 1231.1231231
    neighborhood_weight = 2.3242341421
    bias = 201.23432095

    return (
        num_of_bedrooms * bedroom_weight
        + sqm * sqm_weight
        + neighborhood * neighborhood_weight
        + bias
    )

In [8]:
# This comparison is expected to fail: `neighborhood` is still a string here,
# and a string cannot be multiplied by a float coefficient. The error is caught
# and printed (rather than left uncaught) so the demonstration stays visible
# without stopping a "Restart & Run All" at this cell.
try:
    for _, house in df.iterrows():
        print(estimate_house_sales_price(house.bedrooms, house.sq_meter, house.neighborhood), house.sales_price)
except TypeError as err:
    print('TypeError:', err)

TypeError: can't multiply sequence by non-int of type 'float'

In [9]:
# Numeric codes for the neighborhood names, numbered 1, 2, 3 in list order,
# so the name can be fed into an arithmetic formula.
neighborhood_mapping = {
    name: code
    for code, name in enumerate(['Skid Row', 'Normalville', 'Cool Town'], start=1)
}

In [10]:
# Print "estimate actual" for every row, translating the neighborhood name to
# its numeric code so the arithmetic in the estimator works.
for _, house in df.iterrows():
    estimate = estimate_house_sales_price(
        house.bedrooms,
        house.sq_meter,
        neighborhood_mapping[house.neighborhood],
    )
    print(int(estimate), house.sales_price)

246433 250000
98699 300000
104853 150000
67918 78000
246431 150000


Calculate absolute error

In [11]:
# One (predicted price, actual price) tuple per row of the dataset.
predictions = [
    (
        int(estimate_house_sales_price(
            house.bedrooms,
            house.sq_meter,
            neighborhood_mapping[house.neighborhood],
        )),
        house.sales_price,
    )
    for _, house in df.iterrows()
]
predictions

[(246433, 250000),
 (98699, 300000),
 (104853, 150000),
 (67918, 78000),
 (246431, 150000)]

In [12]:
# Absolute gap between each predicted price and the actual price it is paired with.
absolute_errors = [abs(prediction - price) for prediction, price in predictions]
absolute_errors

[3567, 201301, 45147, 10082, 96431]

In [13]:
# zip() demo: pairs the elements of two sequences position by position.
list_a = [1,2,3]
list_b = ['a','b','c']

list(zip(list_a, list_b))

[(1, 'a'), (2, 'b'), (3, 'c')]

In [14]:
def mean_absolute_error(y_true, y_pread):
    """Return the mean of the absolute differences between two sequences.

    Args:
        y_true: Sized sequence of actual values.
        y_pread: Sized sequence of predicted values, paired with ``y_true``
            by position.

    Returns:
        The sum of ``abs(actual - predicted)`` over all pairs, divided by the
        number of values in ``y_true``.

    Raises:
        AssertionError: If the two sequences differ in length.
        ZeroDivisionError: If both sequences are empty.
    """
    n_values = len(y_true)
    assert n_values == len(y_pread), 'length of actual and predicted values must be the same'

    total_error = sum(abs(actual - predicted) for actual, predicted in zip(y_true, y_pread))
    return total_error / n_values

In [15]:
# Actual sale prices as a plain Python list, in row order.
real_values = df['sales_price'].tolist()
real_values

[250000, 300000, 150000, 78000, 150000]

In [16]:
# Only the predicted prices are needed here (not prediction/actual pairs),
# so build them directly as a flat list, one entry per row.
predicted_values = [
    int(estimate_house_sales_price(
        house.bedrooms,
        house.sq_meter,
        neighborhood_mapping[house.neighborhood],
    ))
    for _, house in df.iterrows()
]
predicted_values

[246433, 98699, 104853, 67918, 246431]

In [17]:
# Average absolute gap between the actual sale prices and the predicted ones.
mean_absolute_error(real_values, predicted_values)

71305.6

Create a weight dict to start with

In [18]:
# Starting coefficients for the weighted-sum estimator: one weight per input
# feature plus a constant offset stored under 'bias'.
weights_data = dict(
    bedroom_weight=0.8,
    sqm_weight=1.2,
    neighborhood_weight=0.5,
    bias=200,
)

In [19]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood, weights):
    """Estimate a sale price as a weighted sum using caller-supplied weights.

    Args:
        num_of_bedrooms: Numeric bedroom count.
        sqm: Numeric floor area.
        neighborhood: Numeric code for the neighborhood.
        weights: Mapping with the keys 'bedroom_weight', 'sqm_weight',
            'neighborhood_weight' and 'bias'.

    Returns:
        Each input multiplied by its weight, summed, plus ``weights['bias']``.

    Raises:
        KeyError: If ``weights`` lacks one of the required keys.
    """
    return (
        num_of_bedrooms * weights['bedroom_weight']
        + sqm * weights['sqm_weight']
        + neighborhood * weights['neighborhood_weight']
        + weights['bias']
    )

In [None]:
def gradient_descent(X, y, weights, bias, learning_rate, num_iterations):
    """Placeholder for a gradient-descent training routine.

    Not implemented yet: the body is a bare ``pass``, so calling this
    currently does nothing and returns None.

    TODO(review): the roles of X, y, weights, bias, learning_rate and
    num_iterations are only implied by their names; document them once the
    function is implemented.
    """
    pass

One Hot Encoding