In [60]:
import pandas as pd

In [61]:
df = pd.read_csv('./data/house_price.csv')

In [62]:
df.head()

Unnamed: 0,bedrooms,sq_meter,neighborhood,sales_price
0,3,200,Normalville,250000
1,2,80,Cool Town,300000
2,2,85,Normalville,150000
3,1,55,Normalville,78000
4,4,200,Skid Row,150000


In [63]:
df.head(3)

Unnamed: 0,bedrooms,sq_meter,neighborhood,sales_price
0,3,200,Normalville,250000
1,2,80,Cool Town,300000
2,2,85,Normalville,150000


First attempt

In [64]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood):
    price = 0

    price_per_sqm = 200
    if neighborhood == 'Cool Town':
        price_per_sqm = 400
    elif neighborhood == 'Skid Row':
        price_per_sqm = 100
    
    price = price_per_sqm * sqm

    if num_of_bedrooms == 0:
        price -= 20000 
    else:
        price += num_of_bedrooms * 100000
    return price

In [65]:
estimate_house_sales_price(4, 175, 'Skid Row')

417500

Make it more flexible

In [66]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood):
    price = 0

    price += num_of_bedrooms * 0.841232951398213
    price += sqm *1231.1231231
    price += neighborhood * 2.3242341421
    price += 201.23432095
    return price

In [67]:
# for _, house in df.iterrows():
#     print(estimate_house_sales_price(house.bedrooms, house.sq_meter, house.neighborhood), house.sales_price)

In [68]:
neighborhood_mapping = {
    'Skid Row' : 1,
    'Normalville' : 2,
    'Cool Town' : 3
}

In [69]:
for _, house in df.iterrows():
    print(int(estimate_house_sales_price(house.bedrooms, house.sq_meter, neighborhood_mapping[house.neighborhood])), house.sales_price)

246433 250000
98699 300000
104853 150000
67918 78000
246431 150000


Calculate absolute error

In [70]:
predictions = []
for _, house in df.iterrows():
    predictions.append((int(estimate_house_sales_price(house.bedrooms, house.sq_meter, neighborhood_mapping[house.neighborhood])), house.sales_price))
predictions

[(246433, 250000),
 (98699, 300000),
 (104853, 150000),
 (67918, 78000),
 (246431, 150000)]

In [71]:
absolute_errors = []
for prediction, price in predictions:
    absolute_errors.append(abs(prediction - price))
absolute_errors

[3567, 201301, 45147, 10082, 96431]

In [72]:
list_a = [1,2,3]
list_b = ['a','b','c']

list(zip(list_a, list_b))

[(1, 'a'), (2, 'b'), (3, 'c')]

In [73]:
def mean_absolute_error(y_true, y_pread):
    assert len(y_true) == len(y_pread), 'length of actual and predicted values must be the same'

    absolute_errors = [abs(true_value - pred_value) for true_value, pred_value in zip(y_true, y_pread)]

    return sum(absolute_errors) / len(y_true)

In [74]:
real_values = df.sales_price.values.tolist()
real_values

[250000, 300000, 150000, 78000, 150000]

In [75]:
# predictions = []
# for _, house in df.iterrows():
#     predictions.append((int(estimate_house_sales_price(house.bedrooms, house.sq_meter, neighborhood_mapping[house.neighborhood])), house.sales_price))
# predictions

predicted_values = [int(estimate_house_sales_price(house.bedrooms, house.sq_meter, neighborhood_mapping[house.neighborhood])) for _, house in df.iterrows()]
predicted_values

[246433, 98699, 104853, 67918, 246431]

In [76]:
mean_absolute_error(real_values, predicted_values)

71305.6

Create a weight dict to start with

In [77]:
weights_data = {
    'bedroom_weight' : 0.8,
    'sqm_weight' : 1.2,
    'neighborhood_weight' : 0.5,
    'bias' : 200
}

In [78]:
def estimate_house_sales_price(num_of_bedrooms, sqm, neighborhood, weights):
    price = 0

    price += num_of_bedrooms * weights['bedroom_weight']
    price += sqm * weights['sqm_weight']
    price += neighborhood * weights['neighborhood_weight']
    price += weights['bias']
    return price


In [93]:
def predict(X, weights, bias):
    return [sum(w*Xj for w, Xj in zip(weights, Xi)) + bias for Xi in X]

def gradient_descent(X, y, weights, bias, learning_rate, num_iterations):
    # pass
    num_samples, num_features = len(X), len(X[0])

    for _ in range(num_iterations):
        predictions = predict(X, weights, bias)
        print(predictions)

    #     errors = [pred - actual for pred, actual in zip(predictions, y)]

    #     gradient_weights = [sum(err*Xi[i] for err, Xi in zip(errors, X)) / num_samples for i in range(num_features)]
    #     gradient_bias = sum(errors) / num_samples

    #     weights = [w - learning_rate * gw for w, gw in zip(weights, gradient_weights)]
    #     bias = bias - learning_rate * gradient_bias
    # return weights, bias

One Hot Encoding

In [80]:
df_encoded = pd.get_dummies(df, columns = ['neighborhood'])
df_encoded

Unnamed: 0,bedrooms,sq_meter,sales_price,neighborhood_Cool Town,neighborhood_Normalville,neighborhood_Skid Row
0,3,200,250000,False,True,False
1,2,80,300000,True,False,False
2,2,85,150000,False,True,False
3,1,55,78000,False,True,False
4,4,200,150000,False,False,True


In [81]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   bedrooms                  5 non-null      int64
 1   sq_meter                  5 non-null      int64
 2   sales_price               5 non-null      int64
 3   neighborhood_Cool Town    5 non-null      bool 
 4   neighborhood_Normalville  5 non-null      bool 
 5   neighborhood_Skid Row     5 non-null      bool 
dtypes: bool(3), int64(3)
memory usage: 267.0 bytes


In [82]:
X = df_encoded.drop(columns=['sales_price']).values #Features
X

array([[3, 200, False, True, False],
       [2, 80, True, False, False],
       [2, 85, False, True, False],
       [1, 55, False, True, False],
       [4, 200, False, False, True]], dtype=object)

In [83]:
y = df_encoded['sales_price'].values #target
y

array([250000, 300000, 150000,  78000, 150000], dtype=int64)

In [84]:
weights =[
    weights_data['bedroom_weight'],
    weights_data['sqm_weight']] + \
    [weights_data['neighborhood_weight']] * 3

weights

[0.8, 1.2, 0.5, 0.5, 0.5]

In [85]:
# [1,2,3] + [4] * 3

[1,2,3] + \
[4] * 3


[1, 2, 3, 4, 4, 4]

In [86]:
bias = weights_data['bias']
bias

200

In [87]:
learning_rate = 0.01
num_iterations = 100

In [None]:
gradient_descent(X, y, weights, bias, learning_rate, num_iterations)