In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [79]:
# Load the housing dataset (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv('Housing.csv')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## напишіть функцію гіпотези лінійної регресії у векторному вигляді;



In [80]:
def linear_regression_hypothesis(w, X):
  """Evaluate the linear-regression hypothesis h(X) = [1, X] @ w in vector form.

  Parameters
  ----------
  w : array-like of shape (n_features + 1,)
      Weights; ``w[0]`` is the intercept, the remaining entries are the
      per-feature slopes.
  X : pandas Series/DataFrame or array-like
      Feature values. A scalar or 1-D input is treated as a single feature
      column; a 2-D input is used as an (n_samples, n_features) matrix.

  Returns
  -------
  numpy.ndarray
      One prediction per row of ``X``.
  """
  # np.asarray accepts pandas objects as well as plain arrays/lists, so the
  # function no longer depends on the pandas-only ``.values`` attribute.
  features = np.asarray(X, dtype=float)
  if features.ndim < 2:
    # Single feature: turn it into one column. 2-D inputs are kept as they
    # are so that several features are not flattened into one column.
    features = features.reshape(-1, 1)
  vector_of_ones = np.ones((features.shape[0], 1))
  # Prepend the column of ones so the intercept is handled by the dot product.
  X_with_intercept = np.hstack((vector_of_ones, features))
  return np.dot(X_with_intercept, w)

## створіть функцію для обчислення функції втрат у векторному вигляді;

In [81]:
def mean_squared_coef(y_predicted, y):
  """Half mean squared error: sum((y_predicted - y)^2) / (2 * len(y))."""
  squared_residuals = np.square(y_predicted - y)
  sample_count = len(y)
  return np.sum(squared_residuals) / (2 * sample_count)

## реалізуйте один крок градієнтного спуску;

In [82]:
def gradient_descent(X, y, iterations=1000, learning_rate=0.000001, stopping_threashold= 1e-6):
  """Fit y ~ w_0 + w_1 * x for a single feature by batch gradient descent.

  Parameters
  ----------
  X : pandas Series
      Values of the single feature (passed unchanged to
      ``linear_regression_hypothesis``).
  y : array-like
      Target values, one per element of ``X``.
  iterations : int
      Maximum number of update steps.
  learning_rate : float
      Step size applied to both partial derivatives.
  stopping_threashold : float
      The loop stops once the cost changes by no more than this amount
      between two consecutive iterations.

  Returns
  -------
  tuple
      ``(w_0, w_1, costs)``: the final intercept, the final slope and the
      list of cost values recorded before each update.
  """
  w_0 = 0.01
  w_1 = 0.1

  # Plain NumPy copies so the gradient arithmetic is positional and does not
  # depend on pandas index alignment between X and y.
  x_values = np.asarray(X, dtype=float).reshape(-1)
  y_values = np.asarray(y, dtype=float).reshape(-1)
  n = len(x_values)

  costs = []
  previous_cost = None

  for _ in range(iterations):
    y_predicted = linear_regression_hypothesis((w_0, w_1), X)
    # Use the notebook's own loss, J = sum((h - y)^2) / (2n); the sklearn
    # metric of a similar name is only imported further down the notebook.
    cost = mean_squared_coef(y_predicted, y_values)

    # ``is not None`` (not truthiness) so that a previous cost of exactly 0
    # still takes part in the convergence test.
    if previous_cost is not None and abs(previous_cost - cost) <= stopping_threashold:
      break
    previous_cost = cost

    costs.append(cost)

    # Partial derivatives of J = sum((h - y)^2) / (2n):
    #   dJ/dw_0 = -(1/n) * sum(y - h),  dJ/dw_1 = -(1/n) * sum(x * (y - h))
    residuals = y_values - y_predicted
    w_0_derivative = -np.sum(residuals) / n
    w_1_derivative = -np.sum(x_values * residuals) / n

    w_0 -= learning_rate * w_0_derivative
    w_1 -= learning_rate * w_1_derivative
  return w_0, w_1, costs

In [83]:
# Single-feature setup: predict price from area only.
X, y = data['area'], data['price']

# Disabled experiment: run the hand-written gradient descent and plot its cost curve.
#w_0, w_1, costs = gradient_descent(X, y, iterations=5, learning_rate=0.00000001, stopping_threashold= 1e-6)
#plt.plot(costs)

## знайдіть найкращі параметри $\vec{w}$ для датасету, прогнозуючи ціну на будинок залежно від площі, кількості ванних кімнат та кількості спалень;


In [84]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [85]:
# Preview the frame again before choosing the model features.
data.head()
# Features used for the multivariate model: area, bathrooms, bedrooms

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [86]:
# Multivariate setup: three numeric predictors of the price.
feature_columns = ['area', 'bedrooms', 'bathrooms']
X = data[feature_columns]
y = data['price']
# Summary statistics show how different the feature scales are.
display(X.describe())

Unnamed: 0,area,bedrooms,bathrooms
count,545.0,545.0,545.0
mean,5150.541284,2.965138,1.286239
std,2170.141023,0.738064,0.50247
min,1650.0,1.0,1.0
25%,3600.0,2.0,1.0
50%,4600.0,3.0,1.0
75%,6360.0,3.0,2.0
max,16200.0,6.0,4.0


In [87]:
# Unscaled features are re-selected from `data` instead of being read from `X`,
# because `X` is rebound at the bottom of this cell; this keeps the cell
# safe to re-run.
raw_X = data[['area', 'bedrooms', 'bathrooms']]
minmax_X = pd.DataFrame(MinMaxScaler().fit_transform(raw_X))
standard_X = pd.DataFrame(StandardScaler().fit_transform(raw_X))

# (label, feature matrix) pairs. The loop variable below is deliberately NOT
# called `X`: shadowing `X` would make every model after the first one fit
# its 'orig_X' entry on whatever matrix the previous pass left behind.
feature_sets = (('orig_X', raw_X), ('minmax_X', minmax_X), ('standard_X', standard_X))

# NOTE: results are keyed by their MSE because later cells look them up that
# way; two fits with exactly the same cost overwrite each other.
costs = {}
for func in (LinearRegression, Lasso, Ridge):
  for X_name, features in feature_sets:
    model = func()
    model.fit(features, y)

    intersept = model.intercept_
    slope = model.coef_

    y_predicted = features @ slope.T + intersept

    # sklearn metrics take (y_true, y_pred).
    cost = mean_squared_error(y, y_predicted)
    costs[cost] =  {'model': model,
                    'regularization': X_name,
                    'intersept': intersept,
                    'slope': slope}

# Later cells build the design matrix from `X` and compare it with the fit on
# the standardized features, so bind that matrix explicitly instead of
# relying on a leaked loop variable.
X = standard_X

In [88]:
# `costs` is keyed by MSE, so the smallest key identifies the best fit.
ranked_costs = sorted(costs)
lowest_cost_from_descent = ranked_costs[0]
best_approach = costs[lowest_cost_from_descent]
# Pack the intercept and the slopes into one vector [w_0, w_1, ..., w_k].
gradient_descent_weights = np.concatenate(
    ([best_approach['intersept']], best_approach['slope'])
)
best_approach

{'model': LinearRegression(),
 'regularization': 'standard_X',
 'intersept': 4766729.247706422,
 'slope': array([821214.14349519, 299983.57107963, 695808.52272537])}

## знайдіть ці ж параметри за допомогою аналітичного рішення;

In [89]:
# Design matrix for the normal equation: a leading column of ones (intercept
# term) followed by the standardized features. `standard_X` is named
# explicitly because the selected fit above was trained on it.
# NOTE(review): keep this in sync with best_approach['regularization'].
X_1 = np.column_stack((np.ones(len(standard_X)), standard_X))
X_1

array([[ 1.        ,  1.04672629,  1.40341936,  1.42181174],
       [ 1.        ,  1.75700953,  1.40341936,  5.40580863],
       [ 1.        ,  2.21823241,  0.04727831,  1.42181174],
       ...,
       [ 1.        , -0.70592066, -1.30886273, -0.57018671],
       [ 1.        , -1.03338891,  0.04727831, -0.57018671],
       [ 1.        , -0.5998394 ,  0.04727831, -0.57018671]])

In [90]:
# Normal equation (X^T X) w = X^T y, solved as a linear system rather than by
# forming the explicit inverse, which is numerically less accurate.
analytical_weights = np.linalg.solve(X_1.T @ X_1, X_1.T @ np.asarray(y))
analytical_weights

array([4766729.24770642,  821214.14349519,  299983.57107963,
        695808.52272538])

## порівняйте отримані результати.

In [91]:
# Show both weight vectors side by side (intercept first, then one slope per feature).
analytical_weights, gradient_descent_weights

(array([4766729.24770642,  821214.14349519,  299983.57107963,
         695808.52272538]),
 array([4766729.24770642,  821214.14349519,  299983.57107963,
         695808.52272537]))

In [92]:
# Check that the weights agree to within an absolute tolerance of 1e-6.
# rtol is set to 0 because np.allclose otherwise also allows rtol * |b|
# (1e-5 by default), which for weights of order 1e6 would swamp atol.
np.allclose(analytical_weights, gradient_descent_weights, rtol=0, atol=1e-6)

True

In [93]:
# Predictions from both weight vectors on the same design matrix.
y_analytical_predicted = X_1 @ analytical_weights
y_gradient_predicted = X_1 @ gradient_descent_weights

# sklearn metrics take (y_true, y_pred); MSE and MAE are symmetric, so the
# values are the same either way, but the order now matches the r2_score
# calls below, where it does matter.
analytical_mse = mean_squared_error(y, y_analytical_predicted)
gradient_mse = mean_squared_error(y, y_gradient_predicted)

analytical_mae = mean_absolute_error(y, y_analytical_predicted)
gradient_mae = mean_absolute_error(y, y_gradient_predicted)

analytical_r2 = r2_score(y, y_analytical_predicted)
gradient_r2 = r2_score(y, y_gradient_predicted)

print(analytical_mse, gradient_mse)

print(analytical_mae, gradient_mae)

print(analytical_r2, gradient_r2)

1791170049977.3193 1791170049977.3193
999109.7245946018 999109.7245946018
0.4870830667058762 0.4870830667058762


# Conclusion
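1.   The analytical (normal-equation) weights and the weights of the best scikit-learn fit match: `np.allclose` returns `True`, and the printed values differ only in the 8th decimal place. Note that `gradient_descent_weights` holds the `LinearRegression` coefficients on standardized features, not the output of the hand-written gradient descent.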
2.   Mean squared error of both approaches is 1791170049977.3193
3.   Mean absolute error of both approaches is 999109.7245946018
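4.   R2 score is 0.4870830667058762: the model explains about 49% of the variance in price. R2 is not a probability, so it cannot be compared with a coin toss; the baseline is 0 (always predicting the mean price) and 1 would be a perfect fit.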



