In [1544]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1545]:
#reading csv
content = pd.read_csv("Housing.csv")

In [1546]:
#displaying information to see what is inside 
content.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [1547]:
#choosing only neccesary columns from original DataFrame
content = content[["price", "area", "bedrooms","bathrooms",]]
content.head()

Unnamed: 0,price,area,bedrooms,bathrooms
0,13300000,7420,4,2
1,12250000,8960,4,4
2,12250000,9960,3,2
3,12215000,7500,4,2
4,11410000,7420,4,1


In [1548]:
#types checking to be sure they are numbers not text
content.dtypes

price        int64
area         int64
bedrooms     int64
bathrooms    int64
dtype: object

In [1549]:
#Preparing data 
#importing MinMaxScaler to normalize y (price) because integers in this column are 
#significantly bigger than numbers in DataFrame X (I tried to do it without normalizing and the 
# models coeffs and MSE were so big...). 
from sklearn.preprocessing import MinMaxScaler

X = content[["area","bedrooms","bathrooms"]]
y = content["price"]

#calling scaler method
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

#scaling X because X["area"] has quite big numbers in comparison with others and without scaling w coefficients are also huge in output
y_scaled = scaler_y.fit_transform(y.values.reshape(-1,1)).flatten()
X_scaled = scaler_X.fit_transform(X)
X_scaled = np.c_[X_scaled, np.ones(X.shape[0])]

#checking sizes of X and y to be sure that dot product is possible to be fulfilled
print("X_scaled shape:", X_scaled.shape)
print("y_scaled shape: ", y_scaled.shape)
print("X_scaled type is: ", type(X_scaled))
print("y_scaled type is: ", type(y_scaled))

X_scaled shape: (545, 4)
y_scaled shape:  (545,)
X_scaled type is:  <class 'numpy.ndarray'>
y_scaled type is:  <class 'numpy.ndarray'>


In [1550]:
#напишіть функцію гіпотези лінійної регресії у векторному вигляді;
w = np.linalg.inv(X_scaled.T@X_scaled) @ X_scaled.T @ y_scaled
print("w shape:", w.shape)
print("w type:", type(w),"\n")

#predicting y (price) to use it in MSE function 
y_pred = X_scaled @ w
print("y_pred shape: ", y_pred.shape)
print("y_pred type: ", type(y_pred),"\n")


# Денормализация y_pred и y_scaled обратно в их исходные значения
y_pred_denorm = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_denorm = scaler_y.inverse_transform(y_scaled.reshape(-1, 1)).flatten()


def mse(y_true, y_pred):
   return np.mean((y_true - y_pred)**2)

print("Coefficients of linear regression using analytical formula: ")
print(dict(zip(X.columns, map(float, w.flatten()))),"\n")
print("MSE: ", mse(y_denorm, y_pred_denorm))


w shape: (4,)
w type: <class 'numpy.ndarray'> 

y_pred shape:  (545,)
y_pred type:  <class 'numpy.ndarray'> 

Coefficients of linear regression using analytical formula: 
{'area': 0.4771426895812334, 'bedrooms': 0.17611256873124567, 'bathrooms': 0.36001285656909104} 

MSE:  1791170049977.3193


In [1551]:
#створіть функцію для обчислення функції втрат у векторному вигляді;
def mse_func(X,y,w):    
    return np.linalg.norm(X @ w - y)**2 / len(X)

#derivative of the mse_func function
def grad(X, y, w):
    return 2 * X.T @ (X@w-y) / len(X)

In [1552]:
#реаліз градієнтного спуску;

# choose random coefficients, gamma = learning rate, max_iter = max amount of iterations, eps = accuracy I want to achieve
w = np.array([0.5, 0, 0, 0])
gamma = 1e-3
max_iter = 10000
eps = 1e-12

#calculating  current mse with random weights w
f_old = mse_func(X_scaled,y_scaled,w)

#making one step to the side against gradient
w = w - gamma*grad(X_scaled,y_scaled,w)

#calculation of a new error using above calculated weights w
f_new = mse_func(X_scaled,y_scaled,w)

# innitialisation of the first iteration 
i=1

#inside the loop we will calculate weights stepping against gradient while changing the mse function bigger than eps, or max_iter achieved
while np.abs(f_new - f_old) > eps and i < max_iter:
    w = w-gamma*grad(X_scaled,y_scaled,w)

    i +=1
    f_old = f_new
    f_new = mse_func(X_scaled,y_scaled,w)

#after we quit the loop printing weights
print(w)

# predictions based on scaled data
y_pred = X_scaled @ w

# denormalisation of predicted data
y_pred_denorm = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_denorm = scaler_y.inverse_transform(y_scaled.reshape(-1, 1)).flatten()

# MSE
def mse_denorm_func(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)

# MSE based on denormalized values
mse_denorm = mse_denorm_func(y_denorm, y_pred_denorm)

# Вывод денормализованного MSE
print("MSE:", mse_denorm)


[0.52713663 0.11506405 0.16681094 0.07467153]
MSE: 1962965736458.9568


In [1553]:
#It was not the Home task, but I tried to use polinomial function gerression to see if i can have lower MSE and better prediction of y(price)
#adding to the X DataFrame squared features of area, bedrooms, bathrooms. The realisation is the same as above only:
#X is longer for 3 squared features(squared_area, squared_bedrooms, squared_bathrooms) and accordingly innitialized w is longer.

#Could you pls explaine me if it is correct to do this way trying to achieve better correspondence?

#X_for_polynom = X.copy()
#X_for_polynom.insert(3, "squared_area", X_for_polynom["area"]**2)
#X_for_polynom.insert(4, "squared_bedrooms",X_for_polynom["bedrooms"]**2)
#X_for_polynom.insert(5, "squared_bathrooms",X_for_polynom["bathrooms"]**2 )

#w = np.array([1,1,1,1,1,1,1])

#gamma = 1e-3
#max_iter = 10000
#eps = 1e-12

#f_old = mse_func(X_for_polynom,y,w)
#w = w - gamma*grad(X_for_polynom,y,w)
#f_new = mse_func(X_for_polynom,y,w)

#i = 1

#while np.abs(f_new-f_old)>eps and i < max_iter:
#    w = w-gamma*grad(X_for_polynom,y,w)
#    i+=1
#    f_old=f_new
#    f_new = mse_func(X_for_polynom,y,w)

#print(w)
#print("MSE: ", mse_func(X_for_polynom,y,w), "\n")

#X_for_polynom.head()

In [1554]:
#для перевірки спрогнозованих значень, використайте LinearRegression з бібліотеки scikit-learn 
#та порівняйте результати. 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#deletion of ones column because LinearRegression adds this column automatically
#X = X.drop(columns=["Intercept"])

#converting X from DataFrame to the ndarray
X = X.values

regressor = LinearRegression().fit(X, y)

print("Weights: ", regressor.coef_)
print("Intercept: ", regressor.intercept_, "\n")

#Could you please tell me: why i got this negative value of R-squared?
print("Models score:", regressor.score(X, y),"\n")

predictions = regressor.predict(X)
print(predictions.shape)

#prediction_denorm = scaler.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
#y_denorm = scaler.inverse_transform(y.reshape(-1,1)).flatten

print("MSE:", mean_squared_error(y, predictions), "\n")

#print("Оригинальные значения y:", y_denorm[:10])
#print("Предсказанные значения:", prediction_denorm[:10])

Weights:  [3.78762754e+02 4.06820034e+05 1.38604950e+06]
Intercept:  -173171.60763263423 

Models score: 0.4870830667058762 

(545,)
MSE: 1791170049977.3193 



In [1557]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Использование Ridge Regression вместо обычной Linear Regression
ridge_regressor = Ridge(alpha=0.1)  # alpha — это коэффициент регуляризации
ridge_regressor.fit(X_scaled, y_scaled)

# Вывод весов
print("Weights: ", ridge_regressor.coef_)
print("Intercept: ", ridge_regressor.intercept_, "\n")

# Прогнозирование
predictions_scaled = ridge_regressor.predict(X_scaled)

# Денормализация predictions and y для правильного сравнения
predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
y_denorm = scaler_y.inverse_transform(y_scaled.reshape(-1, 1)).flatten()

# Оценка модели
print("Models score:", ridge_regressor.score(X_scaled, y_scaled),"\n")
print("MSE (Ridge Regression):", mean_squared_error(y_denorm, predictions), "\n")


Weights:  [0.47362676 0.17587376 0.35835022 0.        ]
Intercept:  0.04392578432116884 

Models score: 0.487066923842824 

MSE (Ridge Regression): 1791226422873.091 



In [1556]:
#loss function and gradient function with ridge normalisation


