In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
from typing import Tuple
import pandas_profiling
import pandas as pd
import numpy as np

In [0]:
def encode_categories(source_data, column_name):
    if column_name not in source_data:
      return
    le = preprocessing.LabelEncoder()
    le.fit(source_data[column_name])
    categories = le.classes_

    view_encoder = OneHotEncoder(categories=[categories], handle_unknown='ignore', sparse=False)
    new_view: np.ndarray = view_encoder.fit_transform(source_data[column_name].values.reshape(-1, 1))
    for i, c in enumerate(view_encoder.categories_[0]):
        source_data.insert(1, f'{column_name} №{c}', new_view[:, i])
    return source_data.drop([column_name], axis=1)

In [0]:
data = pd.read_csv('./drive/My Drive/prices_original.csv')

data = data.drop([
    'id', 'date', 'lat', 'long', 'sqft_living',
    'yr_built', 'yr_renovated', 'waterfront'
], axis=1)

data = encode_categories(data, 'view')
data = encode_categories(data, 'condition')
data = encode_categories(data, 'floors')
data = encode_categories(data, 'zipcode')

In [0]:
def separate_prices(to_separate_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    x_arr = to_separate_data[:, list(range(1, len(to_separate_data[0])))]
    y_arr = to_separate_data[:, [0]]

    return x_arr, y_arr

In [0]:
np_data = data.to_numpy()

train_threshold_index = int(len(np_data) * 0.83284)

train_data = np_data[:train_threshold_index]
test_data = np_data[train_threshold_index:]

In [0]:
scaler = preprocessing.StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

train_inputs, train_prices = separate_prices(train_data)
test_inputs, test_prices = separate_prices(test_data)

In [8]:
model = Ridge(alpha=.07, max_iter=90000000)
print(model.fit(train_inputs, train_prices))

Ridge(alpha=0.07, copy_X=True, fit_intercept=True, max_iter=90000000,
      normalize=False, random_state=None, solver='auto', tol=0.001)


In [9]:
print('score', model.score(test_inputs, test_prices))

score 0.8109338538402734


In [10]:
predicted_sklearn = model.predict(test_inputs)

scaled_back_sklearn_predicted = scaler.inverse_transform(np.concatenate((predicted_sklearn, test_inputs), axis=1))
scaled_back_expected = scaler.inverse_transform(np.concatenate((test_prices, test_inputs), axis=1))

print('SKLEARN mean_absolute_error', mean_absolute_error(scaled_back_sklearn_predicted, scaled_back_expected))

SKLEARN mean_absolute_error 1006.1985796755744


In [44]:
# define own regression model
class HousesModel:
  def __init__(self, alpha=0.2, iters = 1000):
    self._alpha = alpha
    self._iters = iters
  
  def compute_cost(self, X, y, coefs):
    tobesummed = np.power(((X @ coefs)-y),2)
    return np.sum(tobesummed)/(2*len(X))

  def fit(self, X, y):
    # +1 for bias
    self._theta = np.random.uniform(0, 1, 1 + np.size(X, axis=1))
    
    ones = np.ones([X.shape[0],1])
    X = np.concatenate((ones,X),axis=1)
    y = y.reshape(-1)

    theta = self._theta
    alpha = self._alpha
    iters = self._iters
    
    print('X shape', X.shape)
    print('theta s', theta.shape)
    print('y shape', y.shape)
    print()

    for i in range(iters):
      prediction = np.dot(X, theta)
      theta = theta - (alpha/len(X)) * (X.T.dot(prediction - y))
      alpha *= 0.999999

      if i % (iters / 10) == 0:
        print('MSE:', self.compute_cost(X, y, theta))
    
    self._theta = theta
    return self
  
  def predict(self, X):
    ones = np.ones([X.shape[0],1])
    X = np.concatenate((ones,X),axis=1)
    return X.dot(self._theta).reshape(-1, 1)
  
own_model = HousesModel()
own_model.fit(train_inputs, train_prices)

predicted_own = own_model.predict(test_inputs)

scaled_back_own_predicted = scaler.inverse_transform(np.concatenate((predicted_own, test_inputs), axis=1))

print('OWN mean_absolute_error', mean_absolute_error(scaled_back_own_predicted, scaled_back_expected))

X shape (18000, 95)
theta s (95,)
y shape (18000,)

MSE: 3.5730860473576564
MSE: 0.10062523598835989
MSE: 0.10056243207115968
MSE: 0.10056221877071983
MSE: 0.10056221804163559
MSE: 0.10056221803914206
MSE: 0.10056221803913351
MSE: 0.1005622180391335
MSE: 0.1005622180391335
MSE: 0.10056221803913348
OWN mean_absolute_error 1006.2006206158147
