In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
from typing import Tuple
import pandas_profiling
import pandas as pd
import numpy as np

In [0]:
def encode_categories(source_data, column_name):
    if column_name not in source_data:
      return
    le = preprocessing.LabelEncoder()
    le.fit(source_data[column_name])
    categories = le.classes_

    view_encoder = OneHotEncoder(categories=[categories], handle_unknown='ignore', sparse=False)
    new_view: np.ndarray = view_encoder.fit_transform(source_data[column_name].values.reshape(-1, 1))
    for i, c in enumerate(view_encoder.categories_[0]):
        source_data.insert(1, f'{column_name} №{c}', new_view[:, i])
    return source_data.drop([column_name], axis=1)

In [0]:
data = pd.read_csv('./drive/My Drive/prices_original.csv')

data = data.drop([
    'id', 'date', 'lat', 'long', 'sqft_living',
    'yr_built', 'yr_renovated', 'waterfront'
], axis=1)

data = encode_categories(data, 'view')
data = encode_categories(data, 'condition')
data = encode_categories(data, 'floors')
data = encode_categories(data, 'zipcode')

In [0]:
def separate_prices(to_separate_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    x_arr = to_separate_data[:, list(range(1, len(to_separate_data[0])))]
    y_arr = to_separate_data[:, [0]]

    return x_arr, y_arr

In [0]:
np_data = data.to_numpy()

train_threshold_index = int(len(np_data) * 0.53284)

train_data = np_data[:train_threshold_index]
test_data = np_data[train_threshold_index:]

In [0]:
scaler = preprocessing.StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

train_inputs, train_prices = separate_prices(train_data)
test_inputs, test_prices = separate_prices(test_data)

In [31]:
model = Ridge(alpha=.07, max_iter=90000000)
print(model.fit(train_inputs, train_prices))

Ridge(alpha=0.07, copy_X=True, fit_intercept=True, max_iter=90000000,
      normalize=False, random_state=None, solver='auto', tol=0.001)


In [42]:
print('score', model.score(test_inputs, test_prices))

score 0.8151131779510338


In [41]:
predicted = model.predict(test_inputs)

scaled_back_predicted = scaler.inverse_transform(np.concatenate((predicted, test_inputs), axis=1))
scaled_back_expected = scaler.inverse_transform(np.concatenate((test_prices, test_inputs), axis=1))

print('mean_absolute_error', mean_absolute_error(scaled_back_predicted, scaled_back_expected))

mean_absolute_error 1023.223559408838
