Dataset is taken from https://raw.githubusercontent.com/bdemeshev/em301/master/datasets/flats_moscow.txt
There is description as well https://github.com/bdemeshev/em301/blob/master/datasets/flats_moscow_description.txt

In [1]:
import numpy as np
import pandas as pd

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold

In [2]:
df = pd.read_csv('flats.csv', sep='\t')
df.head()

Unnamed: 0,n,price,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,code
0,1,81,58,40,6.0,12.5,7,1,1,1,3
1,2,75,44,28,6.0,13.5,7,1,0,1,6
2,3,128,70,42,6.0,14.5,3,1,1,1,3
3,4,95,61,37,6.0,13.5,7,1,0,1,1
4,5,330,104,60,11.0,10.5,7,0,1,1,3


In [3]:
df = df.drop(columns=['n', 'metrdist', 'walk', 'brick', 'floor'])

In [4]:
pipeline = Pipeline([
    ('code_encoder', ColumnTransformer([
        ('onehot', OneHotEncoder(sparse=False, drop='first'), ['code']),
    ], remainder='passthrough')),
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

In [5]:
gridsearch = GridSearchCV(
    pipeline,
    param_grid={
        'ridge__alpha': np.logspace(-4, 2, 5000),
    },
    scoring='neg_mean_squared_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=5,
)

In [6]:
X, y = df[df.columns[1:]], df[['price']]

gridsearch.fit(X, y)

Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 492 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 2508 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 5100 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 8268 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 12012 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 16332 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 21228 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 25000 out of 25000 | elapsed:   49.0s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('code_encoder',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehot',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop='first',
                                                                                       dtype=<class 'numpy.float64'>,
                                                                          

In [7]:
gridsearch.best_params_, gridsearch.best_score_

({'ridge__alpha': 28.99295346572567}, -884.4878959560294)

In [8]:
best_model = gridsearch.best_estimator_

In [9]:
best_model.steps[2][1].coef_

array([[ -0.0966974 ,   2.04704369,   1.4340808 ,  -7.35654265,
         -5.32879845,  -4.77745472,  -1.07476658,  23.45943555,
         10.87235699,   5.56740703, -11.51677216]])

In [10]:
def mape(y_true, y_pred):
    return 100 * np.mean(np.abs((y_true - y_pred) / y_true))

In [12]:
print(mape(y, best_model.predict(X)))

price    13.31445
dtype: float64


In [13]:
with open('model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)