In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib as jb
from sklearn.ensemble import BaggingRegressor, StackingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [15]:
df = pd.read_csv("data/flats_moscow.csv", index_col="Unnamed: 0")

#### price - цена в 1000$
#### totsp - общая площадь квартиры (кв.м)
#### livesp - жилая площадь квартиры (кв.м)
#### kitsp - площадь кухни (кв.м)
#### dist - расстояние от центра Москвы в км
#### metrdist - расстоние до метро в минутах
#### walk - можно ли пешком до метро

In [16]:
df.drop(["brick", "floor", "code"], axis=1)

Unnamed: 0,price,totsp,livesp,kitsp,dist,metrdist,walk
1,81,58,40,6.0,12.5,7,1
2,75,44,28,6.0,13.5,7,1
3,128,70,42,6.0,14.5,3,1
4,95,61,37,6.0,13.5,7,1
5,330,104,60,11.0,10.5,7,0
...,...,...,...,...,...,...,...
2036,110,77,45,10.0,12.0,5,0
2037,95,60,43,6.0,9.0,5,0
2038,95,60,46,5.0,10.5,5,1
2039,129,76,48,10.0,12.5,5,0


In [17]:
X = df.drop("price", axis=1)
y = df["price"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
scaler = StandardScaler()
scaler.fit(X_train)

In [20]:
jb.dump(scaler, "FittedScaler.joblib")

['FittedScaler.joblib']

In [21]:
X_train = scaler.transform(X_train)

In [22]:
baseline_model = LinearRegression()

In [23]:
baseline_model.fit(X_train, y_train)

In [24]:
baseline_rmse_score = mean_squared_error(y_true=y_train, y_pred=baseline_model.predict(X_train)) ** 0.5

In [25]:
baseline_rmse_score

27.946866745559156

In [26]:
jb.dump(baseline_model, "BaseLineModel.joblib")

['BaseLineModel.joblib']

In [27]:
X_test = scaler.transform(X_test)

In [28]:
test_score = mean_squared_error(y_test, baseline_model.predict(X_test)) ** 0.5
test_score

34.19425531399671

In [29]:
# RandomForestRegressor

param_grid = {
    "n_estimators": [10, 15, 30, 50,],
    "max_depth": [None, 3, 5, 10, 12],
    "min_samples_leaf": [1, 2, 3, 5, 6]
}

random_forest_model = RandomForestRegressor()
grid_search = GridSearchCV(random_forest_model, param_grid=param_grid)
grid_search.fit(X_train, y_train)

In [30]:
grid_search.best_params_

{'max_depth': None, 'min_samples_leaf': 3, 'n_estimators': 50}

In [31]:
random_forest_model = RandomForestRegressor(max_depth=1, min_samples_leaf=3, n_estimators=50)
random_forest_model.fit(X_train, y_train)

In [32]:
jb.dump(random_forest_model, "RandomForestModel.joblib")

['RandomForestModel.joblib']

In [33]:
mean_squared_error(y_test, random_forest_model.predict(X_test)) ** 0.5

43.37442518549928

In [None]:
# GradientBoostingRegressor
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 30, 50, 100]
    }
gradient_boosting_model = GradientBoostingRegressor()
grid_search_GB = GridSearchCV(gradient_boosting_model, param_grid=parameters)
grid_search_GB.fit(X_train, y_train)
grid_search_GB.best_params_

In [69]:
gradient_boosting_model = GradientBoostingRegressor(learning_rate=0.6, max_depth=1, n_estimators=100)

In [70]:
gradient_boosting_model.fit(X_train, y_train)

In [40]:
jb.dump(gradient_boosting_model, "GradientBoostingModel.joblib")

['GradientBoostingModel.joblib']

In [71]:
(mean_squared_error(y_test, gradient_boosting_model.predict(X_test)) ** 0.5) * 1000 * 85

2255245.468113896

In [44]:
gradient_boosting_model = GradientBoostingRegressor()
gradient_boosting_model.fit(X_train, y_train)
(mean_squared_error(y_test, gradient_boosting_model.predict(X_test)) ** 0.5) * 1000 * 85

2549970.905191561