In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,Адрес,Жилая,Кухня,Метро,Общая,Цена,Цена за квадрат,Этаж,Этажность здания,Время,Способ передвижения,Широта дома,Долгота дома,Широта метро,Долгота долгота
0,"Юбилейная улица, 16, Мытищи, Московская област...",24.0,10.0,Медведково,48.0,6550000.0,136458.0,12,25,15.0,на транспорте,55.913162,37.713063,55.887473,37.661527
1,"Кутузовский проспект, 4/2, Москва, Россия",45.0,,Киевская,64.0,26000000.0,406250.0,4,7,13.0,пешком,55.750727,37.562272,55.743588,37.565587
2,"1-й Колобовский переулок, 13/14с3, Москва, Россия",23.4,8.9,Трубная,40.0,18000000.0,450000.0,2,3,5.0,пешком,55.769216,37.617411,55.767939,37.621884
3,"Мосфильмовская улица, 8, Москва, Россия",63.0,20.0,Парк Победы,98.0,59900000.0,611224.0,33,54,11.0,на транспорте,55.723257,37.52766,55.736934,37.516009
4,"Комсомольский проспект, 42с2, Москва, Россия",48.0,,Спортивная,67.3,37900000.0,563150.0,4,5,10.0,пешком,55.721731,37.572899,55.723247,37.564105


In [3]:
df= df.drop(columns='Адрес', axis=1)

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['Способ передвижения'] = le.fit_transform(df['Способ передвижения'])

In [5]:
df.head()

Unnamed: 0,Жилая,Кухня,Метро,Общая,Цена,Цена за квадрат,Этаж,Этажность здания,Время,Способ передвижения,Широта дома,Долгота дома,Широта метро,Долгота долгота
0,24.0,10.0,Медведково,48.0,6550000.0,136458.0,12,25,15.0,0,55.913162,37.713063,55.887473,37.661527
1,45.0,,Киевская,64.0,26000000.0,406250.0,4,7,13.0,1,55.750727,37.562272,55.743588,37.565587
2,23.4,8.9,Трубная,40.0,18000000.0,450000.0,2,3,5.0,1,55.769216,37.617411,55.767939,37.621884
3,63.0,20.0,Парк Победы,98.0,59900000.0,611224.0,33,54,11.0,0,55.723257,37.52766,55.736934,37.516009
4,48.0,,Спортивная,67.3,37900000.0,563150.0,4,5,10.0,1,55.721731,37.572899,55.723247,37.564105


## Добавим расстояние от дома до центра города

In [7]:
from geopy import distance

lon_center, lat_center = 55.753595, 37.621031

In [9]:
print(distance.distance((55.913162, 37.713063), (lon_center, lat_center)).km)

18.678507004392735


In [16]:
df['До центра'] = df[['Широта дома', 'Долгота дома']].apply(
    lambda x: distance.distance((x[0],x[1]),(lon_center, lat_center)).km, axis = 1
)

In [24]:
df['До метро'] = df[['Широта дома', 'Долгота дома', 'Широта метро', 'Долгота долгота']].apply(
    lambda x: distance.distance((x[0],x[1]),(x[2],x[3])).km, axis = 1
)

In [25]:
df.head()

Unnamed: 0,Жилая,Кухня,Метро,Общая,Цена,Цена за квадрат,Этаж,Этажность здания,Время,Способ передвижения,Широта дома,Долгота дома,Широта метро,Долгота долгота,До центра,До метро
0,24.0,10.0,Медведково,48.0,6550000.0,136458.0,12,25,15.0,0,55.913162,37.713063,55.887473,37.661527,18.678507,4.309681
1,45.0,,Киевская,64.0,26000000.0,406250.0,4,7,13.0,1,55.750727,37.562272,55.743588,37.565587,3.703355,0.821647
2,23.4,8.9,Трубная,40.0,18000000.0,450000.0,2,3,5.0,1,55.769216,37.617411,55.767939,37.621884,1.753987,0.314697
3,63.0,20.0,Парк Победы,98.0,59900000.0,611224.0,33,54,11.0,0,55.723257,37.52766,55.736934,37.516009,6.768081,1.689556
4,48.0,,Спортивная,67.3,37900000.0,563150.0,4,5,10.0,1,55.721731,37.572899,55.723247,37.564105,4.661193,0.577809


In [50]:
X = df.drop(columns=['Цена', 'Цена за квадрат', 'Метро'], axis=1)
y = df['Цена за квадрат']

In [51]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33, random_state=42)

In [52]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [53]:
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [54]:
from sklearn import linear_model

In [55]:
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [56]:
from sklearn import metrics

In [57]:
metrics.mean_absolute_error(y_train, lr.predict(X_train))

77460.09340991118

In [58]:
metrics.mean_absolute_error(y_test, lr.predict(X_test))

86241.80030620037

In [59]:
scale = preprocessing.StandardScaler()
scale.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [60]:
X_train_norm = scale.transform(X_train)
X_test_norm = scale.transform(X_test)

In [61]:
scale_target = preprocessing.StandardScaler()
scale_target.fit(y_train.values.reshape(-1, 1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [62]:
y_train_norm = scale_target.transform(y_train.values.reshape(-1, 1))
y_test_norm = scale_target.transform(y_test.values.reshape(-1, 1))

In [76]:
lr1 = linear_model.LinearRegression()
lr1.fit(X_train_norm, y_train_norm)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [77]:
metrics.mean_absolute_error(y_test_norm, lr1.predict(X_test_norm))

0.473861810105019

In [78]:
lr_r = linear_model.Ridge(alpha=10.0)
lr_r.fit(X_train_norm, y_train_norm)

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [79]:
metrics.mean_absolute_error(y_train_norm, lr_r.predict(X_train_norm))

0.4248475350432782

In [80]:
metrics.mean_absolute_error(y_test_norm, lr_r.predict(X_test_norm))

0.4727736476806376

In [81]:
import xgboost

In [82]:
params = {
    "n_estimators": 1000,
    "max_depth": 10,
    "min_samples_split": 2,
    "learning_rate": 0.05,
}
xgb = xgboost.XGBRegressor(**params, silent=True)
xgb.fit(X_train_norm, y_train_norm)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, min_samples_split=2, missing=nan,
       monotone_constraints=None, n_estimators=1000, n_jobs=0,
       num_parallel_tree=1, objective='reg:squarederror', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True,
       subsample=1, tree_method=None, validate_parameters=False,
       verbosity=None)

In [83]:
metrics.mean_absolute_error(y_test_norm, xgb.predict(X_test_norm))

0.2117866185891248

In [86]:
poly = preprocessing.PolynomialFeatures(2, interaction_only=True)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)

In [84]:
xgb1 = xgboost.XGBRegressor(**params, silent=True)
xgb1.fit(poly.fit_transform(X_train_norm), y_train_norm)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, min_samples_split=2, missing=nan,
       monotone_constraints=None, n_estimators=1000, n_jobs=0,
       num_parallel_tree=1, objective='reg:squarederror', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True,
       subsample=1, tree_method=None, validate_parameters=False,
       verbosity=None)

In [85]:
metrics.mean_absolute_error(y_test_norm, xgb1.predict(poly.fit_transform(X_test_norm)))

0.20611027591172565