In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('flat_dataset_2020.csv')
df

Unnamed: 0,city,floorNumber,floorsTotal,totalArea,price,rooms
0,Краснодар,1,5.0,18.0,5600000,2
1,Краснодар,1,5.0,15.0,4650000,1
2,Краснодар,1,5.0,11.9,2990000,3
3,Краснодар,1,7.0,18.4,4390000,2
4,Краснодар,2,5.0,17.6,4890000,4
...,...,...,...,...,...,...
63940,Санкт-Петербург,1,4.0,152.0,9999000,11
63941,Москва,5,5.0,167.0,10000000,5
63942,Ярославль,1,4.0,235.0,13500000,8
63943,Ярославль,4,5.0,271.0,13750000,6


In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['city'] = le.fit_transform(df.city)

In [59]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(df.drop('price', axis = 1), df.price, test_size=0.15, random_state=42)

models = [LinearRegression(),LassoCV(),RidgeCV(), RandomForestRegressor(), GradientBoostingRegressor()]
for model in models:
    print(str(model), -np.mean(cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')))
    


LinearRegression() 8401985.363689497
LassoCV() 8463612.525459489
RidgeCV(alphas=array([ 0.1,  1. , 10. ])) 8401990.739911938
RandomForestRegressor() 4836007.9184295945
GradientBoostingRegressor() 5376228.537641208


In [63]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'n_estimators':[50,70,100,150,200],
    'max_depth':[3,6,9]
    
}
rsCV = RandomizedSearchCV(RandomForestRegressor(),params)
rsCV.fit(X_train, y_train)


RandomizedSearchCV(estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [3, 6, 9],
                                        'n_estimators': [50, 70, 100, 150,
                                                         200]})

In [65]:
rsCV.best_params_

{'n_estimators': 70, 'max_depth': 9}

In [67]:
from sklearn.metrics import mean_absolute_error

predict = rsCV.best_estimator_.predict(X_test)
mean_absolute_error(y_test, predict)

4926413.643524185

## Сохранение

In [69]:
from joblib import dump

dump(rsCV.best_estimator_, 'model.joblib') 

['model.joblib']

In [70]:
np.save('classes.npy', le.classes_)

In [85]:
df.floorsTotal.mean()

14.757588552662444

## Использование модели

In [98]:
from joblib import load
from sklearn.preprocessing import LabelEncoder

def get_predictions(city, floor, area, rooms):

    model = load('model.joblib') 

    encoder = LabelEncoder()
    encoder.classes_ = np.load('classes.npy', allow_pickle=True)
    
    df = pd.DataFrame([[city, floor, area, rooms]], columns=['city', 'floorNumber', 'totalArea', 'rooms'])
    df['floorsTotal'] = 14.75
    df.city = encoder.transform(df.city)
    
    return round(model.predict(df)[0])



In [99]:
get_predictions('Москва',2,100,13)

3301470