In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

### Подготовка данных для обучения

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


Столбцы, содержащие информацию о этажах и комнатах могут содержать только целые значения.

In [4]:
data['Rooms'] = data.Rooms.astype('int64')
data['HouseFloor'] = data.HouseFloor.astype('int64')

Бинарные категории, содержащиеся в данных столбцах можно легко преобразовать в 1 и 0.

In [5]:
data[['Ecology_2', 'Ecology_3', 'Shops_2']].nunique()

Ecology_2    2
Ecology_3    2
Shops_2      2
dtype: int64

In [6]:
data = data.replace({'B' : 1, 'A' : 0})

In [7]:
data_corr = data.corr()['Price'].drop(index = ['Price', 'Id'])
data_corr

DistrictId       0.265100
Rooms            0.550291
Square           0.520075
LifeSquare       0.081292
KitchenSquare    0.028864
Floor            0.128715
HouseFloor       0.088280
HouseYear        0.004305
Ecology_1       -0.058381
Ecology_2        0.022379
Ecology_3       -0.052585
Social_1         0.263286
Social_2         0.239226
Social_3         0.074878
Healthcare_1     0.138755
Helthcare_2      0.253090
Shops_1          0.180876
Shops_2          0.070388
Name: Price, dtype: float64

Очевидно, что квартира не может находиться на этаже выше, чем максимальный этаж в доме. Возможности достоверно узнать какое из полей заполнено с ошибкой нет, поэтому предположим, что информация о этаже, на котором находится квартира более значима для потенциального покупателя, чем информация о том, сколько всего этажей в доме. Из этого можно сделать вывод, что информция в поле Floor более достоверна, чем информация в поле HouseFloor.

In [8]:
data.loc[data['HouseFloor'] < data['Floor'], 'HouseFloor'] = data['Floor']

In [9]:
data = data.loc[data['Rooms'].between(1, 5)]

У некоторых квартир поле HouseYear содержит очевидно ошибочные данные. 2020 год не отсеиваем, т.к. предполагаем, что эти дома еще строятся.

In [10]:
data.loc[data['HouseYear'] > 2020]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1,37.26507,20.239714,9.0,9,12,20052011,0.13633,1,1,30,6141,10,262.0,3,6,1,254084.534396
4189,11607,147,2,44.791836,28.360393,5.0,4,9,4968,0.319809,1,1,25,4756,16,2857.0,5,8,1,243028.603096


Похоже, что здесь были внесены года строительства дома.

In [11]:
data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2011

Просто ошибка. Заменим на значение медианы по району.

In [12]:
data.loc[data['HouseYear'] == 4968, 'HouseYear'] = data.loc[data['DistrictId'] == 147, 'HouseYear'].median()

Самое высокое здание в Европе находится в Москве – в нём 95 этажей, а в среднем в самых высоких зданиях порядка 50 этажей. Даже если какие-то квартиры из них попали в наши данные, то цена их явно не соответствует действительности, а значит они не подходят для модели.

In [13]:
data = data.loc[data['HouseFloor'] < 40]

In [14]:
data = data.loc[data['Square'] < 300]

Здесь на реальные данные похожа только квартира с Id 5548. Её и оставим.

In [15]:
data.loc[(data['LifeSquare'] > 149) & (data['Square'] > data['LifeSquare'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1982,5548,86,5,275.645284,233.949309,26.0,12,37,2011.0,0.161976,1,0,31,7010,5,4508.0,3,7,1,455264.882666
2603,5621,23,3,163.495333,161.504222,12.0,5,5,1977.0,0.014073,1,1,2,475,0,,0,0,1,207007.956663
4047,15362,23,1,163.286965,161.155275,1.0,4,4,1977.0,0.014073,1,1,2,475,0,,0,0,1,228572.520347
5444,14181,62,3,156.636577,153.685835,1.0,1,3,2005.0,0.072158,1,1,2,629,1,,0,0,0,502523.573863


In [16]:
data = data[~data.Id.isin([5621, 15362, 14181])]

Площадь квартиры не может быть меньше определённого значения.

In [17]:
data = data.loc[data['Square'] > 20]

In [18]:
data.loc[data['LifeSquare'] > 200]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
590,14990,23,2,48.449873,263.54202,5.0,6,6,1972.0,0.075779,1,1,6,1437,3,,0,2,1,141780.231857
1982,5548,86,5,275.645284,233.949309,26.0,12,37,2011.0,0.161976,1,0,31,7010,5,4508.0,3,7,1,455264.882666
4328,16550,27,3,81.694417,7480.592129,1.0,9,17,2016.0,0.017647,1,1,2,469,0,,0,0,1,217357.492366
8437,15886,85,3,78.059331,461.463614,10.0,12,16,1998.0,0.037178,1,1,52,11217,1,2300.0,1,7,1,394253.299978
9910,16568,27,4,200.334539,201.627361,25.0,1,2,2013.0,0.041116,1,1,53,14892,4,,1,4,1,528560.506016


In [19]:
data.loc[data.Id.isin([14990, 15886]), 'LifeSquare'] = data['LifeSquare'] / 10

In [20]:
data.loc[data['Id'] == 16550, 'LifeSquare'] = np.NaN

In [21]:
data.loc[data['LifeSquare'] < 10, 'LifeSquare'] = np.NaN

In [22]:
data.loc[data['Square'] < data['LifeSquare'], 'Square'] = data['LifeSquare']

In [23]:
data.loc[((data['Square'] - data['LifeSquare']) < 5), 'LifeSquare'] = np.NaN

In [24]:
data.loc[((data['Square'] - data['LifeSquare']) < 7) & (data['Square'] > 70), 'LifeSquare'] = np.NaN 

Проверим количество комнат и общую площадь на соответствие.

In [25]:
data.loc[data['Rooms'] == 1, 'Square'].mean()

41.36626728497954

In [26]:
data.loc[(data['Rooms'] == 5) & (data['Square'] < 50), 'Rooms'] = 1

In [27]:
data.loc[(data['Rooms'] == 4) & (data['Square'] < 60), 'Rooms'] = 1

In [28]:
data.loc[(data['Rooms'] == 3) & (data['Square'] < 40), 'Rooms'] = 1

Среднее соотношение общей и жилой площадей.

In [29]:
l_coef = (data.loc[data['LifeSquare'].isnull() == False, 'LifeSquare'] / data.loc[data['LifeSquare'].isnull() == False, 'Square']).mean()

In [30]:
l_coef # данный коэффициент пригодтся и для очистки данных для предсказания

0.6029745485310266

In [31]:
data.loc[data['LifeSquare'].isnull() == True, 'LifeSquare'] = data['Square'] * l_coef

Для заполнения пустых полей признака Healthcare_1 обучим модель.

In [32]:
feats = list(data.columns.drop(['Id', 'DistrictId', 'KitchenSquare', 'Healthcare_1', 'Price']))
feats

['Rooms',
 'Square',
 'LifeSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Shops_2']

In [33]:
target = 'Healthcare_1'

In [34]:
X = data.loc[data['Healthcare_1'].isnull() == False, feats]

In [35]:
y = data.loc[data['Healthcare_1'].isnull() == False, target]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 50)

In [37]:
h_model = RandomForestRegressor(max_depth = 12, random_state = 4, n_estimators = 250, max_features = 7)

In [38]:
h_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=7, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=None, oob_score=False, random_state=4,
           verbose=0, warm_start=False)

In [39]:
y_pred = h_model.predict(X_test)

In [40]:
r2_score(y_test, y_pred)

0.9990181094380434

Подставим данные в пустые поля.

In [41]:
h_pred = h_model.predict(data.loc[data['Healthcare_1'].isnull() == True, feats])

In [42]:
data.loc[data['Healthcare_1'].isnull() == True, target] = h_pred

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9947 entries, 0 to 9999
Data columns (total 20 columns):
Id               9947 non-null int64
DistrictId       9947 non-null int64
Rooms            9947 non-null int64
Square           9947 non-null float64
LifeSquare       9947 non-null float64
KitchenSquare    9947 non-null float64
Floor            9947 non-null int64
HouseFloor       9947 non-null int64
HouseYear        9947 non-null float64
Ecology_1        9947 non-null float64
Ecology_2        9947 non-null int64
Ecology_3        9947 non-null int64
Social_1         9947 non-null int64
Social_2         9947 non-null int64
Social_3         9947 non-null int64
Healthcare_1     9947 non-null float64
Helthcare_2      9947 non-null int64
Shops_1          9947 non-null int64
Shops_2          9947 non-null int64
Price            9947 non-null float64
dtypes: float64(7), int64(13)
memory usage: 1.9 MB


In [44]:
data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0,9947.0
mean,8382.591937,50.465065,1.887906,56.258965,34.053425,6.27737,8.513421,13.381723,1984.843873,0.118817,0.990349,0.972957,24.690661,5353.569418,8.002011,1070.956172,1.315673,4.227908,0.917362,213936.531059
std,4858.664752,43.631393,0.810152,18.897072,12.385314,28.632464,5.201809,5.786065,18.404958,0.119143,0.09777,0.162218,17.529135,4007.170107,23.740126,805.488681,1.490933,4.797659,0.275348,92411.303196
min,0.0,0.0,1.0,20.213128,10.523868,0.0,1.0,1.0,1910.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,59174.778028
25%,4167.5,20.0,1.0,41.883265,24.171015,1.0,4.0,9.0,1974.0,0.017647,1.0,1.0,6.0,1564.0,0.0,540.0,0.0,1.0,1.0,153921.3453
50%,8394.0,36.0,2.0,52.599854,32.14984,6.0,7.0,14.0,1977.0,0.075424,1.0,1.0,25.0,5285.0,2.0,830.0,1.0,3.0,1.0,192198.448321
75%,12589.5,75.0,2.0,65.970123,41.336125,9.0,12.0,17.0,2001.0,0.195781,1.0,1.0,36.0,7227.0,5.0,1340.0,2.0,6.0,1.0,249056.780509
max,16798.0,209.0,5.0,275.645284,233.949309,2014.0,37.0,39.0,2020.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0,633233.46657


#### Работа с DistrictId

In [45]:
mp = pd.DataFrame(data.groupby(['DistrictId', 'Rooms'])['Price'].mean()).rename(columns = {'Price': 'mean_price'})

In [46]:
data = pd.merge(data, mp, on = ['DistrictId', 'Rooms'], how = 'left')

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9947 entries, 0 to 9946
Data columns (total 21 columns):
Id               9947 non-null int64
DistrictId       9947 non-null int64
Rooms            9947 non-null int64
Square           9947 non-null float64
LifeSquare       9947 non-null float64
KitchenSquare    9947 non-null float64
Floor            9947 non-null int64
HouseFloor       9947 non-null int64
HouseYear        9947 non-null float64
Ecology_1        9947 non-null float64
Ecology_2        9947 non-null int64
Ecology_3        9947 non-null int64
Social_1         9947 non-null int64
Social_2         9947 non-null int64
Social_3         9947 non-null int64
Healthcare_1     9947 non-null float64
Helthcare_2      9947 non-null int64
Shops_1          9947 non-null int64
Shops_2          9947 non-null int64
Price            9947 non-null float64
mean_price       9947 non-null float64
dtypes: float64(8), int64(13)
memory usage: 1.7 MB


#### Построим модель на полученных данных

KitchenSquare – слишком много недостоверных данных

Id – произвольное число, не имеющее отношения к цене квартиры

DistrictId – категориальный признак, содержащий слишком большое количество категорий

In [48]:
feats = list(data.columns.drop(['Id', 'DistrictId', 'Price', 'KitchenSquare']))
feats

['Rooms',
 'Square',
 'LifeSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Healthcare_1',
 'Helthcare_2',
 'Shops_1',
 'Shops_2',
 'mean_price']

In [49]:
target = 'Price'

In [50]:
X = data[feats]

In [51]:
y = data[target]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 50)

In [53]:
rf = RandomForestRegressor(max_depth = 14, random_state = 100, n_estimators = 250, max_features = 11)

In [54]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features=11, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=None, oob_score=False,
           random_state=100, verbose=0, warm_start=False)

In [55]:
y_pred = rf.predict(X_test)

In [56]:
r2_score(y_test, y_pred)

0.780438346016114

### Подготовка данных для предсказания

In [57]:
data = pd.read_csv('test.csv')

In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [59]:
data['Rooms'] = data.Rooms.astype('int64')
data['HouseFloor'] = data.HouseFloor.astype('int64')
data['HouseYear'] = data.HouseYear.astype('int64')

In [60]:
data = data.replace({'B' : 1, 'A' : 0})

In [61]:
data.loc[data['HouseFloor'] < data['Floor'], 'HouseFloor'] = data['Floor']

In [62]:
data.loc[data['HouseFloor'] > 40]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
651,15864,27,3,47.722835,47.098813,9.0,18,99,1977,0.072158,1,1,2,629,1,,0,0,0
1587,5707,76,2,73.363523,38.74006,5.0,38,48,2010,0.0,1,1,7,1660,39,1786.0,1,5,1
3711,414,76,2,67.609808,41.175948,5.0,46,48,2002,0.0,1,1,7,1660,39,1786.0,1,5,1
3909,1397,76,3,221.138768,118.055342,4.0,32,48,2011,0.0,1,1,7,1660,39,1786.0,1,5,1
4698,15759,17,2,57.60187,37.744743,10.0,78,78,1989,0.0,1,1,25,5027,4,46.0,1,1,1


Похоже, просто опечатка. Должно быть 18, а не 78.

In [63]:
data.loc[data['DistrictId'] == 17, 'HouseFloor'].unique()

array([10, 17,  8,  9,  5, 22, 24,  6,  4, 15, 19, 27, 12, 16, 18, 78],
      dtype=int64)

In [64]:
data.loc[data['Id'] == 15759, 'Floor'] = 18
data.loc[data['Id'] == 15759, 'HouseFloor'] = 18

In [65]:
data.loc[data['Id'] == 15864, 'HouseFloor'] = data['Floor']

In [66]:
data.loc[data['Rooms'] > 5]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2071,10793,23,6,110.750226,,0.0,2,2,2015,0.014073,1,1,2,475,0,,0,0,1
3217,4058,27,6,223.453689,104.113552,16.0,2,2,2017,0.041116,1,1,53,14892,4,,1,4,1
3398,1435,111,17,52.866107,32.528342,8.0,15,17,1987,0.093443,1,1,23,4635,5,3300.0,2,4,1


In [67]:
data.loc[data['Rooms'] == 2, 'Square'].mean()

56.689144099392124

In [68]:
data.loc[data['Id'] == 1435, 'Rooms'] = 2

In [69]:
data.loc[data['Rooms'] == 0]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2406,3343,58,0,116.824201,113.692424,0.0,3,3,1977,0.437885,1,1,23,5735,3,1084.0,0,5,1
2524,10729,27,0,76.345154,42.820796,12.0,14,14,1977,0.017647,1,1,2,469,0,,0,0,1


In [70]:
data.loc[data['Rooms'] == 5, 'Square'].mean()

115.7889707451908

In [71]:
data.loc[data['Rooms'] == 3, 'Square'].mean()

77.20130449946679

In [72]:
data.loc[data['Id'] == 3343, 'Rooms'] = 5
data.loc[data['Id'] == 10729, 'Rooms'] = 3

In [73]:
data.loc[data['Square'] < 20]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2,5480,190,1,13.597819,15.948246,12.0,2,5,1909,0.0,1,1,30,7538,87,4702.0,5,5,1
66,9011,53,1,1.378543,1.353573,1.0,1,1,1977,0.049637,1,1,34,7759,0,229.0,1,3,1
557,3458,101,1,19.043894,13.941145,3.0,4,5,1970,0.225825,0,1,41,6558,7,1548.0,3,2,1
608,16401,30,1,2.645046,4.338755,1.0,2,2,1977,7.8e-05,1,1,22,6398,141,1046.0,3,23,1
837,2138,27,1,5.647458,1.501582,1.0,1,1,1977,0.017647,1,1,2,469,0,,0,0,1
1165,10120,6,1,5.100672,3.86178,1.0,3,3,1977,0.243205,1,1,5,1564,0,540.0,0,0,1
1195,3969,101,1,18.149267,9.698766,3.0,4,5,1967,0.225825,0,1,41,6558,7,1548.0,3,2,1
1441,9884,29,1,17.273699,16.452172,2.0,3,5,1962,0.06966,1,1,31,6119,4,,1,2,1
2744,7533,66,1,16.319015,11.118981,1.0,4,5,1965,0.111627,1,1,50,12238,8,1970.0,2,3,1
2895,6894,84,1,19.166821,14.865172,0.0,6,6,1965,0.149666,1,1,22,4789,2,4087.0,4,1,1


In [74]:
data.loc[data['Id'] == 170, 'Square'] = data['LifeSquare']

In [75]:
data.loc[data['Square'] < 20, 'Square'] = data.loc[data['Rooms'] == 1, 'Square'].median()

In [76]:
data.loc[data['LifeSquare'] > 200]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
410,11533,94,2,48.713443,303.071094,6.0,5,12,1974,0.521867,1,1,25,6149,0,,0,0,1


In [77]:
data.loc[data['LifeSquare'] > 200, 'LifeSquare'] = data['LifeSquare'] / 10

In [78]:
data.loc[data['KitchenSquare'] > 200, 'KitchenSquare'] = data['KitchenSquare'] / 100

In [79]:
data.loc[data['LifeSquare'] < 10, 'LifeSquare'] = np.NaN

In [80]:
data.loc[data['Square'] < data['LifeSquare'], 'Square'] = data['LifeSquare']

In [81]:
data.loc[((data['Square'] - data['LifeSquare']) < 5), 'LifeSquare'] = np.NaN

In [82]:
data.loc[((data['Square'] - data['LifeSquare']) < 7) & (data['Square'] > 60), 'LifeSquare'] = np.NaN

In [83]:
data.loc[(data['Rooms'] == 4), 'Square'].median()

95.14424552779224

In [84]:
data.loc[(data['Rooms'] == 2), 'Square'].median()

55.89831490068351

In [85]:
data.loc[(data['Rooms'] == 1), 'Square'].median()

40.28678712067378

In [86]:
data.loc[(data['Rooms'] == 4) & (data['Square'] < 60), 'Rooms'] = 1

In [87]:
data.loc[(data['Rooms'] == 5) & (data['Square'] < 70), 'Rooms'] = 2

In [88]:
data.loc[(data['Rooms'] == 3) & (data['Square'] < 40), 'Rooms'] = 1

Заполним пустые значения LifeSquare, используя коэффициент, полученный ранее.

In [89]:
data.loc[data['LifeSquare'].isnull() == True, 'LifeSquare'] = (data['Square'] * l_coef)

Используем модель h_model, чтобы заполнить пустые поля признака Healthcare_1.

In [90]:
feats = list(data.columns.drop(['Id', 'DistrictId', 'KitchenSquare', 'Healthcare_1']))

In [91]:
target = 'Healthcare_1'

In [92]:
h_pred = h_model.predict(data.loc[data['Healthcare_1'].isnull() == True, feats])

In [93]:
data.loc[data['Healthcare_1'].isnull() == True, target] = h_pred

In [94]:
data = pd.merge(data, mp, on = ['DistrictId', 'Rooms'], how = 'left')

In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null int64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null int64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null int64
Ecology_3        5000 non-null int64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     5000 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null int64
mean_price       4934 non-null float64
dtypes: float64(6), int64(14)
memory usage: 820.3 KB


In [96]:
for i in range(1, 6):
    data.loc[(data['mean_price'].isnull() == True)
             & (data['Rooms'] == i), 'mean_price'] = data.loc[(data['mean_price'].isnull() == False)
                                                              & (data['Rooms'] == i), 'mean_price'].mean()

In [97]:
data.loc[(data['mean_price'].isnull() == True)
         & (data['Rooms'] == 6), 'mean_price'] = data.loc[(data['mean_price'].isnull() == False)
                                                          & (data['Rooms'] == 5), 'mean_price'].mean()

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null int64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null int64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null int64
Ecology_3        5000 non-null int64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     5000 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null int64
mean_price       5000 non-null float64
dtypes: float64(6), int64(14)
memory usage: 820.3 KB


In [99]:
feats = list(data.columns.drop(['Id', 'DistrictId', 'KitchenSquare']))

In [100]:
feats

['Rooms',
 'Square',
 'LifeSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Healthcare_1',
 'Helthcare_2',
 'Shops_1',
 'Shops_2',
 'mean_price']

In [101]:
y_pred = rf.predict(data[feats])

In [102]:
data['Price'] = y_pred

In [103]:
data.loc[:, ['Id', 'Price']].to_csv('predictions.csv', index = None)