### Курсовая работа 

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

In [2]:
pd.options.display.max_columns = 25
pd.options.display.max_rows = 100

In [3]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

# Приведем все признаки к числовым

In [4]:
train['Ecology_2'].value_counts()

B    9903
A      97
Name: Ecology_2, dtype: int64

In [5]:
train['Ecology_3'].value_counts()

B    9725
A     275
Name: Ecology_3, dtype: int64

In [6]:
train['Shops_2'].value_counts()

B    9175
A     825
Name: Shops_2, dtype: int64

In [7]:
train['Ecology_2'] = pd.get_dummies(train['Ecology_2'], drop_first=True)

In [8]:
train['Ecology_3'] = pd.get_dummies(train['Ecology_3'], drop_first=True)

In [9]:
train['Shops_2'] = pd.get_dummies(train['Shops_2'], drop_first=True)

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  uint8  
 11  Ecology_3      10000 non-null  uint8  
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

# Обработаем выбросы

In [11]:
train.loc[(train.Rooms > 5) & (train.Square < 100)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,1,1,74,19083,2,,5,15,1,317265.323792
1454,8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,1,1,1,264,0,,0,1,1,78364.616704
2170,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,1,1,66,10573,1,1322.0,3,8,1,229661.964416
8849,14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,1,1,25,5648,1,30.0,2,4,1,172329.270863


In [12]:
train.loc[(train.Rooms > 8) & (train.Square < 100), 'Rooms'] = train['Rooms'].mode()[0]

In [13]:
train.loc[(train.Rooms == 0), ['DistrictId', 'Square', 'LifeSquare', 'Floor', 'HouseFloor', 'KitchenSquare', 'HouseYear', 'Price']]

Unnamed: 0,DistrictId,Square,LifeSquare,Floor,HouseFloor,KitchenSquare,HouseYear,Price
1397,27,138.427694,136.215499,4,3.0,0.0,2016,268394.744389
1981,27,212.932361,211.231125,2,3.0,0.0,2008,302211.260887
2269,27,41.790881,,13,0.0,0.0,1977,98129.976788
3911,28,49.483501,,16,0.0,0.0,2015,217009.338463
4366,6,81.491446,,4,0.0,0.0,1977,212864.799112
4853,27,2.377248,0.873147,1,0.0,0.0,1977,126596.941798
6149,88,38.697117,19.345131,9,16.0,9.0,1982,158998.110646
8834,27,87.762616,85.125471,5,15.0,0.0,1977,219281.918007


In [14]:
train.loc[(train.Rooms == 0) & (train.KitchenSquare > 0), 'Rooms'] = train['Rooms'].median()

In [15]:
train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8874,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,0.9903,0.9725,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,0.9175,214138.857399
std,4859.01902,43.587592,0.813626,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,0.098015,0.163543,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,0.275139,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,1.0,1.0,6.0,1564.0,0.0,350.0,0.0,1.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,1.0,1.0,36.0,7227.0,5.0,1548.0,2.0,6.0,1.0,249135.462171
max,16798.0,209.0,6.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0,633233.46657


In [16]:
train.loc[(train.Square > 150) & (train.Square > train.LifeSquare * 2)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
5087,15947,129,4.0,185.906396,84.830074,31.0,5,3.0,2015,0.020741,1,1,24,5613,5,1340.0,2,5,1,340273.238253
6451,3914,24,4.0,155.930023,76.831293,15.0,6,22.0,2012,0.111627,1,1,50,12238,8,1970.0,2,3,1,559886.965348
7201,3233,129,4.0,186.692602,87.48708,32.0,3,3.0,2011,0.020741,1,1,24,5613,5,1340.0,2,5,1,423443.464367


In [17]:
train.loc[(train.Square < 5) & (train.LifeSquare > 6)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
3280,10527,27,1.0,4.380726,40.805837,1.0,10,17.0,2013,0.211401,1,1,9,1892,0,,0,1,1,97560.720383
8030,13265,1,3.0,4.823679,79.767964,0.0,6,17.0,1977,0.007122,1,1,1,264,0,,0,1,1,237716.681261


In [18]:
train.loc[(train.Square < 5) & (train.LifeSquare > 70), 'Square'] = 124

In [19]:
train.loc[(train.Square < 5) & (train.LifeSquare > 6), 'Square'] = 43.80726



In [20]:
train.loc[(train.Square * 1.2 < train.LifeSquare) & (train.LifeSquare > 150)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
590,14990,23,2.0,48.449873,263.54202,5.0,6,5.0,1972,0.075779,1,1,6,1437,3,,0,2,1,141780.231857
4328,16550,27,3.0,81.694417,7480.592129,1.0,9,17.0,2016,0.017647,1,1,2,469,0,,0,0,1,217357.492366
6332,8961,27,1.0,33.398983,164.15336,6.0,3,5.0,1965,0.211401,1,1,9,1892,0,,0,1,1,104891.073757
8437,15886,85,3.0,78.059331,461.463614,10.0,12,16.0,1998,0.037178,1,1,52,11217,1,2300.0,1,7,1,394253.299978


In [21]:
train.loc[(train.Square * 1.2 < train.LifeSquare) & (train.LifeSquare > 150), 'LifeSquare'] = train.loc[(train.Square * 1.2 < train.LifeSquare) & (train.LifeSquare > 150), 'LifeSquare'] / 10

In [22]:
train.loc[(train.Square * 1.2 < train.LifeSquare) & (train.LifeSquare > 150), 'LifeSquare'] = train.loc[(train.Square * 1.2 < train.LifeSquare) & (train.LifeSquare > 150), 'LifeSquare'] / 10

In [23]:
train.loc[(train.LifeSquare < 8) & (train.KitchenSquare > 1)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1800,13113,27,1.0,37.805231,2.228592,11.0,6,0.0,2015,0.017647,1,1,2,469,0,,0,0,1,137402.092529
4588,448,48,1.0,41.186904,1.626502,41.0,2,1.0,1977,0.041125,1,1,46,9515,5,,1,10,1,216882.265408
6036,12666,34,2.0,60.603363,2.400832,58.0,14,22.0,1977,0.069753,1,1,53,13670,4,,1,11,1,261733.472106
9602,5112,6,2.0,63.59174,0.795539,10.0,17,17.0,2014,0.243205,1,1,5,1564,0,540.0,0,0,1,174741.998061


In [24]:
for el in [448, 12666]:
    x1 = train.loc[(train.Id == el), 'LifeSquare']
    x2 = train.loc[(train.Id == el), 'KitchenSquare']
    train.loc[(train.Id == el), 'LifeSquare'] = x2
    train.loc[(train.Id == el), 'KitchenSquare'] = x1
    print(x1, x2)

4588    1.626502
Name: LifeSquare, dtype: float64 4588    41.0
Name: KitchenSquare, dtype: float64
6036    2.400832
Name: LifeSquare, dtype: float64 6036    58.0
Name: KitchenSquare, dtype: float64


In [25]:
x1 = train.loc[(train.Id == 5112), 'LifeSquare'] * 100
x2 = train.loc[(train.Id == 5112), 'Square']
train.loc[(train.Id == 5112), 'LifeSquare'] = x2
train.loc[(train.Id == 5112), 'Square'] = x1
print(train.loc[(train.Id == 5112)])

        Id  DistrictId  Rooms     Square  LifeSquare  KitchenSquare  Floor  \
9602  5112           6    2.0  79.553947    63.59174           10.0     17   

      HouseFloor  HouseYear  Ecology_1  Ecology_2  Ecology_3  Social_1  \
9602        17.0       2014   0.243205          1          1         5   

      Social_2  Social_3  Healthcare_1  Helthcare_2  Shops_1  Shops_2  \
9602      1564         0         540.0            0        0        1   

              Price  
9602  174741.998061  


In [26]:
train.loc[(train.Id == 13113), 'LifeSquare'] = train.loc[(train.Id == 13113), 'LifeSquare'] * 10

In [27]:
train.loc[train.Square < train.LifeSquare] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
28,8054,23,1.0,42.530043,43.967759,1.0,3,9.0,2014,0.034656,1,1,0,168,0,,0,0,1,95338.198549
44,10521,38,3.0,104.211396,106.340403,0.0,20,0.0,2017,0.060753,1,1,15,2787,2,520.0,0,7,1,435462.048070
52,2301,1,2.0,61.400054,65.224603,0.0,17,22.0,2016,0.007122,1,1,1,264,0,,0,1,1,199215.452229
123,8753,25,3.0,85.952306,89.803753,1.0,4,3.0,2017,0.069753,1,1,53,13670,4,,1,11,1,309688.592681
153,9870,62,1.0,51.831473,53.491301,1.0,5,1.0,2015,0.072158,1,1,2,629,1,,0,0,0,131797.472284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9884,41,30,2.0,59.015896,59.439082,12.0,5,5.0,2016,0.000078,1,1,22,6398,141,1046.0,3,23,1,126281.142781
9889,12918,23,2.0,51.440463,53.134243,51.0,3,17.0,2017,0.005767,1,1,1,388,0,,0,0,1,88150.012510
9895,2737,27,3.0,123.430072,125.806981,123.0,5,10.0,2015,0.017647,1,1,2,469,0,,0,0,1,234194.837047
9902,14001,73,1.0,44.098768,44.267551,1.0,7,24.0,2014,0.042032,1,1,37,6856,84,1940.0,2,5,1,381937.404161


In [28]:
train.loc[train.Square * 1.2 <  train.LifeSquare] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
212,1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,1,1,43,8429,3,,3,9,1,412511.088764
1608,10202,6,1.0,2.596351,4.604943,1.0,3,25.0,2014,0.243205,1,1,5,1564,0,540.0,0,0,1,137597.601458
4900,4504,27,3.0,4.390331,5.610772,1.0,8,19.0,2016,0.211401,1,1,9,1892,0,,0,1,1,161379.067034
6392,14786,1,1.0,1.136859,4.525736,1.0,3,1.0,1977,0.007122,1,1,1,264,0,,0,1,1,181434.825589
8283,15744,34,1.0,1.988943,2.642219,1.0,21,4.0,1977,0.069753,1,1,53,13670,4,,1,11,1,458378.777006
9294,6782,45,1.0,2.954309,5.257278,1.0,3,1.0,1977,0.195781,1,1,23,5212,6,,3,2,1,438005.182323


In [29]:
def change_cols(place, col_1, col_2):
    for el in place:
        x1 = train.loc[(train.Id == el), col_1]
        x2 = train.loc[(train.Id == el), col_2]
        train.loc[(train.Id == el), col_1] = x2
        train.loc[(train.Id == el), col_2] = x1

In [30]:
train.loc[(train.Id == 1748), 'Square'] = train.loc[(train.Id == 1748), 'Square'] * 10

In [31]:
change_cols(train.loc[train.Square < train.LifeSquare, 'Id'].values, 'Square', 'LifeSquare')

In [32]:
train.loc[train.KitchenSquare > 200, 'KitchenSquare'] = train['KitchenSquare'].median()

In [33]:
train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8874,56.416989,36.081818,5.866603,8.5267,12.6094,3990.166,0.118858,0.9903,0.9725,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,0.9175,214138.857399
std,4859.01902,43.587592,0.813626,21.065636,19.353397,5.135976,5.241148,6.775974,200500.3,0.119025,0.098015,0.163543,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,0.275139,92872.293865
min,0.0,0.0,0.0,2.377248,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.866858,22.783354,1.0,4.0,9.0,1974.0,0.017647,1.0,1.0,6.0,1564.0,0.0,350.0,0.0,1.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.61961,32.78126,6.0,7.0,13.0,1977.0,0.075424,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0,192269.644879
75%,12592.5,75.0,2.0,66.002322,45.048935,9.0,12.0,17.0,2001.0,0.195781,1.0,1.0,36.0,7227.0,5.0,1548.0,2.0,6.0,1.0,249135.462171
max,16798.0,209.0,6.0,641.065193,638.163193,123.0,42.0,117.0,20052010.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0,633233.46657


In [34]:
train.loc[(train.Floor > train.HouseFloor) & (train.HouseFloor == 0)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
7,11993,74,2.0,80.312926,,0.0,14,0.0,1977,0.075779,1,1,6,1437,3,,0,2,1,221244.156664
23,6641,54,3.0,118.907612,,0.0,2,0.0,1977,0.006076,1,1,30,5285,0,645.0,6,6,1,571069.052600
26,4378,27,3.0,106.958871,0.641822,0.0,17,0.0,2018,0.072158,1,1,2,629,1,,0,0,0,337299.867936
39,9371,23,2.0,60.503248,,0.0,16,0.0,1977,0.034656,1,1,0,168,0,,0,0,1,229778.057902
44,10521,38,3.0,106.340403,104.211396,0.0,20,0.0,2017,0.060753,1,1,15,2787,2,520.0,0,7,1,435462.048070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9707,12302,30,1.0,48.307844,46.072913,0.0,4,0.0,1977,0.000078,1,1,22,6398,141,1046.0,3,23,1,161403.094034
9769,9384,59,3.0,98.607328,,0.0,14,0.0,1977,0.019509,1,1,37,7687,11,176.0,5,5,1,492700.257473
9878,11441,62,2.0,72.453232,71.985335,0.0,5,0.0,1977,0.072158,1,1,2,629,1,,0,0,0,158841.624543
9908,3171,62,1.0,35.873961,,0.0,25,0.0,1977,0.072158,1,1,2,629,1,,0,0,0,134379.130962


In [35]:
change_cols(train.loc[(train.Floor > train.HouseFloor) & (train.HouseFloor == 0), 'Id'].values, 'Floor', 'HouseFloor')

In [36]:
import random
wrong_floor = train.loc[train.Floor > train.HouseFloor].index
train.loc[wrong_floor, 'Floor'] = train.loc[wrong_floor, 'HouseFloor'].apply(lambda x: random.randint(1, x))

In [37]:
train.loc[train.HouseYear > 2020]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1.0,37.26507,20.239714,9.0,9.0,12.0,20052011,0.13633,1,1,30,6141,10,262.0,3,6,1,254084.534396
4189,11607,147,2.0,44.791836,28.360393,5.0,4.0,9.0,4968,0.319809,1,1,25,4756,16,2857.0,5,8,1,243028.603096


In [38]:
train.loc[train.HouseYear > 5020, 'HouseYear'] = 2005

In [39]:
train.loc[train.HouseYear > 2020, 'HouseYear'] = 1968

In [40]:
train.loc[train.KitchenSquare > 50]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
227,16395,2,3.0,79.722243,44.731219,72.0,12.0,16.0,1987,0.130618,1,1,39,10418,9,900.0,1,9,1,370148.625285
1369,2371,27,2.0,68.841073,64.234956,66.0,2.0,2.0,2014,0.017647,1,1,2,469,0,,0,0,1,189244.249909
1455,12507,54,2.0,79.810535,79.578961,78.0,10.0,15.0,2014,0.006076,1,1,30,5285,0,645.0,6,6,1,438708.707579
1860,4265,161,2.0,53.216778,32.644859,53.0,7.0,17.0,1994,0.000699,1,1,14,3369,24,4129.0,0,3,1,261125.669724
2916,12390,72,3.0,99.323558,97.490674,96.0,22.0,25.0,2019,0.210473,1,1,11,2398,2,1994.0,3,0,1,445074.956552
2969,7441,62,3.0,114.734473,112.589083,112.0,3.0,3.0,2015,0.072158,1,1,2,629,1,,0,0,0,315245.521059
4079,6508,23,2.0,67.146049,33.959154,63.0,5.0,17.0,2019,0.034656,1,1,0,168,0,,0,0,1,193130.585871
4110,299,27,2.0,66.787523,64.616662,60.0,14.0,20.0,2015,0.017647,1,1,2,469,0,,0,0,1,179466.094235
4651,12552,58,3.0,116.405693,113.109653,112.0,3.0,3.0,2016,0.437885,1,1,23,5735,3,1084.0,0,5,1,296165.936689
5149,13703,42,1.0,38.071692,19.723548,73.0,9.0,10.0,2006,0.158249,1,1,21,5731,0,,1,0,1,160488.033165


# Столбцы LifeSquare & Healthcare 1 имеют пропуски, заполним их

In [41]:
train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8874,56.416989,36.081818,5.866603,7.2065,12.8522,1984.8657,0.118858,0.9903,0.9725,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,0.9175,214138.857399
std,4859.01902,43.587592,0.813626,21.065636,19.353397,5.135976,5.147301,6.536821,18.411517,0.119025,0.098015,0.163543,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,0.275139,92872.293865
min,0.0,0.0,0.0,2.377248,0.370619,0.0,0.0,1.0,1910.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.866858,22.783354,1.0,3.0,9.0,1974.0,0.017647,1.0,1.0,6.0,1564.0,0.0,350.0,0.0,1.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.61961,32.78126,6.0,6.0,14.0,1977.0,0.075424,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0,192269.644879
75%,12592.5,75.0,2.0,66.002322,45.048935,9.0,10.0,17.0,2001.0,0.195781,1.0,1.0,36.0,7227.0,5.0,1548.0,2.0,6.0,1.0,249135.462171
max,16798.0,209.0,6.0,641.065193,638.163193,123.0,42.0,117.0,2020.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0,633233.46657


In [42]:
train.loc[(train.LifeSquare.isna()) & (train.HouseYear != 1977)].describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,256.0,256.0,256.0,256.0,0.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,107.0,256.0,256.0,256.0,256.0
mean,8467.980469,34.761719,1.773438,59.709943,,2.882812,8.257812,14.59375,2015.855469,0.084271,0.996094,1.0,16.957031,4120.582031,26.78125,901.757009,1.082031,6.371094,0.890625,201883.586926
std,4869.124747,28.411297,0.784305,19.729941,,3.760728,6.065326,7.173959,1.354227,0.106948,0.0625,0.0,16.054384,3844.367687,53.250564,625.0478,1.333251,8.407221,0.312721,100361.568994
min,32.0,0.0,0.0,27.245731,,0.0,0.0,1.0,2012.0,0.0,0.0,1.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,64927.358711
25%,4208.5,23.0,1.0,42.569331,,1.0,3.0,12.0,2015.0,0.007122,1.0,1.0,2.0,629.0,0.0,540.0,0.0,0.0,1.0,131167.219758
50%,8534.0,30.0,2.0,57.355835,,1.0,7.0,17.0,2016.0,0.041125,1.0,1.0,18.0,3594.0,1.0,1046.0,0.0,2.0,1.0,180626.175902
75%,12425.75,48.0,2.0,73.78111,,1.0,13.0,19.0,2017.0,0.09014,1.0,1.0,23.0,6398.0,6.0,1046.0,3.0,11.0,1.0,234140.209759
max,16753.0,169.0,3.0,136.727224,,15.0,25.0,27.0,2019.0,0.437885,1.0,1.0,53.0,13670.0,141.0,3855.0,6.0,23.0,1.0,633233.46657


Чуть больше 10 процентов обьектов "не имеющих" жилой площади были построны не в 1977 году.
Все дома "не имеющие" жилой площади не отличатся от других, судя по describe
Будем считать, что "отутсвие" жилой площади это ошибка в данных.
Заполним пропуски умножив на среднее между отношениями ср. арифметических и медианнан площади и жил. площади 

In [43]:
train.loc[~train['LifeSquare'].isna()].describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
count,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,7887.0,4457.0,7887.0,7887.0,7887.0,7887.0
mean,8417.37099,56.670217,1.916445,55.36259,36.081818,7.067583,7.134145,12.49702,1985.711804,0.122521,0.988462,0.965132,28.480284,6097.788006,7.272981,1211.382769,1.521491,4.531254,0.930519,223409.768022
std,4864.99023,45.348356,0.823964,20.435518,19.353397,4.962873,4.886203,6.30815,19.577849,0.120083,0.1068,0.183456,16.65166,3908.899268,19.682824,1068.252389,1.517006,4.430431,0.254287,94951.990144
min,0.0,0.0,0.0,2.377248,0.370619,0.0,0.0,1.0,1910.0,0.0,0.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,0.0,59174.778028
25%,4222.5,22.0,1.0,41.305545,22.783354,5.0,3.0,9.0,1971.0,0.025609,1.0,1.0,19.0,3681.0,1.0,325.0,0.0,1.0,1.0,162060.71931
50%,8419.0,46.0,2.0,50.95736,32.78126,7.0,6.0,12.0,1979.0,0.081943,1.0,1.0,28.0,5664.0,2.0,1015.0,1.0,4.0,1.0,200014.336149
75%,12655.0,88.0,3.0,64.88872,45.048935,9.0,10.0,17.0,2004.0,0.194489,1.0,1.0,38.0,7759.0,5.0,1894.0,3.0,6.0,1.0,262296.554087
max,16798.0,209.0,6.0,641.065193,638.163193,123.0,42.0,117.0,2020.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0,625678.644994


In [44]:
train['LifeSquare'].fillna(train['Square']*0.67348861, inplace=True)

In [45]:
train.loc[(train.Healthcare_1 == 0), 'DistrictId'].value_counts()

23    1
Name: DistrictId, dtype: int64

In [46]:
train.loc[(train.DistrictId == 23), 'DistrictId'].value_counts()

23    565
Name: DistrictId, dtype: int64

In [47]:
train.loc[(train.Healthcare_1 == 0), 'Healthcare_1'] = np.nan

In [48]:
healthcare1_mode = train[~train['Healthcare_1'].isna()]['Healthcare_1'].mode()[0]
for i in range(train['DistrictId'].max()):
    if train.loc[(train['DistrictId'] == i), 'Healthcare_1'].value_counts().empty:
        health_mode = healthcare1_mode
    else:
        health_mode = train.loc[((train.DistrictId == i)&(~train.DistrictId.isna())), 'Healthcare_1'].mode()[0]
    train.loc[((train['Healthcare_1'].isnull())&(train['DistrictId'] == i)), 'Healthcare_1'] = health_mode
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     10000 non-null  float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  float64
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  uint8  
 11  Ecology_3      10000 non-null  uint8  
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   10000 non-null  float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

In [49]:
X = train.iloc[:, 2:-1]
y = pd.Series(train['Price'])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          10000 non-null  float64
 1   Square         10000 non-null  float64
 2   LifeSquare     10000 non-null  float64
 3   KitchenSquare  10000 non-null  float64
 4   Floor          10000 non-null  float64
 5   HouseFloor     10000 non-null  float64
 6   HouseYear      10000 non-null  int64  
 7   Ecology_1      10000 non-null  float64
 8   Ecology_2      10000 non-null  uint8  
 9   Ecology_3      10000 non-null  uint8  
 10  Social_1       10000 non-null  int64  
 11  Social_2       10000 non-null  int64  
 12  Social_3       10000 non-null  int64  
 13  Healthcare_1   10000 non-null  float64
 14  Helthcare_2    10000 non-null  int64  
 15  Shops_1        10000 non-null  int64  
 16  Shops_2        10000 non-null  uint8  
dtypes: float64(8), int64(6), uint8(3)
memory usage: 1.1

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [52]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [53]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [54]:
parameters = [{'n_estimators': [480, 700],
'max_features': np.arange(3, 7),
'max_depth': np.arange(13, 17)}]

In [55]:
forest_best = RandomForestRegressor(max_depth=15, max_features=4, n_estimators=600, random_state=42)

In [56]:
forest_best.fit(X_train_scaled, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=15, max_features=4, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=600, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [57]:
forest_best.score(X_test_scaled, y_test)

0.7282384085985407

Пробовал ГриидСеарч для случ леса 
[{'n_estimators': [5, 700],
'max_features': np.arange(3, 20),
'max_depth': np.arange(7, 25)}] 
# непонятно почему то при задании пересекающихся диапозонов 
# бест парамс с разными значениями внутри этих диапзонов??????

In [58]:
from sklearn.ensemble import GradientBoostingRegressor

In [59]:
parameters_2 = [{'n_estimators': [500, 600],
'max_features': np.arange(1, 5),
'max_depth': np.arange(5, 12)}]

In [60]:
boost_best = GradientBoostingRegressor(max_depth=7, max_features=2, n_estimators=550, random_state=23)

In [61]:
boost_best.fit(X_train_scaled, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=2, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=550,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=23, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [62]:
boost_best.score(X_test_scaled, y_test)

0.733573280774759

In [63]:
test

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.000000,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.000170,B,B,36,5992,0,,1,1,B
4996,4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,B,B,1,264,0,,0,1,B
4997,5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,B,B,74,19083,2,,5,15,B
4998,4780,62,2.0,81.305222,,0.0,4,0.0,1977,0.072158,B,B,2,629,1,,0,0,A


In [64]:
test_id = test.iloc[:, 0]
test_id.head()

0      725
1    15856
2     5480
3    15664
4    14275
Name: Id, dtype: int64

In [65]:
test['Ecology_2'].value_counts()

B    4952
A      48
Name: Ecology_2, dtype: int64

In [66]:
test['Ecology_3'].value_counts()

B    4851
A     149
Name: Ecology_3, dtype: int64

In [67]:
test['Shops_2'].value_counts()

B    4588
A     412
Name: Shops_2, dtype: int64

In [68]:
test['Ecology_2'] = pd.get_dummies(test['Ecology_2'], drop_first=True)

In [69]:
test['Ecology_3'] = pd.get_dummies(test['Ecology_3'], drop_first=True)

In [70]:
test['Shops_2'] = pd.get_dummies(test['Shops_2'], drop_first=True)

In [71]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             5000 non-null   int64  
 1   DistrictId     5000 non-null   int64  
 2   Rooms          5000 non-null   float64
 3   Square         5000 non-null   float64
 4   LifeSquare     3959 non-null   float64
 5   KitchenSquare  5000 non-null   float64
 6   Floor          5000 non-null   int64  
 7   HouseFloor     5000 non-null   float64
 8   HouseYear      5000 non-null   int64  
 9   Ecology_1      5000 non-null   float64
 10  Ecology_2      5000 non-null   uint8  
 11  Ecology_3      5000 non-null   uint8  
 12  Social_1       5000 non-null   int64  
 13  Social_2       5000 non-null   int64  
 14  Social_3       5000 non-null   int64  
 15  Healthcare_1   2623 non-null   float64
 16  Helthcare_2    5000 non-null   int64  
 17  Shops_1        5000 non-null   int64  
 18  Shops_2 

# Обработаем выбросы

In [72]:
test.loc[(test.Rooms > 5) & (test.Square < 100)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,1,1,23,4635,5,3300.0,2,4,1


In [73]:
test.loc[(test.Rooms == 0) & (test.KitchenSquare > 0), 'Rooms'] = test['Rooms'].median()

In [74]:
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0,5000.0
mean,8412.5954,51.2792,1.9104,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,0.9904,0.9702,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428,0.9176
std,4832.674037,44.179466,0.83816,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,0.097518,0.170052,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365,0.275001
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,1.0,1.0,6.0,1564.0,0.0,325.0,0.0,1.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0
75%,12598.25,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,1.0,1.0,36.0,7287.0,5.0,1548.0,2.0,6.0,1.0
max,16795.0,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0


In [75]:
test.loc[(test.Square > 150) & (test.Square > test.LifeSquare * 2)] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
1264,13589,27,3.0,173.97448,76.289475,9.0,3,3.0,2017,0.041116,1,1,53,14892,4,,1,4,1
2039,3357,27,3.0,170.476326,75.973612,8.0,2,2.0,2017,0.041116,1,1,53,14892,4,,1,4,1
3217,4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,1,1,53,14892,4,,1,4,1


In [76]:
test.loc[(test.Square < 5) & (test.LifeSquare > 30), 'Square'] = test['Square'].mean()

In [77]:
test.loc[(test.Square * 1.2 < test.LifeSquare) & (test.LifeSquare > 150)] = test['LifeSquare'].mean()

In [78]:
test.loc[test.Square < test.LifeSquare] 

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2,5480.0,190.0,1.0,13.597819,15.948246,12.0,2.0,5.0,1909.0,0.000000,1.0,1.0,30.0,7538.0,87.0,4702.0,5.0,5.0,1.0
27,11160.0,58.0,1.0,48.610661,48.752502,1.0,4.0,3.0,1977.0,0.437885,1.0,1.0,23.0,5735.0,3.0,1084.0,0.0,5.0,1.0
39,6624.0,27.0,1.0,42.413793,42.434887,10.0,12.0,17.0,2017.0,0.011654,1.0,1.0,4.0,915.0,0.0,,0.0,0.0,1.0
43,11513.0,1.0,1.0,37.484057,40.593036,0.0,4.0,17.0,1977.0,0.007122,1.0,1.0,1.0,264.0,0.0,,0.0,1.0,1.0
70,1872.0,30.0,2.0,52.449057,52.798349,1.0,17.0,17.0,2016.0,0.000078,1.0,1.0,22.0,6398.0,141.0,1046.0,3.0,23.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4927,14686.0,1.0,3.0,77.229885,79.958685,1.0,18.0,17.0,1977.0,0.007122,1.0,1.0,1.0,264.0,0.0,,0.0,1.0,1.0
4951,3711.0,94.0,1.0,41.281057,42.392594,10.0,13.0,13.0,2014.0,0.282798,1.0,1.0,33.0,8667.0,2.0,,0.0,6.0,1.0
4962,5820.0,94.0,1.0,33.680382,34.198977,1.0,10.0,9.0,1972.0,0.127376,1.0,1.0,43.0,8429.0,3.0,,3.0,9.0,1.0
4969,4821.0,74.0,2.0,82.542507,82.585069,0.0,4.0,17.0,2015.0,0.309479,1.0,1.0,35.0,7715.0,4.0,990.0,0.0,6.0,1.0


In [79]:
def change_cols(place, col_1, col_2):
    for el in place:
        x1 = test.loc[(test.Id == el), col_1]
        x2 = test.loc[(test.Id == el), col_2]
        test.loc[(test.Id == el), col_1] = x2
        test.loc[(test.Id == el), col_2] = x1

In [80]:
change_cols(test.loc[test.Square < test.LifeSquare, 'Id'].values, 'Square', 'LifeSquare')

In [81]:
test.loc[test.KitchenSquare > 200, 'KitchenSquare'] = test['KitchenSquare'].median()

In [82]:
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2624.0,5000.0,5000.0,5000.0
mean,8410.296032,51.267632,1.917232,56.537841,35.990177,5.860032,8.638232,12.605832,1984.005032,0.127001,0.997432,0.977232,24.936032,5405.677432,8.269832,1146.234054,1.326632,4.250032,0.924632
std,4833.923979,44.17585,0.96804,19.093769,17.172991,4.873548,5.496789,6.797377,33.227471,0.52359,0.506827,0.525895,17.53292,4027.317345,23.866736,1044.770003,1.559688,4.798263,0.569221
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,36.15881,0.0,0.0,0.0,0.0,36.15881,0.0,0.0,0.0,0.0,0.0
25%,4220.75,21.0,1.0,41.97776,23.092026,1.0,4.0,9.0,1973.0,0.019509,1.0,1.0,6.0,1564.0,0.0,325.0,0.0,1.0,1.0
50%,8319.5,37.0,2.0,52.93055,32.892831,6.0,7.0,12.0,1977.0,0.072158,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0
75%,12598.25,77.0,2.0,66.392939,45.07042,9.0,12.0,17.0,2000.0,0.195781,1.0,1.0,36.0,7287.0,5.0,1548.0,2.0,6.0,1.0
max,16795.0,212.0,36.15881,223.453689,168.729035,112.0,78.0,99.0,2020.0,36.15881,36.15881,36.15881,74.0,19083.0,141.0,4849.0,36.15881,36.15881,36.15881


In [83]:
test.loc[(test.Floor > test.HouseFloor) & (test.HouseFloor == 0)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
22,12694.0,27.0,3.0,83.670032,46.358356,0.0,8.0,0.0,2015.0,0.072158,1.0,1.0,2.0,629.0,1.0,,0.0,0.0,0.0
24,8968.0,27.0,2.0,69.849239,,0.0,3.0,0.0,1977.0,0.011654,1.0,1.0,4.0,915.0,0.0,,0.0,0.0,1.0
30,2982.0,6.0,2.0,63.460684,,0.0,13.0,0.0,1977.0,0.243205,1.0,1.0,5.0,1564.0,0.0,540.0,0.0,0.0,1.0
32,2449.0,1.0,1.0,66.426585,,0.0,14.0,0.0,1977.0,0.007122,1.0,1.0,1.0,264.0,0.0,,0.0,1.0,1.0
46,12309.0,45.0,1.0,42.899569,,0.0,25.0,0.0,1977.0,0.195781,1.0,1.0,23.0,5212.0,6.0,,3.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4856,10262.0,27.0,2.0,69.196440,,0.0,10.0,0.0,1977.0,0.017647,1.0,1.0,2.0,469.0,0.0,,0.0,0.0,1.0
4887,12116.0,88.0,3.0,89.549559,,0.0,20.0,0.0,1977.0,0.127376,1.0,1.0,43.0,8429.0,3.0,,3.0,9.0,1.0
4892,16357.0,58.0,1.0,49.205978,,0.0,4.0,0.0,1977.0,0.437885,1.0,1.0,23.0,5735.0,3.0,1084.0,0.0,5.0,1.0
4976,4723.0,95.0,1.0,45.542940,,0.0,6.0,0.0,1977.0,0.000699,1.0,1.0,14.0,3369.0,24.0,4129.0,0.0,3.0,1.0


In [84]:
change_cols(test.loc[(test.Floor > test.HouseFloor) & (test.HouseFloor == 0), 'Id'].values, 'Floor', 'HouseFloor')

In [85]:
import random
wrong_floor = test.loc[test.Floor > test.HouseFloor].index
test.loc[wrong_floor, 'Floor'] = test.loc[wrong_floor, 'HouseFloor'].apply(lambda x: random.randint(1, x))

In [86]:
test.loc[test.HouseYear < 1900, 'HouseYear'] = test['HouseYear'].mode()[0]

In [87]:
test.loc[test.KitchenSquare > 50]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
40,5428.0,27.0,2.0,62.326044,,61.0,12.0,17.0,1977.0,0.072158,1.0,1.0,2.0,629.0,1.0,,0.0,0.0,0.0
1456,5260.0,73.0,3.0,69.358242,51.247581,65.0,6.0,6.0,1931.0,0.042032,1.0,1.0,37.0,6856.0,84.0,1940.0,2.0,5.0,1.0
1777,3341.0,62.0,3.0,112.247841,112.114019,112.0,3.0,3.0,2017.0,0.072158,1.0,1.0,2.0,629.0,1.0,,0.0,0.0,0.0
3816,12612.0,27.0,2.0,60.988496,33.646726,60.0,5.0,17.0,2013.0,0.072158,1.0,1.0,2.0,629.0,1.0,,0.0,0.0,0.0
4281,8015.0,27.0,1.0,66.099096,33.639611,62.0,3.0,7.0,2016.0,0.014058,1.0,1.0,1.0,290.0,0.0,,0.0,0.0,1.0
4405,5199.0,27.0,2.0,61.647531,59.05499,57.0,6.0,12.0,2016.0,0.211401,1.0,1.0,9.0,1892.0,0.0,,0.0,1.0,1.0
4555,12640.0,6.0,2.0,54.629142,31.486308,97.0,4.0,17.0,2015.0,0.243205,1.0,1.0,5.0,1564.0,0.0,540.0,0.0,0.0,1.0


# Столбцы LifeSquare & Healthcare 1 имеют пропуски, заполним их

In [88]:
test['LifeSquare'].fillna(test['Square']*0.67348861, inplace=True)

In [89]:
test.loc[(test.Healthcare_1 == 0), 'DistrictId'].value_counts()

23.0    1
Name: DistrictId, dtype: int64

In [90]:
test.loc[(test.DistrictId == 23), 'DistrictId'].value_counts()

23.0    264
Name: DistrictId, dtype: int64

In [91]:
test.loc[(test.Healthcare_1 == 0), 'Healthcare_1'] = np.nan

In [92]:
healthcare1_mode = test[~test['Healthcare_1'].isna()]['Healthcare_1'].mode()[0]
for i in range(212):
    if test.loc[(test['DistrictId'] == i), 'Healthcare_1'].value_counts().empty:
        health_mode = healthcare1_mode
    else:
        health_mode = test.loc[((test.DistrictId == i)&(~test.DistrictId.isna())), 'Healthcare_1'].mode()[0]
    test.loc[((test['Healthcare_1'].isnull())&(test['DistrictId'] == i)), 'Healthcare_1'] = health_mode
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             5000 non-null   float64
 1   DistrictId     5000 non-null   float64
 2   Rooms          5000 non-null   float64
 3   Square         5000 non-null   float64
 4   LifeSquare     5000 non-null   float64
 5   KitchenSquare  5000 non-null   float64
 6   Floor          5000 non-null   float64
 7   HouseFloor     5000 non-null   float64
 8   HouseYear      5000 non-null   float64
 9   Ecology_1      5000 non-null   float64
 10  Ecology_2      5000 non-null   float64
 11  Ecology_3      5000 non-null   float64
 12  Social_1       5000 non-null   float64
 13  Social_2       5000 non-null   float64
 14  Social_3       5000 non-null   float64
 15  Healthcare_1   5000 non-null   float64
 16  Helthcare_2    5000 non-null   float64
 17  Shops_1        5000 non-null   float64
 18  Shops_2 

In [93]:
test = test.iloc[:, 2:]

In [94]:
test_scaled = scaler.transform(test)
test_scaled = pd.DataFrame(test_scaled, columns=test.columns)

In [95]:
test_scaled

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,0.140728,-0.318058,-0.200076,0.015826,-0.236691,0.167094,-0.705631,1.622043,0.096058,0.172379,-0.783021,-0.652234,-0.295144,0.122668,-0.882358,-0.883079,0.301699
1,0.140728,0.626259,0.541067,-0.918906,-1.205827,-1.798214,-0.432530,-0.354673,0.096058,0.172379,-1.067393,-0.979046,-0.211351,0.019141,-0.882358,-0.468543,0.301699
2,-1.090644,-1.971511,-1.312474,1.137506,-1.012000,-1.193504,-4.146708,-0.993666,0.096058,0.172379,0.297591,0.541838,3.307966,4.107367,2.477360,0.153260,0.301699
3,0.140728,0.810606,0.837906,0.576666,2.864546,1.376515,1.206078,-0.134641,0.096058,0.172379,-0.100529,-0.194797,-0.211351,-0.806875,1.133473,-0.261276,0.301699
4,-1.090644,-0.432831,0.358215,-0.918906,1.895410,0.620627,1.752280,-0.385208,0.096058,0.172379,-1.294890,-1.180468,-0.295144,-0.476468,-0.882358,-0.883079,-3.314559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.372100,0.522510,0.774474,0.015826,-0.430518,-0.588793,-0.651011,-0.992235,0.096058,0.172379,0.638837,0.156445,-0.337040,0.046675,-0.210414,-0.675811,0.301699
4996,-1.090644,-0.789919,-0.852078,0.763613,0.926273,0.620627,1.752280,-0.933608,0.096058,0.172379,-1.351764,-1.271457,-0.337040,-0.806875,-0.882358,-0.675811,0.301699
4997,1.372100,1.044270,0.632743,0.576666,-0.236691,1.376515,0.222913,-0.228016,0.096058,0.172379,2.800060,3.419827,-0.253247,-0.476468,2.477360,2.225936,0.301699
4998,0.140728,1.213007,0.995907,-1.105853,-1.399655,-1.344681,-0.432530,-0.385208,0.096058,0.172379,-1.294890,-1.180468,-0.295144,1.461915,-0.882358,-0.883079,-3.314559


In [96]:
y_predict = boost_best.predict(test_scaled)
y_predict

array([143321.91275738, 210248.3857109 , 206132.02080832, ...,
       323278.02734541, 186672.55781257, 190587.00627325])

In [97]:
ABorodin_predicts = pd.DataFrame({'Id':test_id,
                       'Price':y_predict}, columns=['Id', 'Price'])
ABorodin_predicts

Unnamed: 0,Id,Price
0,725,143321.912757
1,15856,210248.385711
2,5480,206132.020808
3,15664,338964.622128
4,14275,143095.629909
...,...,...
4995,8180,221168.712239
4996,4695,144263.516855
4997,5783,323278.027345
4998,4780,186672.557813


In [98]:
ABorodin_predicts.to_csv('ABorodin_predictions_1.csv', index=False)