In [201]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [730]:
data = pd.read_csv('train.csv')

In [731]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [732]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print (categorical_columns)
print (numerical_columns)

['Ecology_2', 'Ecology_3', 'Shops_2']
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price']


In [733]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare        7887
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1      5202
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [734]:
data = data.fillna(data.median(axis=0), axis=0)

In [735]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare       10000
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1     10000
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [736]:
data_describe = data.describe(include=[object])
data_describe
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]
print (binary_columns, nonbinary_columns)

['Ecology_2', 'Ecology_3', 'Shops_2'] []


In [739]:
for c in binary_columns[1:]:
    top = data_describe[c]['top']
    top_items = data[c] == top
    data.loc[top_items, c] = 0
    data.loc[np.logical_not(top_items), c] = 1

In [740]:
data[binary_columns].describe()

Unnamed: 0,Ecology_2,Ecology_3,Shops_2
count,10000.0,10000.0,10000.0
mean,0.0097,0.0275,0.0825
std,0.098015,0.163543,0.275139
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [741]:
numerical = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 
             'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price']
data_numerical = data[numerical]

In [742]:
from sklearn import preprocessing
standardized_X = preprocessing.scale(data_numerical)
data_numerical.describe()

  


Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,50.4008,1.8905,56.315775,36.26604,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1026.3589,1.3195,4.2313,214138.857399
std,43.587592,0.839512,21.058732,76.609981,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,746.662828,1.493601,4.806341,92872.293865
min,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,20.0,1.0,41.774881,25.527399,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,830.0,0.0,1.0,153872.633942
50%,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,75.0,2.0,65.900625,41.427234,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,990.0,2.0,6.0,249135.462171
max,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [743]:
ID = ['Id']
ID_add =  data[ID]

In [745]:
data = pd.concat((ID_add, data_numerical, data[binary_columns]), axis=1)
data = pd.DataFrame(data, dtype=float)
print (data.shape)
print (data.columns)

(10000, 20)
Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price',
       'Ecology_2', 'Ecology_3', 'Shops_2'],
      dtype='object')


In [746]:
fts = ['Square', 'Rooms','Square','HouseFloor','DistrictId', 'Floor','Shops_1', 'Ecology_1',
       'Ecology_2', 'Ecology_3', 'Shops_2','Social_1', 'Social_2',
       'Social_3']
X = data.loc[:, fts]
y = data['Price']

In [710]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=11)

In [711]:
from sklearn.ensemble import RandomForestRegressor as RF

rf = RF(n_estimators=20, max_depth=10, min_samples_leaf=2, random_state=42)

In [712]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [713]:
pred = rf.predict(X_train)

In [714]:
pred

array([273670.95078274, 235175.13532304, 189437.79405334, ...,
       154688.16439332, 148488.88339614, 200603.71526266])

In [715]:
r2(y_train, pred)

0.8242342549968399

In [716]:
pred_valid = rf.predict(X_test)

In [717]:
pred_valid

array([160018.78192307, 148952.37246737, 180509.13260894, ...,
       229940.67983783, 164221.15041028, 168407.80554243])

In [718]:
r2(y_test, pred_valid)

0.7314974156241891

## Предсказания на тесте

In [759]:
test = pd.read_csv('test.csv')

In [760]:
test.shape

(5000, 19)

In [761]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [762]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [763]:
tcategorical_columns = [c for c in test.columns if test[c].dtype.name == 'object']
tnumerical_columns   = [c for c in test.columns if test[c].dtype.name != 'object']
print (tcategorical_columns)
print (tnumerical_columns)

['Ecology_2', 'Ecology_3', 'Shops_2']
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1']


In [764]:
test_describe = test.describe(include=[object])
test_describe
tbinary_columns    = [c for c in tcategorical_columns if test_describe[c]['unique'] == 2]
tnonbinary_columns = [c for c in tcategorical_columns if test_describe[c]['unique'] > 2]
print (tbinary_columns, tnonbinary_columns)

['Ecology_2', 'Ecology_3', 'Shops_2'] []


In [767]:
for c in tbinary_columns[1:]:
    top = test_describe[c]['top']
    top_items = test[c] == top
    test.loc[top_items, c] = 0
    test.loc[np.logical_not(top_items), c] = 1

In [768]:
test[tbinary_columns].describe()

Unnamed: 0,Ecology_2,Ecology_3,Shops_2
count,5000.0,5000.0,5000.0
mean,0.0096,0.0298,0.0824
std,0.097518,0.170052,0.275001
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [769]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       3959
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     2623
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
dtype: int64

In [770]:
test = test.fillna(data.median(axis=0), axis=0)

In [771]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       5000
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     5000
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
dtype: int64

In [773]:
numerical = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 
             'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1']
test_numerical = test[numerical]

In [774]:
from sklearn import preprocessing
standardized_t = preprocessing.scale(test_numerical)
test_numerical.describe()

  


Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,51.2792,1.91,56.4495,35.455604,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1029.3964,1.3194,4.2428
std,44.179466,0.838594,19.092787,15.920269,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,766.595258,1.47994,4.777365
min,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,21.0,1.0,41.906231,25.850152,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,810.0,0.0,1.0
50%,37.0,2.0,52.92134,32.78126,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,77.0,2.0,66.285129,41.769526,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,990.0,2.0,6.0
max,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [775]:
ID = ['Id']
ID_add = test[ID]

In [783]:
test = pd.concat((ID_add, test_numerical, test[tbinary_columns]), axis=1)
test = pd.DataFrame(test, dtype=int)
print (test.shape)
print (test.columns)
test.head()

(5000, 19)
Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Ecology_2',
       'Ecology_3', 'Shops_2'],
      dtype='object')


Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2,Ecology_3,Shops_2
0,725,58,2,49,33,6,6,14,1972,0,11,2748,1,900,0,0,0,0,0
1,15856,74,2,69,32,1,6,1,1977,0,6,1437,3,900,0,2,0,0,0
2,5480,190,1,13,15,12,2,5,1909,0,30,7538,87,4702,5,5,0,0,0
3,15664,47,2,73,51,9,22,22,2007,0,23,4583,3,900,3,3,0,0,0
4,14275,27,1,47,43,1,17,17,2017,0,2,629,1,900,0,0,0,0,1


In [784]:
pred_test = rf.predict(test.loc[:, fts])
pred_test

array([145036.84482809, 185742.63425238, 181958.40904754, ...,
       394124.91137629, 222554.19565041, 181017.93830332])

In [785]:
pred_test.shape

(5000,)

In [786]:
test['Price'] = pred_test

In [789]:
test.head(50)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2,Ecology_3,Shops_2,Price
0,725,58,2,49,33,6,6,14,1972,0,11,2748,1,900,0,0,0,0,0,145036.844828
1,15856,74,2,69,32,1,6,1,1977,0,6,1437,3,900,0,2,0,0,0,185742.634252
2,5480,190,1,13,15,12,2,5,1909,0,30,7538,87,4702,5,5,0,0,0,181958.409048
3,15664,47,2,73,51,9,22,22,2007,0,23,4583,3,900,3,3,0,0,0,449853.846058
4,14275,27,1,47,43,1,17,17,2017,0,2,629,1,900,0,0,0,0,1,114694.009463
5,7633,53,1,40,32,1,21,21,1977,0,34,7759,0,229,1,3,0,0,0,182620.870936
6,13329,23,2,68,64,1,2,17,1977,0,6,1437,3,900,0,2,0,0,0,175732.879288
7,5502,32,2,48,32,6,5,14,1972,0,46,7960,6,350,3,11,0,0,0,217590.860812
8,4220,96,3,72,45,9,17,17,1997,0,53,14892,4,900,1,4,0,0,0,321536.808868
9,11538,6,3,80,47,9,13,17,2014,0,5,1564,0,540,0,0,0,0,0,228016.579208


In [788]:
test.loc[:, ['Id', 'Price']].to_csv('EReukova_predictions.csv', index=None)