In [201]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [257]:
data = pd.read_csv('train.csv')

In [258]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [259]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print (categorical_columns)
print (numerical_columns)

['Ecology_2', 'Ecology_3', 'Shops_2']
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price']


In [260]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare        7887
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1      5202
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [261]:
data = data.fillna(data.median(axis=0), axis=0)

In [262]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare       10000
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1     10000
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [263]:
data_describe = data.describe(include=[object])
data_describe
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]
print (binary_columns, nonbinary_columns)

['Ecology_2', 'Ecology_3', 'Shops_2'] []


In [448]:
for c in binary_columns[1:]:
    top = data_describe[c]['top']
    top_items = data[c] == top
    data.loc[top_items, c] = 0
    data.loc[np.logical_not(top_items), c] = 1

In [267]:
data[binary_columns].describe()

Unnamed: 0,Ecology_2,Ecology_3,Shops_2
count,10000.0,10000.0,10000.0
mean,0.0097,0.0275,0.0825
std,0.098015,0.163543,0.275139
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [268]:
data_numerical = data[numerical_columns]
data_numerical = (data_numerical - data_numerical.mean()) / data_numerical.std()
data_numerical.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.77316e-17,-7.354117000000001e-17,-5.808687000000001e-17,4.078737e-16,-1.08485e-14,1.15101e-16,3.501643e-17,-2.519984e-16,-2.039558e-16,-3.586826e-14,-6.1645130000000004e-18,-4.6621040000000004e-17,1.285211e-15,8.626433e-17,1.193456e-15,-3.351763e-16,-7.169175e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.725329,-1.156311,-2.251904,-2.620239,-0.4685476,-0.2196463,-1.436078,-1.860899,-0.01037488,-0.9985907,-1.408062,-1.29384,-0.3373297,-1.374595,-0.8834356,-0.8803579,-1.668572
25%,-0.8672342,-0.6974645,-1.060735,-0.6904924,-0.1401729,-0.1846334,-0.8636848,-0.5326762,-0.01005568,-0.8503276,-1.065842,-0.9454322,-0.3373297,-0.262982,-0.8834356,-0.6722994,-0.648915
50%,0.002282827,-0.3303876,0.1304329,-0.1805648,-0.04548728,-0.00956902,-0.2912911,0.05764485,-0.01004072,-0.3649128,0.01785244,-0.01676086,-0.2534085,-0.1692315,-0.2139126,-0.2561824,-0.2354762
75%,0.8662432,0.5643624,0.1304329,0.4551485,0.06736973,0.09546962,0.6626983,0.6479659,-0.009921016,0.6462791,0.6452546,0.4679152,-0.1275267,-0.04869521,0.4556104,0.3679931,0.376825
max,1.731747,3.638632,20.38029,27.76755,97.17175,70.2963,6.386635,15.40599,99.99,3.385916,2.812644,3.426885,5.579116,5.119635,3.133702,3.904987,4.51259


In [293]:
data = pd.concat((data_numerical, data[binary_columns]), axis=1)
data = pd.DataFrame(data, dtype=float)
print (data.shape)
print (data.columns)

(10000, 20)
Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price',
       'Ecology_2', 'Ecology_3', 'Shops_2'],
      dtype='object')


In [450]:
fts = ['Square', 'Rooms','Square','HouseFloor','DistrictId', 'Floor','Shops_1',
       'Social_1', 'Social_2',
       'Social_3']
X = data.loc[:, fts]
y = data['Price']

In [451]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=11)

In [452]:
from sklearn.ensemble import RandomForestRegressor as RF

rf = RF(n_estimators=20, max_depth=10, min_samples_leaf=2, random_state=42)

In [453]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [454]:
pred = rf.predict(X_train)

In [455]:
pred

array([ 0.5752549 ,  0.20775902, -0.26085305, ..., -0.52128753,
       -0.7116104 , -0.16025895])

In [456]:
r2(y_train, pred)

0.8206721192170889

In [457]:
pred_valid = rf.predict(X_test)

In [458]:
pred_valid

array([-0.54213328, -0.66420139, -0.46755009, ...,  0.19059092,
       -0.55013113, -0.47681023])

In [459]:
r2(y_test, pred_valid)

0.7324978588500186

## Предсказания на тесте

In [460]:
test = pd.read_csv('test.csv')

In [461]:
test.shape

(5000, 19)

In [462]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [463]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [464]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print (categorical_columns)
print (numerical_columns)

[]
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price', 'Ecology_2', 'Ecology_3', 'Shops_2']


In [465]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       3959
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     2623
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
dtype: int64

In [466]:
test = test.fillna(test.median(axis=0), axis=0)

In [467]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       5000
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     5000
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
dtype: int64

In [468]:
pred_test = rf.predict(test.loc[:, fts])
pred_test

array([1.10794138, 1.45930164, 0.66118523, ..., 1.04700135, 1.46597252,
       0.95118463])

In [469]:
pred_test.shape

(5000,)

In [470]:
test['Price'] = pred_test

In [471]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,900.0,0,0,B,1.107941
1,15856,74,2.0,69.263183,32.925087,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,900.0,0,2,B,1.459302
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B,0.661185
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,900.0,3,3,B,1.258893
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,900.0,0,0,A,0.726871


In [474]:
test.loc[:, ['Id', 'Price']].to_csv('EReukova_predictions.csv', index=None)