In [201]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [257]:
data = pd.read_csv('train.csv')

In [258]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [259]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print (categorical_columns)
print (numerical_columns)

['Ecology_2', 'Ecology_3', 'Shops_2']
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price']


In [260]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare        7887
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1      5202
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [261]:
data = data.fillna(data.median(axis=0), axis=0)

In [262]:
data.count(axis=0)

Id               10000
DistrictId       10000
Rooms            10000
Square           10000
LifeSquare       10000
KitchenSquare    10000
Floor            10000
HouseFloor       10000
HouseYear        10000
Ecology_1        10000
Ecology_2        10000
Ecology_3        10000
Social_1         10000
Social_2         10000
Social_3         10000
Healthcare_1     10000
Helthcare_2      10000
Shops_1          10000
Shops_2          10000
Price            10000
dtype: int64

In [263]:
data_describe = data.describe(include=[object])
data_describe
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]
print (binary_columns, nonbinary_columns)

['Ecology_2', 'Ecology_3', 'Shops_2'] []


In [448]:
for c in binary_columns[1:]:
    top = data_describe[c]['top']
    top_items = data[c] == top
    data.loc[top_items, c] = 0
    data.loc[np.logical_not(top_items), c] = 1

In [267]:
data[binary_columns].describe()

Unnamed: 0,Ecology_2,Ecology_3,Shops_2
count,10000.0,10000.0,10000.0
mean,0.0097,0.0275,0.0825
std,0.098015,0.163543,0.275139
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [268]:
data_numerical = data[numerical_columns]
data_numerical = (data_numerical - data_numerical.mean()) / data_numerical.std()
data_numerical.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.77316e-17,-7.354117000000001e-17,-5.808687000000001e-17,4.078737e-16,-1.08485e-14,1.15101e-16,3.501643e-17,-2.519984e-16,-2.039558e-16,-3.586826e-14,-6.1645130000000004e-18,-4.6621040000000004e-17,1.285211e-15,8.626433e-17,1.193456e-15,-3.351763e-16,-7.169175e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.725329,-1.156311,-2.251904,-2.620239,-0.4685476,-0.2196463,-1.436078,-1.860899,-0.01037488,-0.9985907,-1.408062,-1.29384,-0.3373297,-1.374595,-0.8834356,-0.8803579,-1.668572
25%,-0.8672342,-0.6974645,-1.060735,-0.6904924,-0.1401729,-0.1846334,-0.8636848,-0.5326762,-0.01005568,-0.8503276,-1.065842,-0.9454322,-0.3373297,-0.262982,-0.8834356,-0.6722994,-0.648915
50%,0.002282827,-0.3303876,0.1304329,-0.1805648,-0.04548728,-0.00956902,-0.2912911,0.05764485,-0.01004072,-0.3649128,0.01785244,-0.01676086,-0.2534085,-0.1692315,-0.2139126,-0.2561824,-0.2354762
75%,0.8662432,0.5643624,0.1304329,0.4551485,0.06736973,0.09546962,0.6626983,0.6479659,-0.009921016,0.6462791,0.6452546,0.4679152,-0.1275267,-0.04869521,0.4556104,0.3679931,0.376825
max,1.731747,3.638632,20.38029,27.76755,97.17175,70.2963,6.386635,15.40599,99.99,3.385916,2.812644,3.426885,5.579116,5.119635,3.133702,3.904987,4.51259


In [293]:
data = pd.concat((data_numerical, data[binary_columns]), axis=1)
data = pd.DataFrame(data, dtype=float)
print (data.shape)
print (data.columns)

(10000, 20)
Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price',
       'Ecology_2', 'Ecology_3', 'Shops_2'],
      dtype='object')


In [574]:
fts = ['Square', 'Rooms','Square','HouseFloor','DistrictId', 'Floor','Shops_1', 'Ecology_1',
       'Ecology_2', 'Ecology_3', 'Shops_2','Social_1', 'Social_2',
       'Social_3']
X = data.loc[:, fts]
y = data['Price']

In [575]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=11)

In [576]:
from sklearn.ensemble import RandomForestRegressor as RF

rf = RF(n_estimators=20, max_depth=10, min_samples_leaf=2, random_state=42)

In [577]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [578]:
pred = rf.predict(X_train)

In [579]:
pred

array([ 0.64101026,  0.22650757, -0.26596805, ..., -0.56189074,
       -0.70688438, -0.14573929])

In [580]:
r2(y_train, pred)

0.8241818681347406

In [581]:
pred_valid = rf.predict(X_test)

In [582]:
pred_valid

array([-0.5827365 , -0.70189377, -0.36210718, ...,  0.18321323,
       -0.53748761, -0.49240791])

In [583]:
r2(y_test, pred_valid)

0.7327922160543129

## Предсказания на тесте

In [523]:
test = pd.read_csv('test.csv')

In [524]:
test.shape

(5000, 19)

In [525]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [526]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [546]:
tcategorical_columns = [c for c in test.columns if test[c].dtype.name == 'object']
tnumerical_columns   = [c for c in test.columns if test[c].dtype.name != 'object']
print (tcategorical_columns)
print (tnumerical_columns)

['Ecology_2', 'Ecology_3', 'Shops_2']
['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price']


In [547]:
test_describe = test.describe(include=[object])
test_describe
tbinary_columns    = [c for c in tcategorical_columns if test_describe[c]['unique'] == 2]
tnonbinary_columns = [c for c in tcategorical_columns if test_describe[c]['unique'] > 2]
print (tbinary_columns, tnonbinary_columns)

['Ecology_2', 'Ecology_3', 'Shops_2'] []


In [553]:
for c in tbinary_columns[:1]:
    top = test_describe[c]['top']
    top_items = test[c] == top
    test.loc[top_items, c] = 0
    test.loc[np.logical_not(top_items), c] = 1

In [554]:
test[tbinary_columns].describe()

Unnamed: 0,Ecology_2,Ecology_3,Shops_2
count,5000.0,5000.0,5000.0
mean,0.0096,0.0298,0.0824
std,0.097518,0.170052,0.275001
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [555]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       5000
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     5000
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
Price            5000
dtype: int64

In [556]:
test = test.fillna(data.median(axis=0), axis=0)

In [557]:
test.count(axis=0)

Id               5000
DistrictId       5000
Rooms            5000
Square           5000
LifeSquare       5000
KitchenSquare    5000
Floor            5000
HouseFloor       5000
HouseYear        5000
Ecology_1        5000
Ecology_2        5000
Ecology_3        5000
Social_1         5000
Social_2         5000
Social_3         5000
Healthcare_1     5000
Helthcare_2      5000
Shops_1          5000
Shops_2          5000
Price            5000
dtype: int64

In [558]:
test_numerical = test[tnumerical_columns]
test_numerical = (test_numerical - test_numerical.mean()) / test_numerical.std()
test_numerical.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-2.9198870000000005e-17,-3.6959320000000006e-17,6.830647e-17,4.378004e-15,1.771048e-14,-8.111289000000001e-17,6.433742000000001e-17,-1.233236e-16,5.486611e-15,4.907785e-15,-1.227352e-16,-1.509459e-16,-2.783551e-16,1.595286e-16,-1.409717e-15,2.756018e-16,-2.651104e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.740568,-1.160702,-2.277621,-2.884385,-2.20869,-0.6006823,-1.391881,-1.856032,-4.113067,-0.9983686,-1.422172,-1.301068,-0.3462405,-1.342816,-0.8915228,-0.8881047,-2.138507
25%,-0.8671898,-0.6853682,-1.085149,-0.7617154,-0.6054156,-0.50018,-0.8447579,-0.5304002,-0.6133909,-0.8358898,-1.079944,-0.9543749,-0.3462405,-0.2861959,-0.8915228,-0.6787843,-0.4077526
50%,-0.01905682,-0.323209,0.1073224,-0.1847902,-0.1608801,0.002331654,-0.2976349,-0.08852277,-0.3980262,-0.3974049,0.003775909,-0.03027357,-0.2624314,-0.1687936,-0.2158196,-0.2601434,-0.106253
75%,0.8661156,0.582189,0.1073224,0.5151489,0.3948377,0.3038386,0.6142367,0.6479396,0.8403206,0.6321919,0.6311928,0.4669183,-0.1367178,-0.0513914,0.4598836,0.3678179,0.6996465
max,1.734527,3.637907,17.9944,8.746978,16.81305,61.71076,12.65094,12.72592,1.917144,3.347996,2.798633,3.396426,5.5623,4.982556,3.162696,3.926265,4.900124


In [568]:
test = pd.concat((test_numerical, test[tbinary_columns]), axis=1)
test = pd.DataFrame(test, dtype=float)
print (test.shape)
print (test.columns)

(5000, 20)
Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price',
       'Ecology_2', 'Ecology_3', 'Shops_2'],
      dtype='object')


In [584]:
pred_test = rf.predict(test.loc[:, fts])
pred_test

array([-0.75009135,  0.36131817,  2.15368183, ...,  1.3197813 ,
       -0.02049804, -0.15581557])

In [585]:
pred_test.shape

(5000,)

In [586]:
test['Price'] = pred_test

In [588]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price,Ecology_2,Ecology_3,Shops_2
0,-1.590754,0.152125,0.107322,-0.343944,-0.12898,0.002332,-0.480009,0.206062,-0.667232,1.585122,-0.794755,-0.660331,-0.304336,-0.168794,-0.891523,-0.888105,-0.750091,0.0,0.0,0.0
1,1.540225,0.514284,0.107322,0.671127,-0.16088,-0.50018,-0.480009,-1.70874,-0.398026,-0.367246,-1.079944,-0.985915,-0.220527,-0.168794,-0.891523,-0.469464,0.361318,0.0,0.0,0.0
2,-0.606827,3.139938,-1.085149,-2.244391,-1.227576,0.605346,-1.209507,-1.11957,-4.059226,-0.998369,0.288965,0.529254,3.299455,4.790799,2.486993,0.158497,2.153682,0.0,0.0,0.0
3,1.500495,-0.096859,0.107322,0.869287,1.033926,0.303839,2.43798,1.384402,1.217209,-0.149925,-0.1103,-0.204614,-0.220527,-0.168794,1.135587,-0.260143,1.611686,0.0,0.0,0.0
4,1.213077,-0.549558,-1.085149,-0.467317,0.496503,-0.50018,1.526108,0.64794,1.75562,-0.397405,-1.308096,-1.18658,-0.304336,-0.168794,-0.891523,-0.888105,-0.869087,0.0,0.0,1.0


In [589]:
test.loc[:, ['Id', 'Price']].to_csv('EReukova_predictions.csv', index=None)