# Подгрузка пакетов

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score

# Метрики
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# Определение функций

In [2]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Загрузка данных

In [3]:
dataset = pd.read_csv(r'data\train.csv')

In [4]:
dataset.describe()

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,5944923.0,14654.93,1390.895,26722.45,4530.164,26409.96,30708.11,16865.22,4669.208,2569407.0,...,467605.7,444623.9,805621.9,781296.6,143.529939,121380.9,35734.51,312374.1,92199.6,227910.0
std,8234312.0,389329.8,64283.02,569965.2,235912.4,1514730.0,577059.0,751275.6,187944.9,9610183.0,...,4068038.0,4428889.0,4513246.0,6839451.0,9584.318507,4720709.0,1614622.0,4318501.0,1635993.0,1811139.0
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2260000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40000000.0,20000000.0,4000000.0,20000000.0,14800000.0,100000000.0,20708000.0,40000000.0,10400000.0,319612000.0,...,76000000.0,123588000.0,130000000.0,144400000.0,640000.0,301312000.0,106420000.0,140000000.0,61768000.0,43200000.0


In [5]:
X = dataset.drop(['target', 'ID'], axis=1)
Y = dataset.target

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.251)
train_X, val_1_X, train_Y, val_1_Y = train_test_split(train_X, train_Y, test_size=0.3333)
val_2_X, ans_X, val_2_Y, ans_Y = train_test_split(train_X, train_Y, test_size=0.5)

In [6]:
test_Y.shape, val_1_Y.shape, val_2_Y.shape, ans_Y.shape

((1120,), (1113,), (1113,), (1113,))

# Построение первой модели

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
parameters = {'max_depth':[10], 'n_estimators':[100, 150, 200]}

RFR = RandomForestRegressor(max_depth=10, n_estimators=200, n_jobs=-1)#, random_state=123)
#RFR = GridSearchCV(RFR, parameters, scoring='r2', cv=None)
RFR.fit(val_2_X, val_2_Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

### Смотрим значение метрики на валидационном множестве

In [9]:
RFR_pred_val = RFR.predict(ans_X)
print('r2 score test: ', r2_score(ans_Y, RFR_pred_val))
print('rmsle score test: ', rmsle(ans_Y, RFR_pred_val))

r2 score test:  0.21962996894917974
rmsle score test:  1.9035920421395607


In [10]:
# r2 score test:  0.14942894737336188
# rmsle score test:  1.8407483726739988

### Смотрим значение метрики на тестовом множестве

In [11]:
RFR_pred_test = RFR.predict(test_X)
print('r2 score test: ', r2_score(test_Y, RFR_pred_test))
print('rmsle score test: ', rmsle(test_Y, RFR_pred_test))

r2 score test:  0.21547948206459366
rmsle score test:  1.9841915120422828


# Построение второй модели

In [12]:
import xgboost

In [13]:
parameters = {'n_estimators':[90, 100, 130]}

xg = xgboost.XGBRegressor(n_estimators=200, n_jobs=-1)#, random_state=123)
#xg = GridSearchCV(xg, parameters, scoring='r2', cv=None)
xg.fit(ans_X, ans_Y)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

### Смотрим значение метрики на валидационном множестве

In [14]:
XGP_pred_val = xg.predict(val_1_X)
print('r2 score test: ', r2_score(val_1_Y, XGP_pred_val))
#print('rmsle score test: ', rmsle(val_1_Y, XGP_pred_val))

r2 score test:  0.1674000246675451


### Смотрим значение метрики на тестовом множестве

In [15]:
XGP_pred_test = xg.predict(test_X)
print('r2 score test: ', r2_score(test_Y, XGP_pred_test))
#print('rmsle score test: ', rmsle(test_Y, XGP_pred_test))

r2 score test:  0.22367355014442225


# Построение Ансамбля

In [16]:
Ensemble_X = np.vstack((RFR_pred_val, XGP_pred_val)).T
Ensemble_X

array([[ 4471320.33917409,  5127246.5       ],
       [ 4402114.78552383,  4419711.5       ],
       [ 4398897.47654351, 26233580.        ],
       ...,
       [ 4402114.78552383,  4196656.5       ],
       [ 4476522.52324623, 16453308.        ],
       [ 4267868.94393856,  5078638.5       ]])

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
LR = LinearRegression(n_jobs=-1,fit_intercept=False)
LR.fit(Ensemble_X, val_1_Y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=-1, normalize=False)

In [19]:
LR.coef_.sum()

0.945962237107689

In [20]:
weights = LR.coef_
Ensemble_pred = RFR_pred_test * weights[0] + XGP_pred_test * weights[1]
print('r2 score test: ', r2_score(test_Y, Ensemble_pred))
print('rmsle score test: ', rmsle(test_Y, Ensemble_pred))

r2 score test:  0.2361458464504511
rmsle score test:  1.8589712623545622
