In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

RANDOM_STATE = 42

In [2]:
print(pd.__version__)
print(np.__version__)

1.0.3
1.17.0


In [3]:
train_data = pd.read_csv('contest_train_df.csv', parse_dates = ['Date','Start_date'])
test_data = pd.read_csv('contest_test_df.csv', parse_dates = ['Date', 'Start_date'])

print('train_shape: ',train_data.shape)
print('test_shape', test_data.shape)

train_shape:  (2782441, 34)
test_shape (675077, 34)


In [4]:
# генерация признаков 

def simple_feature_estimator(well_history):
    
    id = well_history.id.values[0]
    
    well_history_before_GRP = well_history[well_history.Date<well_history.Start_date.values[0]]
    mean_Q_ois = well_history_before_GRP.Q_OIS.mean()
    median_Q_ois = well_history_before_GRP.Q_OIS.mean()
    std_Q_ois = well_history_before_GRP.Q_OIS.std()
    
    return pd.Series([id, mean_Q_ois, median_Q_ois, std_Q_ois], ['id','mean_Q_ois', 'median_Q_ois', 'std_Q_ois'])

def target_estimator(well_history):
    #print(well_history)
    id = well_history.id.values[0]
    target = well_history[well_history.VNR==1].Q_OIS.values[0]
    
    return pd.Series([id, target], ['id', 'target'])

In [6]:
train_X = train_data.groupby('id', as_index=False).apply(simple_feature_estimator)
train_y = train_data.groupby('id', as_index=False).apply(target_estimator)

test_X = test_data.groupby('id', as_index=False).apply(simple_feature_estimator)


#Заполним промуски в признаках:
train_X.dropna(inplace=True)
test_X.fillna(train_X.mean(), inplace=True)

#join train X и y, чтобы убедиться, что target соответствует признакам
train_dataset = pd.merge(train_X, train_y, on = 'id')

X_train = train_dataset.drop(['id', 'target'], axis=1)
y_train = train_dataset.target

train_dataset.head()

Unnamed: 0,id,mean_Q_ois,median_Q_ois,std_Q_ois,target
0,2.0,1122.068571,1122.068571,86.470019,1104.0
1,4.0,418.269006,418.269006,14.376375,450.0
2,5.0,418.177215,418.177215,13.837666,304.0
3,7.0,1128.247126,1128.247126,20.449342,1118.0
4,10.0,1062.788235,1062.788235,13.633924,1100.0


In [7]:
print(X_train.shape)
print(y_train.shape)

(15611, 3)
(15611,)


In [8]:
def make_sub(model, test_data):
    
    X = test_data.drop('id',axis=1)
    test_predictions = model.predict(X)

    submission = pd.DataFrame()
    submission['id'] = test_data.id.values
    submission['Q_OIS'] = test_predictions
    submission.to_csv('simple_submission.csv', index=False)

In [8]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [6]:
from xgboost import XGBRegressor

In [14]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

xgb = XGBRegressor(random_state=42)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4]}


optimizer_1 = GridSearchCV(estimator=xgb, param_grid=params, cv=3)
optimizer_1.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constrain...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=False, verbosity=None),
             iid='warn', n_jobs=None,
             param_grid={'colsample_bytree': [0.6, 0.7, 0.8, 0.9,

In [16]:
optimizer_1.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0.3,
 'max_depth': 2,
 'min_child_weight': 4,
 'subsample': 0.9}

In [17]:
make_sub(optimizer_1.best_estimator_, test_X)

**Получил ошибку: 31.815**

## Новое решение

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state=42)

#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {'bootstrap': [True, False],
 'max_depth': [5, 10, 20, 30, 40, 50, 60],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]}

'''
params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4]}
'''

optimizer_1 = RandomizedSearchCV(estimator=rf, param_distributions = params, n_iter = 100, cv = 3, random_state=42)
optimizer_1.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [11]:
make_sub(optimizer_1.best_estimator_, test_X)

In [12]:
optimizer_1.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

**Ошибка: 28.59**