In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm
import os
import lightgbm as lgb
import warnings
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
import gc
from gensim.models import Word2Vec
import time

In [2]:
train2018 = pd.read_csv('../管网压力预测-数据/train_水压数据_2018.csv')
train2019 = pd.read_csv('../管网压力预测-数据/train_水压数据_2019.csv')
train2020 = pd.read_csv('../管网压力预测-数据/test_水压数据_2020.csv')
test = pd.read_csv('../管网压力预测-数据/to_predict.csv')
submit = pd.read_csv('../管网压力预测-数据/submit.csv')

In [3]:
train2018.head()

Unnamed: 0,Time,MeasName,H0,H1,H2,H3,H4,H5,H6,H7,...,H14,H15,H16,H17,H18,H19,H20,H21,H22,H23
0,2018-01-01,站点4,0.40275,0.407625,0.418125,0.42525,0.426,0.42525,0.417375,0.426375,...,0.34875,0.35925,0.3555,0.34425,0.352125,0.35625,0.34725,0.343875,0.356625,0.418875
1,2018-01-01,站点7,0.214375,0.22675,0.232375,0.233125,0.235,0.23275,0.230875,0.22,...,0.187375,0.19675,0.19975,0.19225,0.18625,0.18325,0.17725,0.163375,0.16525,0.199375
2,2018-01-01,站点22,0.247,0.248125,0.271375,0.251125,0.272125,0.256375,0.257125,0.2425,...,0.2455,0.242875,0.238375,0.230875,0.23725,0.236875,0.2365,0.2365,0.241,0.2545
3,2018-01-01,站点21,0.28425,0.289875,0.2835,0.28125,0.288375,0.28875,0.28575,0.25575,...,0.227625,0.238125,0.2385,0.218625,0.207,0.212625,0.20925,0.189,0.217875,0.27
4,2018-01-01,站点20,0.292875,0.295875,0.30525,0.298875,0.310125,0.30075,0.288375,0.2625,...,0.2475,0.241125,0.243375,0.2325,0.233625,0.22425,0.219375,0.202125,0.219375,0.2865


In [4]:
train2020.head()

Unnamed: 0,Time,MeasName,H0,H1,H2,H3,H4,H5,H6,H7,...,H14,H15,H16,H17,H18,H19,H20,H21,H22,H23
0,2020-01-01,站点4,0.417375,0.431625,0.437625,0.4395,0.447375,0.445125,0.4365,0.42225,...,0.337875,0.336,0.33975,0.327,0.32025,0.33075,0.336375,0.326625,0.350625,0.386625
1,2020-01-01,站点7,0.292375,0.32125,0.32725,0.324625,0.33175,0.31675,0.30925,0.29875,...,0.268,0.26725,0.264625,0.261625,0.250375,0.256375,0.255625,0.23725,0.257875,0.291625
2,2020-01-01,站点22,0.25525,0.307,0.327625,0.3235,0.343,0.3145,0.304,0.259375,...,0.231625,0.226375,0.22825,0.229,0.2305,0.226375,0.229375,0.226,0.22975,0.2395
3,2020-01-01,站点21,0.292875,0.313125,0.321,0.314625,0.321375,0.318375,0.303375,0.288375,...,0.26325,0.25725,0.265125,0.253875,0.2475,0.258,0.247875,0.23775,0.252,0.286125
4,2020-01-01,站点20,0.288375,0.3165,0.32025,0.31425,0.3225,0.318,0.30075,0.283875,...,0.264,0.256875,0.26175,0.258375,0.253125,0.259125,0.252,0.24,0.256125,0.285375


In [5]:
test.head()

Unnamed: 0,id,Time,MeasName,Hour
0,0,2020-02-03,站点4,H0
1,1,2020-02-03,站点4,H1
2,2,2020-02-03,站点4,H2
3,3,2020-02-03,站点4,H3
4,4,2020-02-03,站点4,H4


In [6]:
submit.head()

Unnamed: 0,id,pressure
0,0,0.4335
1,1,0.4335
2,2,0.4335
3,3,0.4335
4,4,0.4335


In [7]:
def reshape_data(df1):
    time = df1["Time"].values
    meas = df1["MeasName"].values

    df_list = []

    for i in range(0,24):
        hour = "H"+str(i)
        pressure = df1[hour].values
        df2 = pd.DataFrame()
        df2["Time"] = time
        df2["MeasName"] = meas
        df2["Hour"] = hour
        df2["pressure"] = pressure

        df_list.append(df2)

    df3 = pd.concat(df_list)
    df3.sort_values(by = ['Time', 'MeasName'], inplace = True)
    df3 = df3.reset_index(drop=True)
    return df3

In [8]:
train2018 = reshape_data(train2018)
train2019 = reshape_data(train2019)
train2020 = reshape_data(train2020)

In [9]:
train2018.head()

Unnamed: 0,Time,MeasName,Hour,pressure
0,2018-01-01,站点1,H0,0.288625
1,2018-01-01,站点1,H1,0.292
2,2018-01-01,站点1,H2,0.2905
3,2018-01-01,站点1,H3,0.2995
4,2018-01-01,站点1,H4,0.30025


#### 特征工程

In [10]:
train2018['Time_time'] = pd.to_datetime(train2018['Time'])
train2019['Time_time'] = pd.to_datetime(train2019['Time'])
train2020['Time_time'] = pd.to_datetime(train2020['Time'])
test['Time_time'] = pd.to_datetime(test['Time'])

In [11]:
def abnormal(df):
    '''
    处理-9999异常值: 上一个值填充
    '''
    index_value = list(df[df['pressure'] == -99999].index)
    for i in index_value:
        value = df[df.index== (i - 1)]['pressure'].iloc[0]
        df.loc[i, 'pressure'] = value
    return df
train2018 = abnormal(train2018)
train2019 = abnormal(train2019)
train2020 = abnormal(train2020)

In [12]:
def feature1(df):
    df['Day'] = df['Time'].apply(lambda x: int(x.split('-')[-1]))
    df['Hour'] = df['Hour'].apply(lambda x: int(x.replace('H', '')))
    df['MeasName'] = df['MeasName'].apply(lambda x: int(x.replace('站点', '')))
    
    return df

In [13]:
train2018 = feature1(train2018)
train2019 = feature1(train2019)
train2020 = feature1(train2020)
test = feature1(test)

In [14]:
train2020.head()

Unnamed: 0,Time,MeasName,Hour,pressure,Time_time,Day
0,2020-01-01,1,0,0.309625,2020-01-01,1
1,2020-01-01,1,1,0.32575,2020-01-01,1
2,2020-01-01,1,2,0.332875,2020-01-01,1
3,2020-01-01,1,3,0.32425,2020-01-01,1
4,2020-01-01,1,4,0.33175,2020-01-01,1


#### 训练集、测试集

训练数据 2018.1.1 - 2019.12.31

验证数据 2020.1.1 - 2020.1.31；2020.3.1 - 2020.3.31；2020.5.1 - 2020.5.31；2020.8.1 - 2020.8.31

测试数据 2020.2.3 - 2020.2.16；2020.4.6 - 2020.4.19；2020.6.1 - 2020.6.14；2020.9.7 - 2020.9.20

分段1 训练集、测试集

In [15]:
train2019.head()

Unnamed: 0,Time,MeasName,Hour,pressure,Time_time,Day
0,2019-01-01,1,0,0.225625,2019-01-01,1
1,2019-01-01,1,1,0.24625,2019-01-01,1
2,2019-01-01,1,2,0.259,2019-01-01,1
3,2019-01-01,1,3,0.251875,2019-01-01,1
4,2019-01-01,1,4,0.26575,2019-01-01,1


In [16]:
train2019Mon2 = train2019[(train2019['Time_time'] >= '2019-2-1') & (train2019['Time_time'] <= '2019-2-28')]
train2019Mon1 = train2019[(train2019['Time_time'] >= '2019-1-1') & (train2019['Time_time'] <= '2019-1-28')]
Mon_2_1_2019 = train2019Mon2['pressure'].mean() - train2019Mon1['pressure'].mean()

train1 = train2020[(train2020['Time_time'] >= '2020-1-1') & (train2020['Time_time'] <= '2020-1-31')]
test1 = test[(test['Time_time'] >= '2020-2-3') & (test['Time_time'] <= '2020-2-16')]


used_feat = [f for f in train1.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

feat nums  3 ['MeasName', 'Hour', 'Day']


In [17]:
train1.head()

Unnamed: 0,Time,MeasName,Hour,pressure,Time_time,Day
0,2020-01-01,1,0,0.309625,2020-01-01,1
1,2020-01-01,1,1,0.32575,2020-01-01,1
2,2020-01-01,1,2,0.332875,2020-01-01,1
3,2020-01-01,1,3,0.32425,2020-01-01,1
4,2020-01-01,1,4,0.33175,2020-01-01,1


In [18]:
train_x = train1[used_feat]
train_y = train1['pressure']
test_x = test1[used_feat]
print(train_x.shape, test_x.shape)

# -----------------------------------------------
scores = []

params = {'learning_rate': 0.1, 
        'boosting_type': 'gbdt', 
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_child_samples': 46, 
        'min_child_weight': 0.01,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 2, 
        'num_leaves': 16, 
        'max_depth': 5, 
        'n_jobs': -1, 
        'seed': 2019, 
        'verbosity': -1, 
       }



oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997]
for seed in seeds: 
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn, x_val, y_val = train_x.iloc[trn_idx], train_y.iloc[trn_idx], train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round=500000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=50,
                          verbose_eval=50)
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()
    
    mse = (mean_squared_error(oof_train, train1['pressure']))
    
    print('-'*120)
    print('mse ', round(mse, 5))


test1['pressure'] = preds + Mon_2_1_2019


(22320, 3) (10080, 3)
fold  1
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0173142	valid_1's l1: 0.0169753
[100]	training's l1: 0.0147663	valid_1's l1: 0.0145043
[150]	training's l1: 0.01405	valid_1's l1: 0.0138525
[200]	training's l1: 0.0137465	valid_1's l1: 0.0135758
[250]	training's l1: 0.0135952	valid_1's l1: 0.0134684
[300]	training's l1: 0.0135064	valid_1's l1: 0.0134122
[350]	training's l1: 0.0134427	valid_1's l1: 0.0133583
[400]	training's l1: 0.0133877	valid_1's l1: 0.0133215
[450]	training's l1: 0.0133501	valid_1's l1: 0.0132927
[500]	training's l1: 0.0133263	valid_1's l1: 0.0132767
[550]	training's l1: 0.0133091	valid_1's l1: 0.013259
[600]	training's l1: 0.0132958	valid_1's l1: 0.0132594
Early stopping, best iteration is:
[563]	training's l1: 0.0133034	valid_1's l1: 0.0132544
fold  2
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0177608	valid_1's l1: 0.0179471
[100]	training's l1: 0.0145848	valid_1's

[450]	training's l1: 0.0132413	valid_1's l1: 0.0137042
[500]	training's l1: 0.0132185	valid_1's l1: 0.0136945
[550]	training's l1: 0.0131965	valid_1's l1: 0.0136771
[600]	training's l1: 0.0131779	valid_1's l1: 0.0136626
[650]	training's l1: 0.013165	valid_1's l1: 0.0136518
[700]	training's l1: 0.0131538	valid_1's l1: 0.0136475
[750]	training's l1: 0.0131479	valid_1's l1: 0.0136453
[800]	training's l1: 0.0131368	valid_1's l1: 0.0136407
[850]	training's l1: 0.0131301	valid_1's l1: 0.0136391
[900]	training's l1: 0.0131234	valid_1's l1: 0.013639
[950]	training's l1: 0.013117	valid_1's l1: 0.0136389
Early stopping, best iteration is:
[928]	training's l1: 0.0131194	valid_1's l1: 0.0136361
fold  5
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0177235	valid_1's l1: 0.0174139
[100]	training's l1: 0.0146986	valid_1's l1: 0.0144505
[150]	training's l1: 0.0140149	valid_1's l1: 0.0138397
[200]	training's l1: 0.0137116	valid_1's l1: 0.0135708
[250]	training's l1:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


分段2 训练集、测试集

In [19]:
train2019Mon4 = train2019[(train2019['Time_time'] >= '2019-4-1') & (train2019['Time_time'] <= '2019-4-30')]
train2019Mon3 = train2019[(train2019['Time_time'] >= '2019-3-1') & (train2019['Time_time'] <= '2019-3-30')]
Mon_4_3_2019 = train2019Mon4['pressure'].mean() - train2019Mon3['pressure'].mean()


train2 = train2020[(train2020['Time_time'] >= '2020-3-1') & (train2020['Time_time'] <= '2020-3-31')]
test2 = test[(test['Time_time'] >= '2020-4-6') & (test['Time_time'] <= '2020-4-19')]


used_feat = [f for f in train2.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

feat nums  3 ['MeasName', 'Hour', 'Day']


In [20]:
train_x = train2[used_feat]
train_y = train2['pressure']
test_x = test2[used_feat]
print(train_x.shape, test_x.shape)

# -----------------------------------------------
scores = []

params = {'learning_rate': 0.1, 
        'boosting_type': 'gbdt', 
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_child_samples': 46, 
        'min_child_weight': 0.01,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 2, 
        'num_leaves': 16, 
        'max_depth': 5, 
        'n_jobs': -1, 
        'seed': 2019, 
        'verbosity': -1, 
       }



oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997] 
for seed in seeds: 
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn, x_val, y_val = train_x.iloc[trn_idx], train_y.iloc[trn_idx], train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round=500000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=50,
                          verbose_eval=50)
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()
    
    mse = (mean_squared_error(oof_train, train2['pressure']))
    
    print('-'*120)
    print('mse ', round(mse, 5))

    
test2['pressure'] = preds + Mon_4_3_2019


(22320, 3) (10080, 3)
fold  1
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.018869	valid_1's l1: 0.0174423
[100]	training's l1: 0.0158103	valid_1's l1: 0.0144793
[150]	training's l1: 0.0151531	valid_1's l1: 0.0139297
[200]	training's l1: 0.0148434	valid_1's l1: 0.0136581
[250]	training's l1: 0.0146681	valid_1's l1: 0.0135006
[300]	training's l1: 0.0145683	valid_1's l1: 0.0134137
[350]	training's l1: 0.0144844	valid_1's l1: 0.0133426
[400]	training's l1: 0.0144234	valid_1's l1: 0.0132958
[450]	training's l1: 0.0143911	valid_1's l1: 0.0132721
[500]	training's l1: 0.0143568	valid_1's l1: 0.0132578
[550]	training's l1: 0.0143307	valid_1's l1: 0.0132427
[600]	training's l1: 0.0143101	valid_1's l1: 0.0132226
[650]	training's l1: 0.0142947	valid_1's l1: 0.0132127
[700]	training's l1: 0.0142772	valid_1's l1: 0.0132037
[750]	training's l1: 0.0142599	valid_1's l1: 0.0131965
[800]	training's l1: 0.0142469	valid_1's l1: 0.0131823
[850]	training's l1: 0.0142348	

[850]	training's l1: 0.014055	valid_1's l1: 0.0137546
Early stopping, best iteration is:
[825]	training's l1: 0.0140616	valid_1's l1: 0.0137519
fold  3
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0182748	valid_1's l1: 0.0187511
[100]	training's l1: 0.015445	valid_1's l1: 0.0163186
[150]	training's l1: 0.0147685	valid_1's l1: 0.0157058
[200]	training's l1: 0.0144409	valid_1's l1: 0.015426
[250]	training's l1: 0.0142004	valid_1's l1: 0.0152056
[300]	training's l1: 0.0140901	valid_1's l1: 0.0151295
[350]	training's l1: 0.0140164	valid_1's l1: 0.015082
[400]	training's l1: 0.0139639	valid_1's l1: 0.0150425
[450]	training's l1: 0.0139207	valid_1's l1: 0.015006
[500]	training's l1: 0.0138937	valid_1's l1: 0.0149924
[550]	training's l1: 0.0138729	valid_1's l1: 0.0149747
[600]	training's l1: 0.0138496	valid_1's l1: 0.0149543
[650]	training's l1: 0.0138327	valid_1's l1: 0.0149397
[700]	training's l1: 0.013816	valid_1's l1: 0.0149327
[750]	training's l1: 0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


分段3 训练集、测试集

In [21]:
train2019Mon6 = train2019[(train2019['Time_time'] >= '2019-6-1') & (train2019['Time_time'] <= '2019-6-30')]
train2019Mon5 = train2019[(train2019['Time_time'] >= '2019-5-1') & (train2019['Time_time'] <= '2019-5-30')]
Mon_6_5_2019 = train2019Mon6['pressure'].mean() - train2019Mon5['pressure'].mean()

train3 = train2020[(train2020['Time_time'] >= '2020-5-1') & (train2020['Time_time'] <= '2020-5-31')]
test3 = test[(test['Time_time'] >= '2020-6-1') & (test['Time_time'] <= '2020-6-14')]


used_feat = [f for f in train3.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

feat nums  3 ['MeasName', 'Hour', 'Day']


In [22]:
train_x = train3[used_feat]
train_y = train3['pressure']
test_x = test3[used_feat]
print(train_x.shape, test_x.shape)

# -----------------------------------------------
scores = []

params = {'learning_rate': 0.1, 
        'boosting_type': 'gbdt', 
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_child_samples': 46, 
        'min_child_weight': 0.01,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 2, 
        'num_leaves': 16, 
        'max_depth': 5, 
        'n_jobs': -1, 
        'seed': 2019, 
        'verbosity': -1, 
       }



oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997] 
for seed in seeds: 
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn, x_val, y_val = train_x.iloc[trn_idx], train_y.iloc[trn_idx], train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round=500000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=50,
                          verbose_eval=50)
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()
    
    mse = (mean_squared_error(oof_train, train3['pressure']))
    
    print('-'*120)
    print('mse ', round(mse, 5))


test3['pressure'] = preds + Mon_6_5_2019


(22320, 3) (10080, 3)
fold  1
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0188884	valid_1's l1: 0.0187197
[100]	training's l1: 0.015304	valid_1's l1: 0.0152618
[150]	training's l1: 0.0142656	valid_1's l1: 0.014327
[200]	training's l1: 0.0137474	valid_1's l1: 0.0138451
[250]	training's l1: 0.0134329	valid_1's l1: 0.0135748
[300]	training's l1: 0.013237	valid_1's l1: 0.0134216
[350]	training's l1: 0.0131166	valid_1's l1: 0.0133055
[400]	training's l1: 0.0130105	valid_1's l1: 0.0131886
[450]	training's l1: 0.0129464	valid_1's l1: 0.0131018
[500]	training's l1: 0.0129016	valid_1's l1: 0.0130665
[550]	training's l1: 0.0128681	valid_1's l1: 0.0130438
[600]	training's l1: 0.0128265	valid_1's l1: 0.0130029
[650]	training's l1: 0.0127987	valid_1's l1: 0.0129668
[700]	training's l1: 0.0127758	valid_1's l1: 0.0129468
[750]	training's l1: 0.0127505	valid_1's l1: 0.0129273
[800]	training's l1: 0.0127302	valid_1's l1: 0.0129033
[850]	training's l1: 0.0127154	va

[250]	training's l1: 0.0133886	valid_1's l1: 0.0134739
[300]	training's l1: 0.0132009	valid_1's l1: 0.0133139
[350]	training's l1: 0.0130728	valid_1's l1: 0.013208
[400]	training's l1: 0.0129879	valid_1's l1: 0.0131528
[450]	training's l1: 0.012923	valid_1's l1: 0.0130947
[500]	training's l1: 0.012873	valid_1's l1: 0.0130499
[550]	training's l1: 0.0128303	valid_1's l1: 0.0130127
[600]	training's l1: 0.0127978	valid_1's l1: 0.0129891
[650]	training's l1: 0.0127678	valid_1's l1: 0.0129739
[700]	training's l1: 0.0127452	valid_1's l1: 0.0129519
[750]	training's l1: 0.0127263	valid_1's l1: 0.0129389
[800]	training's l1: 0.0127053	valid_1's l1: 0.012923
[850]	training's l1: 0.0126926	valid_1's l1: 0.0129194
[900]	training's l1: 0.012682	valid_1's l1: 0.0129158
[950]	training's l1: 0.0126739	valid_1's l1: 0.0129158
Early stopping, best iteration is:
[912]	training's l1: 0.0126797	valid_1's l1: 0.0129123
fold  3
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


分段4 训练集、测试集

In [23]:
train2019Mon9 = train2019[(train2019['Time_time'] >= '2019-9-1') & (train2019['Time_time'] <= '2019-9-30')]
train2019Mon8 = train2019[(train2019['Time_time'] >= '2019-8-1') & (train2019['Time_time'] <= '2019-8-30')]
Mon_9_8_2019 = train2019Mon9['pressure'].mean() - train2019Mon8['pressure'].mean()


train4 = train2020[(train2020['Time_time'] >= '2020-8-1') & (train2020['Time_time'] <= '2020-8-31')]
test4 = test[(test['Time_time'] >= '2020-9-7') & (test['Time_time'] <= '2020-9-20')]

used_feat = [f for f in train4.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

feat nums  3 ['MeasName', 'Hour', 'Day']


In [24]:
train_x = train4[used_feat]
train_y = train4['pressure']
test_x = test4[used_feat]
print(train_x.shape, test_x.shape)

# -----------------------------------------------
scores = []

params = {'learning_rate': 0.1, 
        'boosting_type': 'gbdt', 
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_child_samples': 46, 
        'min_child_weight': 0.01,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 2, 
        'num_leaves': 16, 
        'max_depth': 5, 
        'n_jobs': -1, 
        'seed': 2019, 
        'verbosity': -1, 
       }



oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997] 
for seed in seeds: 
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn, x_val, y_val = train_x.iloc[trn_idx], train_y.iloc[trn_idx], train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round=500000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=50,
                          verbose_eval=50)
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()
    
    mse = (mean_squared_error(oof_train, train4['pressure']))
    
    print('-'*120)
    print('mse ', round(mse, 5))
    
test4['pressure'] = preds + Mon_9_8_2019

(22320, 3) (10080, 3)
fold  1
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0153139	valid_1's l1: 0.0152649
[100]	training's l1: 0.0128988	valid_1's l1: 0.0130196
[150]	training's l1: 0.0122528	valid_1's l1: 0.0124379
[200]	training's l1: 0.0119947	valid_1's l1: 0.0122026
[250]	training's l1: 0.0118477	valid_1's l1: 0.012072
[300]	training's l1: 0.0117407	valid_1's l1: 0.0119733
[350]	training's l1: 0.0116779	valid_1's l1: 0.0119222
[400]	training's l1: 0.0116208	valid_1's l1: 0.01187
[450]	training's l1: 0.0115897	valid_1's l1: 0.0118553
[500]	training's l1: 0.0115605	valid_1's l1: 0.0118433
[550]	training's l1: 0.011537	valid_1's l1: 0.0118216
[600]	training's l1: 0.0115184	valid_1's l1: 0.0118109
[650]	training's l1: 0.0115015	valid_1's l1: 0.011808
[700]	training's l1: 0.0114864	valid_1's l1: 0.0117956
[750]	training's l1: 0.0114753	valid_1's l1: 0.0117893
[800]	training's l1: 0.0114626	valid_1's l1: 0.0117855
[850]	training's l1: 0.0114533	vali

[850]	training's l1: 0.0114267	valid_1's l1: 0.0118271
[900]	training's l1: 0.0114207	valid_1's l1: 0.0118168
[950]	training's l1: 0.0114128	valid_1's l1: 0.0118222
Early stopping, best iteration is:
[919]	training's l1: 0.0114177	valid_1's l1: 0.0118146
fold  4
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.0159187	valid_1's l1: 0.0158127
[100]	training's l1: 0.0130334	valid_1's l1: 0.012961
[150]	training's l1: 0.0123057	valid_1's l1: 0.0123045
[200]	training's l1: 0.0120389	valid_1's l1: 0.0120522
[250]	training's l1: 0.0118855	valid_1's l1: 0.0119122
[300]	training's l1: 0.0117798	valid_1's l1: 0.0118384
[350]	training's l1: 0.0117015	valid_1's l1: 0.011794
[400]	training's l1: 0.0116494	valid_1's l1: 0.0117739
[450]	training's l1: 0.0116096	valid_1's l1: 0.0117469
[500]	training's l1: 0.0115725	valid_1's l1: 0.011727
[550]	training's l1: 0.011547	valid_1's l1: 0.0117216
[600]	training's l1: 0.011524	valid_1's l1: 0.011709
[650]	training's l1: 0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [25]:
test = pd.concat([test1, test2, test3, test4], axis = 0)

In [26]:
test[['id', 'pressure']].to_csv('../sub/submit.csv', index = False)
test.head()

Unnamed: 0,id,Time,MeasName,Hour,Time_time,Day,pressure
0,0,2020-02-03,4,0,2020-02-03,3,0.439729
1,1,2020-02-03,4,1,2020-02-03,3,0.452367
2,2,2020-02-03,4,2,2020-02-03,3,0.459496
3,3,2020-02-03,4,3,2020-02-03,3,0.464083
4,4,2020-02-03,4,4,2020-02-03,3,0.466106
