In [1]:
import warnings
warnings.filterwarnings('ignore')

import gc

import numpy as np
import pandas as pd
from tqdm import tqdm

from tools import *

# Features

In [None]:
# Columns carrying the `B_` prefix.
s_features = [
    'B_HYC_NH4',
    'B_HYC_XD',
    'B_HYC_MLSS',
    'B_HYC_JS_DO',
    'B_HYC_DO',
    'B_CS_MQ_SSLL',
    'B_QY_ORP',
]

# `N_`-prefixed counterparts: same suffixes, same order as `s_features`.
n_features = [
    'N_HYC_NH4',
    'N_HYC_XD',
    'N_HYC_MLSS',
    'N_HYC_JS_DO',
    'N_HYC_DO',
    'N_CS_MQ_SSLL',
    'N_QY_ORP',
]

# Remaining un-prefixed measurement columns (JS_* / CS_* pairs plus MCCS_*).
base_features = [
    'JS_NH3', 'CS_NH3',
    'JS_TN', 'CS_TN',
    'JS_LL', 'CS_LL',
    'MCCS_NH4', 'MCCS_NO3',
    'JS_COD', 'CS_COD',
    'JS_SW', 'CS_SW',
]

# Pre-processing

## load data

In [2]:
# Read the training CSV and the public evaluation CSV into two frames.
train_data, test_data = map(
    pd.read_csv,
    (r'./data/train_dataset.csv', r'./data/evaluation_public.csv'),
)
# Show both shapes as the cell output.
train_data.shape, test_data.shape

((140480, 29), (10000, 27))

In [3]:
# Stack train and evaluation rows into one frame; `istest` marks the origin
# (0 = train file, 1 = evaluation file) so they can be separated again later.
data = pd.concat(
    [train_data.assign(istest=0), test_data.assign(istest=1)],
    axis=0,
    ignore_index=True,
)

# The separate frames are no longer needed; release their memory.
del train_data, test_data
gc.collect()

data.shape

(150480, 30)

In [4]:
# Print per-column dtype and non-null count of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150480 entries, 0 to 150479
Data columns (total 30 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   time          150480 non-null  object 
 1   JS_NH3        123955 non-null  float64
 2   CS_NH3        123432 non-null  float64
 3   JS_TN         123955 non-null  float64
 4   CS_TN         123532 non-null  float64
 5   JS_LL         130737 non-null  float64
 6   CS_LL         130970 non-null  float64
 7   MCCS_NH4      130974 non-null  float64
 8   MCCS_NO3      130974 non-null  float64
 9   JS_COD        123955 non-null  float64
 10  CS_COD        130534 non-null  float64
 11  JS_SW         122342 non-null  float64
 12  CS_SW         122585 non-null  float64
 13  B_HYC_NH4     130975 non-null  float64
 14  B_HYC_XD      130975 non-null  float64
 15  B_HYC_MLSS    130975 non-null  float64
 16  B_HYC_JS_DO   130975 non-null  float64
 17  B_HYC_DO      130975 non-null  float64
 18  B_CS

## Extract time features

In [5]:
# Parse the raw timestamp column once; every feature below derives from it.
data['time'] = pd.to_datetime(data['time'])

# Plain calendar components.
data['month'] = data['time'].dt.month
data['minute'] = data['time'].dt.minute

data['day'] = data['time'].dt.day
data['hour'] = data['time'].dt.hour
data['dayofweek'] = data['time'].dt.dayofweek

# Minute of the hour rounded down to its 10-minute bucket (0, 10, ..., 50).
data['minute10'] = (data['minute'] // 10) * 10

# Turn the hour into a continuous, increasing counter that starts at 0.
# It is derived from the full timestamp rather than `day * 24 + hour`:
# the day of month restarts at 1 on every month boundary, which would make
# the counter jump backwards whenever the data spans more than one month.
data['hourl'] = (
    data['time'].dt.floor('h') - data['time'].min().floor('h')
) // pd.Timedelta(hours=1)

# Same idea at 10-minute resolution: minutes elapsed since the first
# 10-minute bucket, again continuous across day and month boundaries.
data['minute10l'] = (
    data['time'].dt.floor('10min') - data['time'].min().floor('10min')
) // pd.Timedelta(minutes=1)

# Fixed-width, zero-padded key (YYYYmmddHHMM), so string order equals
# chronological order. `time` is already datetime; no re-parsing is needed.
data['ts'] = data['time'].dt.strftime("%Y%m%d%H%M")

# Stable sort: rows that share the same minute keep their relative order,
# which makes the row order (and every positional feature built on it)
# reproducible between runs.
data = data.sort_values(by='ts', kind='mergesort')

In [8]:
# Display all column names after the time features have been added.
data.columns.values

array(['time', 'JS_NH3', 'CS_NH3', 'JS_TN', 'CS_TN', 'JS_LL', 'CS_LL',
       'MCCS_NH4', 'MCCS_NO3', 'JS_COD', 'CS_COD', 'JS_SW', 'CS_SW',
       'B_HYC_NH4', 'B_HYC_XD', 'B_HYC_MLSS', 'B_HYC_JS_DO', 'B_HYC_DO',
       'B_CS_MQ_SSLL', 'B_QY_ORP', 'N_HYC_NH4', 'N_HYC_XD', 'N_HYC_MLSS',
       'N_HYC_JS_DO', 'N_HYC_DO', 'N_CS_MQ_SSLL', 'N_QY_ORP', 'Label1',
       'Label2', 'istest', 'month', 'minute', 'day', 'hour', 'dayofweek',
       'minute10', 'hourl', 'minute10l', 'ts'], dtype=object)

## Fillna

In [10]:
# Fill gaps in every feature column with an order-3 spline.
#
# pandas hands the *index values* to SciPy as the x-axis for spline
# interpolation. After the frame was sorted by `ts`, its index labels are a
# permutation of the original row numbers and are not guaranteed to be
# increasing. Interpolate on a fresh 0..n-1 positional index instead (rows
# are already in time order) and write the result back by position, so the
# frame's own index is left untouched.
for feat in tqdm(s_features + n_features + base_features):
    data[feat] = (
        data[feat]
        .reset_index(drop=True)
        .interpolate(method='spline', order=3)
        .to_numpy()
    )

  8%|▊         | 2/26 [00:00<00:04,  5.26it/s]

In [None]:
# Cache the gap-filled frame on disk so the interpolation above does not have to be re-run.
data.to_pickle(r'./data/data_fillna.pickle')

# Load pre-processed data

In [None]:
# Reload the cached, gap-filled frame written by `data.to_pickle` earlier in this notebook.
# NOTE(review): unpickling can execute arbitrary code — only load a file this notebook produced.
data = pd.read_pickle(r'./data/data_fillna.pickle')

# Roll features

In [None]:
# Columns that receive rolling-window features. Windows are counted in rows,
# so they rely on `data` being in time order.
roll_cols = base_features + s_features + n_features

# Short windows (1-4 rows): first difference of the rolling mean.
# The mean (not the sum) is used so the values match the `_mean_diff` suffix
# and are computed the same way as the 8- and 16-row windows below.
# For window 1 the mean and the sum are identical.
for i in range(1, 5):
    data[[ii+f'_roll_{i}_mean_diff' for ii in roll_cols]] = (
        data[roll_cols].rolling(i, min_periods=1).mean().diff()
    )

# Longer windows: keep the rolling mean itself ...
data[[ii+'_roll_8_mean' for ii in roll_cols]] = data[roll_cols].rolling(8, min_periods=1).mean()
data[[ii+'_roll_16_mean' for ii in roll_cols]] = data[roll_cols].rolling(16, min_periods=1).mean()

# ... and its row-to-row change. The right-hand frame's columns are assigned
# to the new names by position, so both lists must stay in `roll_cols` order.
data[[ii+'_roll_16_mean_diff' for ii in roll_cols]] = data[[ii+'_roll_16_mean' for ii in roll_cols]].diff()
data[[ii+'_roll_8_mean_diff' for ii in roll_cols]] = data[[ii+'_roll_8_mean' for ii in roll_cols]].diff()

# Rolling standard deviation over 8 rows; the first row is NaN because a
# single observation has no sample standard deviation.
data[[ii+'_roll_8_std' for ii in roll_cols]] = data[roll_cols].rolling(8, min_periods=1).std()