In [65]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import KFold

import lightgbm as lgb
from lightgbm import LGBMRegressor
import lightgbm

from contextlib import contextmanager
import time
import gc

import random
# Seed Python's `random` module and NumPy's global RNG with one shared value.
# The KFold splitter inside train() is seeded separately via its own random_state.
seed = 10
random.seed(seed)
np.random.seed(seed)

In [66]:
def read_df(path='../input/Train.csv'):
  """Read the training CSV into a DataFrame and print its shape.

  Args:
    path: location of the CSV file. Defaults to the path this notebook has
      always used, so existing `read_df()` calls behave as before.

  Returns:
    The DataFrame produced by `pd.read_csv(path)`.
  """
  df_train = pd.read_csv(path)
  print("Train shape: {}".format(df_train.shape))
  return df_train
  
# Load the training table into the module-level `df_train`; read_df() prints its shape.
df_train = read_df()

Train shape: (28049, 15)


In [67]:
def train_data_cleaning(df):
  """Overwrite two hard-coded outlier row ranges of the weather columns.

  For every weather column, the rows of each range are first set to NaN and
  then rebuilt, in ascending row order, from the rows one and two periods
  before and after (a period is 288 rows):

  * if the value one period ahead is NaN, a weighted mean of the values two
    periods ahead (weight 11), one period back (8) and two periods back (7);
  * otherwise the plain mean of the four neighbours.

  Rows are processed in order on purpose: the first range is longer than one
  period, so later rows read values that were filled earlier in the same pass.

  All access is positional (`iloc` / `iat`) on the frame itself. The previous
  chained form `df[c][...] = value` triggers SettingWithCopyWarning and does
  not write through to `df` when pandas Copy-on-Write is enabled.

  Args:
    df: frame containing the six weather columns; it must extend at least two
      periods beyond the last outlier row. Modified in place.

  Returns:
    The same `df` object.
  """
  # NOTE(review): 288 rows is presumably one day at 5-minute sampling — confirm.
  period = 288
  outlier_ranges = [(4384, 4770), (19469, 19739)]
  columns = ['Air temperature (C)', 'Air humidity (%)', 'Pressure (KPa)',
             'Wind speed (Km/h)', 'Wind gust (Km/h)', 'Wind direction (Deg)']
  for c in columns:
    col = df.columns.get_loc(c)
    for start, end in outlier_ranges:
      df.iloc[start:end, col] = np.nan
      for i in range(start, end):
        ahead_1 = df.iat[i + period, col]
        ahead_2 = df.iat[i + 2 * period, col]
        back_1 = df.iat[i - period, col]
        back_2 = df.iat[i - 2 * period, col]
        if np.isnan(ahead_1):
          # The next-period value is itself inside the blanked range and not
          # rebuilt yet, so fall back to the three remaining neighbours.
          df.iat[i, col] = (11 * ahead_2 + 8 * back_1 + 7 * back_2) / 26
        else:
          df.iat[i, col] = (ahead_1 + ahead_2 + back_1 + back_2) / 4

  return df

In [68]:
def test_data_forecasting(df):
  columns = ['Air temperature (C)', 'Air humidity (%)', 'Pressure (KPa)',
             'Wind speed (Km/h)', 'Wind gust (Km/h)', 'Wind direction (Deg)',             
            ]
  for (start_v, end_v, end_n) in [(0, 8914, 10067), (10067, 16083, 17236), (17236, 26301, len(df))]:
    for c in columns:
      for p in range(end_v, end_n, 2 * 288):
        pred = df[c].iloc[p - 2 *288: p]
        df[c][p: min(p + 2 * 288, end_n)] = pred[0: min(2 * 288, end_n - p)]
  
  return df

In [69]:
def feature_engineering(df):
  """Append the model features and the regression target to `df`.

  Columns added, in this order:

  * 'D Air temperature (C)' and 'D Pressure (KPa)': row-to-row change of the
    corresponding weather column, multiplied by 'Irrigation field' (so it is
    zero wherever that column is 0).
  * 'M Irrigation field': 'Irrigation field' times its own rolling sum over
    the last 24 rows (NaN until 24 rows are available).
  * 'Velocity': row-to-row change of 'Soil humidity'; this is the target.

  The frame is modified in place and also returned.
  """
  irrigation = df['Irrigation field']
  air_temperature = df['Air temperature (C)']
  pressure = df['Pressure (KPa)']
  soil_humidity = df['Soil humidity']

  # environment deltas, scaled by the irrigation column
  df['D Air temperature (C)'] = irrigation * (air_temperature - air_temperature.shift(1))
  df['D Pressure (KPa)'] = irrigation * (pressure - pressure.shift(1))

  # control feature: irrigation weighted by its recent rolling total
  recent_irrigation = irrigation.rolling(window=24).sum()
  df['M Irrigation field'] = irrigation * recent_irrigation

  # target: change in soil humidity since the previous row
  df['Velocity'] = soil_humidity - soil_humidity.shift(1)

  return df


In [70]:
# Feature columns fed to the regressors: the raw irrigation column plus the
# three derived columns that feature_engineering() adds.
input_columns = [
    'Irrigation field',
    'M Irrigation field',
    'D Pressure (KPa)',
    'D Air temperature (C)',
]

In [71]:
def train(train_df, input_columns, target_column):
  """Fit one LightGBM regressor per fold of a shuffled 5-fold split.

  Rows whose target is null are dropped first. Each model is trained on four
  folds and early-stopped on the remaining fold (150 rounds of patience),
  logging the evaluation metric every 250 iterations.

  Early stopping and logging are passed as callbacks. The `verbose=` and
  `early_stopping_rounds=` keyword arguments of `fit()` were removed in
  LightGBM 4.0, so the previous call raises TypeError on current releases.

  Args:
    train_df: frame holding the feature columns and the target column.
    input_columns: list of feature column names.
    target_column: name of the column to regress on.

  Returns:
    A list with the five fitted `LGBMRegressor` models, in fold order.
  """
  train_df = train_df[train_df[target_column].notnull()].copy()

  folds = KFold(n_splits=5, shuffle=True, random_state=123)

  # `log_evaluation` exists from LightGBM 3.3; older releases call the same
  # callback `print_evaluation`.
  make_log_callback = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

  regs = []
  train_x = train_df[input_columns]
  train_y = train_df[target_column]
  for trn_idx, val_idx in folds.split(train_x, train_y):
    trn_x, trn_y = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    # NOTE(review): per the LightGBM docs, `subsample` only takes effect when
    # `subsample_freq` > 0, which is not set here — confirm whether row
    # subsampling was intended.
    reg = LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.5,
        num_leaves=123,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=15,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2
    )
    reg.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            callbacks=[lgb.early_stopping(stopping_rounds=150),
                       make_log_callback(period=250)],
           )
    regs.append(reg)
  return regs
  

In [72]:
def predict(regs, test_df, first_state, field_id, columns=None):
  """Fill the missing soil-humidity rows of `test_df` from predicted deltas.

  The regressors predict 'Velocity' (the row-to-row change in soil humidity)
  for every row; the fold predictions are averaged. Missing values are then
  rebuilt between consecutive known readings:

  * forward path: start from the previous known value (or `first_state`
    before the first known row) and add the predicted deltas;
  * backward path: start from the next known value and subtract them;
  * a row with 'Irrigation field' == 0 takes the forward value; any other row
    takes a linear blend of both paths, weighted by its position between the
    last zero-irrigation row of the gap and the next known row.

  Rows after the last known reading use the forward path only.

  Changes from the previous implementation:

  * values are written to a NumPy buffer and stored back once, instead of
    chained `test_df['Values'].iloc[i] = ...`, which does not write through
    under pandas Copy-on-Write;
  * a gap with a single missing row and irrigation on made the blend weight
    0/0 (ZeroDivisionError); that row now takes the forward value;
  * known rows are located by position, matching the positional indexing
    used everywhere else in the function.

  Args:
    regs: non-empty sequence of fitted models exposing `predict(X)`.
    test_df: frame with 'timestamp', 'Soil humidity', 'Irrigation field' and
      the feature columns. Not modified.
    first_state: soil humidity of the row just before `test_df`.
    field_id: zero-based field number; the ID string uses `field_id + 1`.
    columns: feature column names passed to the models. Defaults to the
      module-level `input_columns`.

  Returns:
    DataFrame with columns 'ID' and 'Values', one row per row of `test_df`.

  Raises:
    ValueError: if `regs` is empty.
  """
  if len(regs) == 0:
    raise ValueError("predict() needs at least one fitted regressor")

  feature_columns = input_columns if columns is None else columns
  test_df = test_df.rename(columns={'Soil humidity': 'Values'})
  features = test_df[feature_columns]
  velocity = np.mean([reg.predict(features) for reg in regs], axis=0)
  test_df['Velocity'] = velocity

  values = test_df['Values'].to_numpy(dtype=float, copy=True)
  irrigation = test_df['Irrigation field'].to_numpy()
  # Positions of known readings; the leading -1 stands for "just before the
  # frame", where `first_state` is used.
  indices = [-1] + [int(k) for k in np.flatnonzero(~np.isnan(values))]

  def forward_process(start, end):
    """Values for rows start+1 .. end-1, accumulated from the left anchor."""
    state = first_state if start == -1 else values[start]
    path = []
    for j in range(start + 1, end):
      state = state + velocity[j]
      path.append(state)
    return path

  def backward_process(start, end):
    """Values for rows start+1 .. end-1, unwound from the right anchor."""
    state = values[end]
    path = []
    for j in range(end, start + 1, -1):
      state = state - velocity[j]
      path.append(state)
    return path[::-1]

  for start, end in zip(indices[:-1], indices[1:]):
    f_preds = forward_process(start, end)
    b_preds = backward_process(start, end)
    j = start + 1
    for i in range(start + 1, end):
      k = i - start - 1
      if irrigation[i] == 0:
        values[i] = f_preds[k]
        j = i
      else:
        span = end - j - 1
        # span == 0 only for a one-row gap; the numerator is 0 there as well,
        # so the forward value is used.
        weight = (i - j) / span if span > 0 else 0.0
        values[i] = (1 - weight) * f_preds[k] + weight * b_preds[k]

  last_known = indices[-1]
  f_preds = forward_process(last_known, len(test_df))
  if len(f_preds) > 0:
    values[last_known + 1:] = f_preds

  test_df['Values'] = values
  test_df['ID'] = test_df['timestamp'] + ' x Soil humidity ' + str(field_id + 1)
  return test_df[['ID', 'Values']]


In [73]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    The message is `"<title> - done in <seconds>s"`, with the elapsed
    wall-clock time rounded to whole seconds. It is printed only when the
    block finishes without raising.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

# Re-read the raw table so this cell starts from unmodified data.
with timer("Importing Datasets: "):
    df_train = read_df()
    gc.collect()

# Repair the outlier ranges first, then overwrite each segment's tail of the
# weather columns with earlier rows.
with timer("Time Series Imputation: "):
    df_train = test_data_forecasting(train_data_cleaning(df_train))
    gc.collect()
    
# Weather columns shared by every field.
env_columns = ['Air temperature (C)', 'Air humidity (%)', 'Pressure (KPa)',
               'Wind speed (Km/h)', 'Wind gust (Km/h)', 'Wind direction (Deg)',]


def _single_field_frame(field_no):
    """Return timestamp, weather and one field's columns under generic names.

    The field's 'Soil humidity <n>' and 'Irrigation field <n>' columns are
    renamed to 'Soil humidity' and 'Irrigation field', so the same feature and
    model code can run on every field.
    """
    soil_column = 'Soil humidity {}'.format(field_no)
    irrigation_column = 'Irrigation field {}'.format(field_no)
    selected = df_train[['timestamp', soil_column, irrigation_column, *env_columns]]
    return selected.rename(columns={
        soil_column: 'Soil humidity', irrigation_column: 'Irrigation field'})


# One helper call per field replaces four copy-pasted select/rename blocks.
df_train_1 = _single_field_frame(1)
df_train_2 = _single_field_frame(2)
df_train_3 = _single_field_frame(3)
df_train_4 = _single_field_frame(4)

# Per-field (start, end) row positions: rows before `start` are used for
# training, rows start..end-1 are the ones to predict.
limits = [(8914, 10067), (26301,28048), (16083, 17236), (26301,28030)]

# Run feature engineering, training and prediction once per field, then stack
# the per-field predictions into a single submission file.
field_frames = [df_train_1, df_train_2, df_train_3, df_train_4]

preds = []
for i, (df_train_i, bounds) in enumerate(zip(field_frames, limits)):
    start, end = bounds
    with timer("Feature Engineering: "):
        # Features are computed on the full frame before it is split into a
        # training part and a part to predict.
        df_train_i = feature_engineering(df_train_i).set_index('timestamp')
        df_test_i = df_train_i.iloc[start: end].reset_index()
        df_train_i = df_train_i.iloc[:start]
        # Missing irrigation values in the rows to predict are treated as 0.
        df_test_i['Irrigation field'] = df_test_i['Irrigation field'].fillna(value=0)
        gc.collect()
    with timer("Training"):
        regs = train(df_train_i, input_columns, 'Velocity')
        gc.collect()
    with timer("Testing"):
        # Last soil-humidity value of the training part; predict() uses it
        # before the first known row.
        first_state = df_train_i['Soil humidity'].iloc[start - 1]
        prediction = predict(regs, df_test_i, first_state, i)
        preds.append(prediction)
        gc.collect()

preds = pd.concat(preds, ignore_index=True)
preds.to_csv("submission.csv", index=False)

Train shape: (28049, 15)
Importing Datasets:  - done in 0s
Time Series Imputation:  - done in 7s
Feature Engineering:  - done in 0s
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[6]	training's l2: 0.00261316	valid_1's l2: 0.00360497
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[4]	training's l2: 0.0028067	valid_1's l2: 0.0029202
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[6]	training's l2: 0.00261658	valid_1's l2: 0.00357532
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[3]	training's l2: 0.00292515	valid_1's l2: 0.00297632
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[5]	training's l2: 0.00282247	valid_1's l2: 0.00261363
Training - done in 1s
Testing - done in 1s
Feature Engineering:  - done in 0s
Training until validation s