1.1 Build LightGBM with GPU support + install dependencies.

In [None]:
!pip install numerapi catboost xgboost lightgbm catboost
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
!cmake -DUSE_GPU=1
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
!sudo python setup.py install --precompile

1.2 Import dependencies + Configure environment.

In [None]:
import numpy as np
import pandas as pd
import catboost as cb
import lightgbm as lgb
import numerapi, warnings
from sklearn import preprocessing
from sklearn.model_selection import KFold

# Numerai API client; only the verbosity is configured here.
napi = numerapi.NumerAPI(verbosity="info")
warnings.filterwarnings('ignore')

leaderboard = napi.get_leaderboard()

# Check if a new round has started.
try:
  if napi.check_new_round():
    print("Ready.")
  else:
    print("In progress.")
except Exception as err:
  # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
  # still propagate, and report why the check failed instead of hiding it.
  print("Not ready.")
  print("Round check failed: {!r}".format(err))

2.1 Import dataset + optimize memory usage.

In [None]:
# Where the public dataset archives live, and which two files to fetch.
host = 'numerai-public-datasets.s3-us-west-2.amazonaws.com'
filename_tr = 'latest_numerai_training_data.csv.xz'
filename_te = 'latest_numerai_tournament_data.csv.xz'

# Read the training and tournament CSVs directly from their HTTPS URLs.
df_tr, df_te = (
    pd.read_csv('https://{}/{}'.format(host, name))
    for name in (filename_tr, filename_te)
)

2.2 Most of the features are encoded as obfuscated floating point values at 0.25 increments. We can represent these step-wise values as integers by multiplying by 4.

In [None]:
def labelencode(col, df_train, df_test):
  """Replace the values of `col` with integer ids, in place, in both frames.

  One LabelEncoder is fitted on the train and test values of the column
  together and then used to overwrite `df_train[col]` and `df_test[col]`.
  Nothing is returned.
  """
  frames = (df_train, df_test)
  # Fit on the union of both frames so each value gets the same id in both.
  all_values = [value for frame in frames for value in frame[col].values]
  encoder = preprocessing.LabelEncoder().fit(all_values)
  for frame in frames:
    frame[col] = encoder.transform(frame[col])

def preprocess(df_orig):
  """Return a copy of `df_orig` with integer features and categorical ids.

  Every column other than 'era' and 'data_type' is multiplied by 4 and cast
  to int32; 'era' and 'data_type' are cast to the pandas 'category' dtype.
  The input frame itself is not modified.
  """
  categorical = ('era', 'data_type')
  out = df_orig.copy()

  # Upscale non-categorical columns to integer features.
  numeric = [name for name in out.columns if name not in categorical]
  out[numeric] = (out[numeric] * 4).astype(np.int32)

  for name in categorical:
    out[name] = out[name].astype('category')

  return out

# Encode labels (rewrites 'era' and 'data_type' in place in both frames).
labelencode('era', df_tr, df_te)
labelencode('data_type', df_tr, df_te)

# Define & scale targets: float target plus a x4 integer version.
y_tr = df_tr['target_kazutsugi']
y_tr_int = (y_tr * 4).astype(np.int32)

# Keep the tournament ids before the 'id' column is dropped below. The
# submission cells write the predictions into `df` and save it, but `df` was
# never defined, so they failed with a NameError.
df = df_te[['id']].copy()

# Remove non-feature columns from X-dataset.
df_tr.drop(['target_kazutsugi', 'id'], axis=1, inplace=True)
df_tr_processed = preprocess(df_tr)

df_te.drop(['target_kazutsugi', 'id'], axis=1, inplace=True)
df_te_processed = preprocess(df_te)

3.1 Train LightGBM KFold CV models + make predictions using Integer targets.

In [None]:
def scale(arr, minv, maxv):
  """Linearly map `arr` so its minimum lands on `minv` and its maximum on `maxv`."""
  observed = (np.min(arr), np.max(arr))
  target = (minv, maxv)
  return np.interp(arr, observed, target)

# Catboost settings.
params_c = dict(
    iterations=5000,
    task_type='GPU',
    eval_metric='RMSE',
    loss_function='RMSE',
    od_type='Iter',
    od_wait=100,
)

# LightGBM settings.
params_l = dict(
    objective='mse',
    boosting_type='gbrt',
    metric='mse',
    device_type='gpu',
    max_depth=10,
)

N_EPOCH = 10
N_FOLD = 10
FEATURE_DROPOUT = 0.03 # Reduce features each epoch.

# Both lists start as every column whose name begins with 'feature'.
feature_mask = df_tr_processed.columns.str.startswith('feature')
f_c = list(df_tr_processed.columns[feature_mask])
f_l = list(f_c)
# One averaged tournament prediction per epoch; the epochs are averaged below.
preds1 = []
for i in range(N_EPOCH):
  pred_l = np.zeros(len(df_te))
  # Feature importance summed over this epoch's folds, in f_l order.
  imp_epoch = np.zeros(len(f_l))
  # The feature set is fixed for the whole epoch, so slice the frames once.
  x_tr_epoch = df_tr_processed[f_l]
  x_te_epoch = df_te_processed[f_l]
  folds = KFold(n_splits=N_FOLD)
  for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_tr_epoch, y_tr_int)):
    # KFold yields row positions, so select positionally on both X and y.
    tr_x, tr_y = x_tr_epoch.iloc[trn_idx, :], y_tr_int.iloc[trn_idx]
    vl_x, vl_y = x_tr_epoch.iloc[val_idx, :], y_tr_int.iloc[val_idx]

    print('EPOCH {}/{} | LGBM FOLD {}/{}'.format(i+1, N_EPOCH, fold_+1, N_FOLD))
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)
    m_l = lgb.train(
        params_l,
        tr_data,
        valid_sets = [tr_data, vl_data],
        verbose_eval = 200,
    )

    # Accumulate this fold's share of the fold-averaged prediction.
    pred_l += m_l.predict(x_te_epoch)/N_FOLD
    imp_epoch += m_l.feature_importance()

  # The integer targets are the float targets times 4; undo that once per
  # epoch. Doing it inside the fold loop rescaled the running sum every fold,
  # shrinking earlier folds by 0.25 again and again.
  pred_l *= 0.25

  # Reduce LGBM features by sorted importance, once per epoch as
  # FEATURE_DROPOUT documents: drop the least important fraction.
  f_imp = pd.DataFrame(sorted(zip(imp_epoch, f_l)),
                       columns=['Value', 'Feature'])
  col_drop = int(len(f_imp) * FEATURE_DROPOUT)
  f_l = list(f_imp[col_drop:]['Feature'].values)

  # Store epoch prediction.
  preds1.append(pred_l)

# Average all predictions, then rescale onto [0, 1].
preds1 = np.mean(preds1, axis=0)
preds1 = scale(preds1, 0, 1)

3.2 Train CatBoost + LightGBM KFold CV ensemble models + make predictions with float targets.

In [None]:
pred_c = np.zeros(len(df_te))
pred_l = np.zeros(len(df_te))
# Seed the shuffle so the fold assignment is the same on every run.
folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=0)
# NOTE(review): df_tr/df_te still contain the label-encoded 'era' and
# 'data_type' columns, so both models use them as predictors - confirm this
# is intended.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_tr, y_tr)):
  print('FOLD {}/{}'.format(fold_+1, N_FOLD))
  # KFold yields row positions, so select positionally on both X and y.
  tr_x, tr_y = df_tr.iloc[trn_idx,:], y_tr.iloc[trn_idx]
  vl_x, vl_y = df_tr.iloc[val_idx,:], y_tr.iloc[val_idx]

  # CatBoost: fit with the validation fold as eval set, add this fold's share.
  m_c = cb.CatBoost(params_c)
  m_c.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)])
  pred_c += m_c.predict(df_te)/N_FOLD

  # LightGBM on the same split.
  tr_data = lgb.Dataset(tr_x, label=tr_y)
  vl_data = lgb.Dataset(vl_x, label=vl_y)
  m_l = lgb.train(
      params_l,
      tr_data,
      valid_sets = [tr_data, vl_data],
      verbose_eval = 200
  )
  pred_l += m_l.predict(df_te)/N_FOLD

# Average CB + LightGBM predictions, then rescale onto [0, 1].
preds2 = np.mean([pred_c, pred_l], axis=0)
preds2 = scale(preds2, 0, 1)

3.3 Average all ensemble predictions

In [None]:
# Equal-weight blend of the two ensembles, interpolated onto [0, 1].
preds = scale(0.5 * (preds1 + preds2), 0, 1)

# Construct submission dataframe.
# NOTE(review): `df` is not created in this cell; it must already exist and
# have one row per tournament prediction.
df['prediction_kazutsugi'] = preds
df.head()

4.0 Submit predictions \o/

In [None]:
# Write the submission file, upload it, and show the returned submission id.
submission_path = './predictions.csv'
df.to_csv(submission_path, index=False)
submission_id = napi.upload_predictions(submission_path)
print(submission_id)