## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
import time
from sklearn.metrics import mean_squared_error
import scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import pickle

In [None]:
## Render floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

## Import Data

In [None]:
## Load the training data from a memory-optimized pickle of the competition data.
## NOTE: unpickling executes arbitrary code, so only load pickles from a trusted source.
train_pickle_path = '../input/ubiquant-market-prediction-half-precision-pickle/train.pkl'
df_train = pd.read_pickle(train_pickle_path)

In [None]:
## Quick look at the training frame: shape, dtypes / memory usage, first rows.
display(df_train.shape)
# DataFrame.info() prints its report and returns None, so it must not be wrapped
# in display() (that would render a stray "None" under the report).
df_train.info()
df_train.head()

In [None]:
## Number of distinct investments
df_train['investment_id'].nunique()

In [None]:
## Number of rows per investment
df_train['investment_id'].value_counts()

In [None]:
## Histogram of the number of rows per investment
plt.figure(figsize=(10,8))
df_train['investment_id'].value_counts().plot.hist()

In [None]:
## Number of distinct time ids
df_train['time_id'].nunique()

In [None]:
## Summary statistics of the time_id column
df_train['time_id'].describe()

In [None]:
## Number of rows per time id
df_train['time_id'].value_counts()

In [None]:
## Histogram of the number of rows per time id
plt.figure(figsize=(10,8))
df_train['time_id'].value_counts().plot.hist()

In [None]:
## First (smallest) time id observed for each investment.
## Select the two key columns BEFORE sorting so that only a narrow frame is
## sorted instead of the full training frame with every feature column.
## After sorting by (investment_id, time_id), the first row kept per investment
## by drop_duplicates is the one with the smallest time_id.
first_time_id_by_investment = (
    df_train[['investment_id', 'time_id']]
    .sort_values(by=['investment_id', 'time_id'])
    .drop_duplicates('investment_id')[['time_id']]
)


In [None]:
## Which time id does each investment start from?
## (a minimum above 0 in the summary means some investments do not start at time 0)
print(len(first_time_id_by_investment))
first_time_id_by_investment['time_id'].describe()

In [None]:
## Summary statistics of the target column
df_train['target'].describe()

In [None]:
## Line plot of the target values in row order (all investments together)
plt.figure(figsize=(10,8))
df_train['target'].plot()

In [None]:
## Pull out all rows of a single investment for a closer look
sample_investment_id = 2140
is_sample_investment = df_train['investment_id'] == sample_investment_id
sample_df = df_train.loc[is_sample_investment]
print(sample_df.shape)
sample_df.head(10)

In [None]:
## Distinct time ids in the sample; compare with the row count printed above
## to see whether any time id is duplicated.
sample_df['time_id'].nunique()

In [None]:
## Target values of the sample investment, in row order
plt.figure(figsize=(10,8))
sample_df['target'].plot()

In [None]:
## 50 largest pairwise correlations between the columns of df_train.
## drop_duplicates() works on the correlation VALUES: it collapses the mirrored
## (a, b) / (b, a) entries and the diagonal of 1.0s into a single entry each.
(
    df_train.corr()
    .unstack()
    .sort_values(ascending=False)
    .drop_duplicates()
    .iloc[:50]
)


In [None]:
## Time-based train / validation split.

## Only rows with time_id >= this value are used at all
first_time_id_to_use = 500

## Rows with time_id below this value go to training, the rest to validation
time_id_to_split_train_and_val = 1000

## Model inputs: every column whose name starts with "f" (the anonymized features)
features_to_use = [col for col in df_train.columns if col.startswith("f")]

## Drop the early time ids
df_train = df_train.loc[df_train.time_id >= first_time_id_to_use]
print("df_train.shape: ",df_train.shape)

## Boolean row mask: True -> training row, False -> validation row
is_train_row = df_train['time_id'] < time_id_to_split_train_and_val

## Targets
y_train = df_train.loc[is_train_row, 'target']
y_val = df_train.loc[~is_train_row, 'target']

## Feature matrices (anonymized features only)
X_train = df_train.loc[is_train_row, features_to_use]
X_val = df_train.loc[~is_train_row, features_to_use]

print("X_train.shape:  ", X_train.shape)
print("X_val.shape:    ", X_val.shape)


In [None]:
## Free up memory: drop the name bound to the full training frame and ask the
## garbage collector to run. As the last expression of the cell, the return
## value of gc.collect() is shown as the cell output.
del df_train
gc.collect()

## Light GBM

LightGBM (Light Gradient Boosting Machine) is a free and open-source distributed gradient boosting framework for machine learning, originally developed by Microsoft.
It is a fast, distributed, high-performance gradient boosting framework based on decision tree algorithms.

A gradient-boosted decision tree (GBDT) model is an ensemble of decision trees.

It is based on three important principles:

1. Weak learners (decision trees).
2. Gradient Optimization.
3. Boosting Technique.

In the GBDT method we have many decision trees (weak learners). The trees are built sequentially:

- The first tree learns to fit the target variable.
- The second tree learns to fit the residual (difference) between the predictions of the first tree and the ground truth.
- The third tree learns to fit the residuals of the second tree, and so on.

Each new tree is trained on the gradient of the loss with respect to the current ensemble's predictions, which is how the errors of the earlier trees are propagated to the later ones.

In [None]:
## Wrap the train / validation matrices and their targets in LightGBM Datasets
dtrain = lgb.Dataset(data=X_train, label=y_train)
dval = lgb.Dataset(data=X_val, label=y_val)

In [None]:
## LightGBM training parameters
lgb_params = dict(
    objective='regression',   ## regression objective
    metric='MSE',             ## mean squared error, evaluated on the evaluation sets
    boosting_type='gbdt',     ## traditional Gradient Boosting Decision Tree
    lambda_l1=2.3e-05,        ## L1 regularization, combats overfitting
    lambda_l2=0.1,            ## L2 regularization, combats overfitting
    num_leaves=4,             ## maximum number of leaves each weak learner has
    feature_fraction=0.5,     ## fraction of features randomly selected for each tree
    bagging_fraction=0.9,     ## fraction of rows randomly selected, without resampling
    bagging_freq=7,           ## frequency for bagging
    min_child_samples=20,     ## minimum number of data points needed in a child (leaf) node
    num_iterations=1000,      ## number of boosting rounds (number of trees)
)

In [None]:
## Train on all anonymized features and time the run.

# dict that the record_evaluation callback fills with the logged metric values
metric_over_time = {}

training_callbacks = [
    lgb.early_stopping(100),                  # early stopping with stopping_rounds=100
    lgb.log_evaluation(100),                  # log the metric with period=100
    lgb.record_evaluation(metric_over_time),  # keep the metric history for plotting
]

start_time = time.time()

model = lgb.train(
    lgb_params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'val'],
    callbacks=training_callbacks,
)

execution_time = time.time() - start_time
print(f"\nTraining time: {round(execution_time, 3)}s")

In [None]:
## Validation error of the model trained on all features.
y_val_hat = model.predict(X_val)

# using MSE as a proxy for pearson corellation (https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302181)
# The `squared=` keyword of mean_squared_error is deprecated and removed in
# recent scikit-learn releases, so compute the MSE once and take its square
# root for the RMSE. This works on old and new versions alike.
mse = mean_squared_error(y_val, y_val_hat)
print("MSE:  ", mse)
print("RMSE: ", np.sqrt(mse))

In [None]:
# Correlation between predicted and actual targets on the validation set.
# This is one Pearson coefficient pooled over every validation row.
# NOTE(review): confirm whether the competition metric is pooled like this or
# computed per time_id before averaging.
corr, p_value = pearsonr(y_val_hat, y_val)
print("Pearson Correlation Coeficient Validation Data: ", corr)

In [None]:
## Plot the metric (MSE) history that the record_evaluation callback stored in
## metric_over_time during training.
lgb.plot_metric(metric_over_time, figsize=(10,5))
plt.show()

In [None]:
# Look at which features LightGBM deems important, ranked by gain.
# The figure is tall (10x40) because up to 300 features are listed.
lgb.plot_importance(model, figsize=(10,40), importance_type='gain', max_num_features=300) # importance_type is 'gain' or 'split' (original note: "V7 has 'split'" - presumably an earlier notebook version; confirm)
plt.show()

In [None]:
# Build a table of gain importance per feature, largest first. It is used
# afterwards to find the features with little importance.
gain_per_feature = model.feature_importance(importance_type='gain')
imp = (
    pd.DataFrame({'Value': gain_per_feature, 'Feature': X_train.columns})
    .sort_values(by="Value", ascending=False)
    .reset_index(drop=True)
)
print(imp.head())
print("")
print("Value Distribution")
print(imp.Value.describe())


In [None]:
# Keep only the features whose gain importance is above the threshold.
# (The previous comment said 100 while the code used 500; the code's value is kept.)
min_gain_to_keep = 500
imp = imp[imp.Value > min_gain_to_keep]
new_feature_list = list(imp.Feature)
# Fail here with a clear message rather than later inside LightGBM if the
# threshold filtered out every feature.
assert new_feature_list, "gain threshold removed every feature; lower min_gain_to_keep"
print("Number of features, new: ", len(new_feature_list))

In [None]:
# save model to disk, it will take up approx. 263kB
filename = 'finalized_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Free up memory before retraining: drop the first model, its validation
# predictions and both LightGBM datasets. `imp` is not deleted here.
del model,y_val_hat,dtrain,dval

In [None]:
## Retrain with the reduced feature set: rebuild both LightGBM datasets using
## only the columns listed in new_feature_list.
dtrain = lgb.Dataset(data=X_train[new_feature_list], label=y_train)
dval = lgb.Dataset(data=X_val[new_feature_list], label=y_val)


In [None]:

## Train again with the same parameters on the reduced datasets and time the run.
## The same metric_over_time dict is handed to record_evaluation again, so the
## history of the first run is presumably replaced (confirm if both are needed).
training_callbacks = [
    lgb.early_stopping(100),
    lgb.log_evaluation(100),
    lgb.record_evaluation(metric_over_time),
]

start_time = time.time()

model = lgb.train(
    lgb_params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'val'],
    callbacks=training_callbacks,
)

execution_time = time.time() - start_time
print(f"\nTraining time: {round(execution_time, 3)}s")

In [None]:
## Validation error of the model trained on the reduced feature set.
y_val_hat = model.predict(X_val[new_feature_list])

# using MSE as a proxy for pearson corellation (https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302181)
# The `squared=` keyword of mean_squared_error is deprecated and removed in
# recent scikit-learn releases, so compute the MSE once and take its square
# root for the RMSE. This works on old and new versions alike.
mse = mean_squared_error(y_val, y_val_hat)
print("MSE:  ", mse)
print("RMSE: ", np.sqrt(mse))

In [None]:
# Pearson correlation between predictions and targets, pooled over all
# validation rows, for the reduced-feature model.
corr, p_value = pearsonr(y_val_hat, y_val)
print("Pearson Correlation Coeficient Validation Data: ", corr)

In [None]:
# save model to disk, it will take up approx. 205kB
filename = 'finalized_model_reduced.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Submit

In [None]:
def preprocess(df, features):
    """Return only the model input columns of ``df``.

    Args:
        df: frame to select from (indexed as ``df[features]``).
        features: column label(s) to keep, in the order they should appear.

    Returns:
        The result of ``df[features]``; ``df`` itself is left unchanged.
    """
    return df[features]
    
def make_predictions(model, df):
    """Run ``model`` on ``df`` and return its predictions.

    Args:
        model: any object exposing ``predict(data)``.
        df: the data passed unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(df)`` returns.
    """
    return model.predict(df)

In [None]:
# Submission loop: iterate over the batches served by the competition
# environment, predict each batch with the reduced-feature model, and hand
# the filled-in frame back through env.predict().
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # keep only the columns in new_feature_list (the reduced feature set the model is predicted with)
    test_df = preprocess(test_df, new_feature_list) 
    sample_prediction_df['target'] = make_predictions(model, test_df)  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions