### Importing Essential Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the competition exposes under ../input.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import lightgbm as lgb
import pandas as pd
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import random
import keras
from datetime import datetime, date
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import time 

### Importing Training Data from the Kaggle Environment 

In [None]:
# Build the competition environment, then pull the market and news training data from it.
env = twosigmanews.make_env()
market_train, news_train = env.get_training_data()

### Null Value Replacement  

In [None]:
# Missing-value count per column, before imputation.
market_train.isna().sum()

In [None]:
# Fill the gaps in every int64/float64 column with that column's own mean;
# columns of any other dtype are left untouched.
numeric_cols = market_train.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    col_mean = market_train[col].mean()
    market_train[col] = market_train[col].fillna(col_mean)
        

In [None]:
# Missing-value count per column, after the numeric columns were mean-filled.
market_train.isna().sum()

### Label Encoding of Asset Codes 

In [None]:
# Reduce timestamps to calendar dates, then swap each asset code for an integer id
# assigned in order of first appearance in the training data.
market_train['time'] = market_train['time'].dt.date
distinct_codes = market_train['assetCode'].unique()
lbl = {code: idx for idx, code in enumerate(distinct_codes)}
market_train['assetCode'] = market_train['assetCode'].map(lbl)

In [None]:
# Discard every row that still holds a missing value in any column.
market_train = market_train.dropna(axis='index', how='any')

### Only data from 01/01/2009 onward is used; earlier rows are dropped before modelling

In [None]:
# Keep only the rows dated on or after 1 January 2009.
first_kept_day = date(2009, 1, 1)
market_train = market_train[market_train['time'] >= first_kept_day]

### Response Variable for our classification problem 
This binary variable is 1 when the 10-day market-residualised return is non-negative and 0 when it is negative; it is used to find
which stocks are most likely to make positive returns.

In [None]:
# 1 where the next 10-day market-residualised return is >= 0, otherwise 0.
up = market_train['returnsOpenNextMktres10'].ge(0).astype(int)

### Market Returns 

In [None]:
# Raw target returns as a plain array (same rows as `market_train`).
y = market_train['returnsOpenNextMktres10'].values

### Predictor Matrix 

In [None]:
# Market columns used as model inputs: volume, prices, and 1-day / 10-day
# raw and market-residualised returns.
num = [
    'volume',
    'close',
    'open',
    'returnsClosePrevRaw1',
    'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10',
    'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]


In [None]:
# Predictor matrix: all rows, only the columns listed in `num`.
X = market_train.loc[:, num]

In [None]:
# Peek at the first five predictor rows.
X.head(5)

In [None]:
# Predictors, raw returns and binary labels must all describe the same rows.
assert len(X) == len(y) == len(up)

### Scaling the Predictors 

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max, then rescale the predictors to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the held-out rows influence the scaling.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Wrap the scaled array in a DataFrame again (columns become 0..n-1).
X = pd.DataFrame(data=X)

In [None]:
# Number of predictor columns.
len(X.columns)

### Train Test Split 

In [None]:
# One seeded shuffle split (67 % train / 33 % test) applied identically to the
# predictors, the binary label and the raw returns.
(X_train, X_test,
 up_train, up_test,
 y_train, y_test) = model_selection.train_test_split(
    X, up, y,
    test_size=0.33,
    random_state=0,
)

### Artificial Neural Network Model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Start from an empty layer stack; the cells below append the layers one by one.
classifier = keras.models.Sequential()

In [None]:
# Input layer + first hidden layer: 1024 ReLU units fed by one input per predictor in `num`.
# Uses the Keras 2 keyword names (`units`, `kernel_initializer`); the Keras 1 spellings
# `output_dim` / `init` are deprecated and rejected by later releases.
classifier.add(Dense(units=1024, kernel_initializer='uniform', activation='relu', input_dim=len(num)))

In [None]:
# Second hidden layer: 512 ReLU units.
# Keras 2 keyword names (`units`, `kernel_initializer`) replace the deprecated `output_dim` / `init`.
classifier.add(Dense(units=512, kernel_initializer='uniform', activation='relu'))

In [None]:
# Third hidden layer: 256 ReLU units.
# Keras 2 keyword names (`units`, `kernel_initializer`) replace the deprecated `output_dim` / `init`.
classifier.add(Dense(units=256, kernel_initializer='uniform', activation='relu'))

In [None]:
# Fourth hidden layer: 128 ReLU units.
# Keras 2 keyword names (`units`, `kernel_initializer`) replace the deprecated `output_dim` / `init`.
classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))

In [None]:
# Output layer: a single sigmoid unit, i.e. one probability-like score per row.
# Keras 2 keyword names (`units`, `kernel_initializer`) replace the deprecated `output_dim` / `init`.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy reported per epoch.
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the binary `up` label for 15 passes over the training rows, 300 rows per batch.
# `epochs` is the Keras 2 name of the argument formerly spelled `nb_epoch`.
classifier.fit(X_train, up_train, batch_size=300, epochs=15)

### Light GBM Model 

In [None]:
import lightgbm as lgb
# LightGBM training set: scaled training predictors with the binary `up` label.
d_train = lgb.Dataset(data=X_train, label=up_train)

In [None]:
# Tuned values, in the order: learning_rate, num_leaves, min_data_in_leaf,
# num_iteration, max_bin.
x_1 = [0.19000424246380565, 2452, 212, 328, 202]
tuned_keys = ('learning_rate', 'num_leaves', 'min_data_in_leaf', 'num_iteration', 'max_bin')

# Fixed settings first, then the tuned values, then the verbosity flag.
params = {'task': 'train', 'boosting_type': 'dart', 'objective': 'binary'}
params.update(zip(tuned_keys, x_1))
params['verbose'] = 1

In [None]:
# Train the LightGBM booster. The round count is taken from params['num_iteration']:
# LightGBM lets that params entry override the num_boost_round argument, so the
# positional 100 that used to be passed here never took effect. Passing the same
# value explicitly keeps the call and the params dict in agreement.
clf = lgb.train(params, d_train, num_boost_round=params['num_iteration'])

### Learning Curve

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation scores against training-set size.

    The scores come from ``sklearn.model_selection.learning_curve``; the mean
    score per training size is drawn as a line and +/- one standard deviation
    across folds as a shaded band.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An unfitted scikit-learn style estimator; it is cloned for each
        validation run.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use scikit-learn's default number of folds,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, passed straight to ``learning_curve``
        (default: five fractions evenly spaced from 0.1 to 1.0).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the figure drawn on its
        current axes.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # `learning_curve` is reached through the already-imported `model_selection`
    # module; the bare name was never imported and raised NameError.
    train_sizes, train_scores, test_scores = model_selection.learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Scores have one row per training size and one column per fold.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation across folds.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


In [None]:
# The learning-curve routine clones and refits its estimator, so the already-trained
# Booster `clf` cannot be passed in. Build an unfitted scikit-learn style LightGBM
# classifier carrying the same hyper-parameters instead.
lc_estimator = lgb.LGBMClassifier(
    boosting_type=params['boosting_type'],
    objective=params['objective'],
    learning_rate=params['learning_rate'],
    num_leaves=params['num_leaves'],
    min_child_samples=params['min_data_in_leaf'],
    n_estimators=params['num_iteration'],
    max_bin=params['max_bin'],
)

# Score against the binary `up` label the models are trained on, not the raw returns.
# The title is now a quoted string and the stray trailing colon is gone (both were
# syntax errors). NOTE(review): with these hyper-parameters on the full data this
# cell is expensive — consider a row sample.
plot_learning_curve(lc_estimator, "Learning Curve", X, up, ylim=None, cv=None,
                    n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5));

### Making Predictions using Parallel Ensemble Technique 

In [None]:
# Per-day prediction batches from the competition environment; the loop below unpacks
# each item as (market_obs_df, news_obs_df, predictions_template_df).
days = env.get_prediction_days()  

In [None]:
import time

# For every prediction day: prepare the live market rows, average the LightGBM and
# Keras scores, rescale them to [-1, 1] and submit them through the environment.
# The three *_time counters accumulate wall-clock seconds spent in each stage.
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    print(n_days, end=' ')

    # ---- preparation ----
    t = time.time()

    # Same imputation rule as for training: numeric gaps get the column mean
    # (here the mean over the current day's rows).
    for columns in market_obs_df:
        if market_obs_df[columns].dtype == 'int64' or market_obs_df[columns].dtype == 'float64':
            market_obs_df[columns] = market_obs_df[columns].fillna(market_obs_df[columns].mean())

    # assetCode is left exactly as delivered. It is not one of the model inputs in `num`,
    # and mapping it to per-day integers before this filter (as was done previously) made
    # both the isin() check and the merge below compare integers with the template's codes.
    market_obs_df = market_obs_df[market_obs_df.assetCode.isin(predictions_template_df.assetCode)]

    # Scale with the scaler fitted on the training predictors so live features share the
    # scale the models were trained on. A per-day min/max rescale differs from day to day
    # and divides by zero whenever a column is constant within the day.
    X_live = scaler.transform(market_obs_df[num])
    prep_time += time.time() - t

    # ---- prediction ----
    t = time.time()
    # The Keras model ends in a single-unit layer, so its predictions come back as a
    # column of shape (n, 1). Flatten both outputs before averaging; otherwise
    # (n,) + (n, 1) broadcasts to an (n, n) matrix.
    lp1 = np.asarray(clf.predict(X_live)).ravel()
    lp2 = np.asarray(classifier.predict(X_live)).ravel()
    lp = (lp1 + lp2) / 2
    prediction_time += time.time() - t

    # ---- packaging ----
    t = time.time()
    # Min-max the averaged score into [-1, 1]. Non-finite scores are ignored when
    # finding the range (they end up as 0 via fillna below), and a day whose scores
    # are all equal gets a neutral 0 instead of a division by zero.
    finite = lp[np.isfinite(lp)]
    if finite.size and finite.max() > finite.min():
        confidence = (lp - finite.min()) / (finite.max() - finite.min()) * 2 - 1
    else:
        confidence = np.zeros_like(lp)

    preds = pd.DataFrame({'assetCode': market_obs_df['assetCode'], 'confidence': confidence})
    # Left-merge onto the template so every requested asset gets a row; assets without
    # a prediction fall back to a confidence of 0.
    predictions_template_df = (
        predictions_template_df
        .merge(preds, how='left', on='assetCode')
        .drop('confidenceValue', axis=1)
        .fillna(0)
        .rename(columns={'confidence': 'confidenceValue'})
    )
    env.predict(predictions_template_df)
    packaging_time += time.time() - t