In [291]:
import os
import gc
import time
from math import ceil
import numpy as np
import pandas as pd
from nsepy import get_history
from datetime import date
import lightgbm as lgb

## Read historical data

In [312]:
def read_data(share_ticker, data_dir=None):
    '''
    Loads the saved price history of one share from `<data_dir>/<share_ticker>.csv`.

    Args:
    ----
    share_ticker: String. Ticker symbol; also the CSV file name without '.csv'.
    data_dir: String or None. Folder that holds the CSV. When None the
              module-level DATA_DIR is used, which is what this function
              always did before the parameter existed.

    Returns:
    -------
    df: Pandas DataFrame sorted by 'Date' (ascending) with a fresh 0..n-1 index.
        Column names are stripped and have inner spaces replaced by '_'.
    '''
    if data_dir is None:
        data_dir = DATA_DIR
    csv_path = os.path.join(data_dir, share_ticker + '.csv')
    try:
        # Current pandas spelling of "warn about malformed rows and skip them".
        df = pd.read_csv(csv_path, parse_dates=['Date'], on_bad_lines='warn')
    except TypeError:
        # Older pandas rejects `on_bad_lines`; fall back to the legacy flags.
        df = pd.read_csv(csv_path,
                         infer_datetime_format=True,
                         parse_dates=['Date'],
                         error_bad_lines=False,
                         warn_bad_lines=True,)
    cleaned_colnames = {col: col.strip().replace(' ', '_') for col in df.columns}
    df = df.rename(columns=cleaned_colnames)
    df = df.sort_values(by=['Date']).reset_index(drop=True)
    return df

In [308]:
def get_data(f_path, share_ticker, start=date(2015,9,5), end=date(2018,9,4)):
    '''
    Fetches price history for one share with nsepy's `get_history`, tidies it
    and writes it to `<f_path>/<share_ticker>.csv`.

    Args:
    ----
    f_path: String. Folder the CSV is written into.
    share_ticker: String. Symbol passed to `get_history`; also the CSV file name.
    start: datetime.date. First day requested.
    end: datetime.date. Last day requested.

    Returns:
    -------
    Pandas DataFrame without the 'Series' and 'Symbol' columns, sorted by
    'Date', with 'Date' converted through `pd.to_datetime`.
    '''
    # The index is moved into a regular column; the code below then expects a
    # 'Date' column to exist (presumably the former index - verify against nsepy).
    history = get_history(symbol=share_ticker, start=start, end=end).reset_index()
    tidy_names = {name: name.strip().replace(' ', '_') for name in history.columns}
    history = history.rename(columns=tidy_names)
    history = history.drop(['Series', 'Symbol'], axis=1)
    history = history.sort_values(by=['Date']).reset_index(drop=True)
    history['Date'] = pd.to_datetime(history['Date'])
    out_path = os.path.join(f_path, share_ticker + '.csv')
    history.to_csv(out_path, index=False)
    return history

In [309]:
# Download RTNPOWER history for the given date range and save it as a CSV.
get_data(
    f_path='C:\\Users\\abhiawa\\aa\\DS\\trading_model\\data',
    share_ticker='RTNPOWER',
    start=date(2015,9,19),
    end=date(2018,9,18),
)

Unnamed: 0,Date,Prev_Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable_Volume,%Deliverble
0,2015-09-21,6.20,6.20,6.40,6.10,6.35,6.30,6.26,1093418,6.847570e+11,558,793737,0.7259
1,2015-09-22,6.30,6.40,6.40,6.10,6.10,6.15,6.20,869204,5.387657e+11,617,717272,0.8252
2,2015-09-23,6.15,6.15,6.15,6.00,6.10,6.05,6.08,1292800,7.861447e+11,840,1013560,0.7840
3,2015-09-24,6.05,6.15,6.15,6.00,6.10,6.05,6.07,918611,5.572581e+11,1271,656759,0.7149
4,2015-09-28,6.05,6.05,6.15,5.95,6.00,6.00,6.03,835251,5.037548e+11,969,561205,0.6719
5,2015-09-29,6.00,6.10,6.10,5.95,6.00,6.00,5.99,1115405,6.684250e+11,516,775376,0.6952
6,2015-09-30,6.00,6.05,6.95,5.95,6.75,6.75,6.41,3374894,2.162800e+12,2270,2101841,0.6228
7,2015-10-01,6.75,7.00,7.05,6.55,6.65,6.60,6.77,1246985,8.438098e+11,1385,657053,0.5269
8,2015-10-05,6.60,6.70,6.85,6.50,6.70,6.70,6.70,957976,6.420299e+11,896,434249,0.4533
9,2015-10-06,6.70,6.70,6.80,6.60,6.75,6.70,6.70,532460,3.565090e+11,437,324768,0.6099


### New features

In [294]:
def generate_date_features(df, dateCol):
    '''
    Generates date based features from date column.

    The new columns (day, month, year, dayofweek, dayofyear, week,
    is_month_end, is_month_start, is_quarter_end, is_quarter_start, quarter)
    are written onto `df` itself, and the elapsed time is printed in minutes.

    Args:
    ----
    df: Pandas DataFrame.
    dateCol: Date Column Name in df. Must be a datetime column (`.dt` is used).

    Returns:
    -------
    df: The same Pandas DataFrame with date features added.
    '''
    t0 = time.time()
    dates = df[dateCol].dt
    df['day'] = dates.day
    df['month'] = dates.month
    df['year'] = dates.year
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    if hasattr(dates, 'isocalendar'):
        # `.dt.week` no longer exists in current pandas; isocalendar() is its
        # replacement. It yields a nullable integer column, so convert it back
        # to a plain numeric dtype (float only when missing dates force NaN).
        iso_week = dates.isocalendar().week
        if iso_week.isna().any():
            df['week'] = iso_week.astype('float64')
        else:
            df['week'] = iso_week.astype('int64')
    else:
        # Older pandas without isocalendar(): keep the original accessor.
        df['week'] = dates.week
    df['is_month_end'] = dates.is_month_end
    df['is_month_start'] = dates.is_month_start
    df['is_quarter_end'] = dates.is_quarter_end
    df['is_quarter_start'] = dates.is_quarter_start
    # Built-in accessor instead of a per-row ceil(month/3).
    df['quarter'] = dates.quarter
    print('Total time elapsed in making date features: ', (time.time()-t0)/60, 'minutes!')
    return df

In [295]:
def create_sales_ewm_feats(df, target_cols, alpha=[0.9], shift=[1]):
    '''
    Adds lagged exponentially weighted mean columns to `df`.

    One column is written for every (target column, alpha, shift) combination,
    named `<target>_lag_<shift>_ewm_<alpha>`.

    Args:
    ----
    df: Pandas DataFrame that receives the new columns (modified in place).
    target_cols: list of String. Columns the ewm features are based on.
    alpha: list. Smoothing factors passed to `ewm(alpha=...)`.
    shift: list of Int. Number of rows each target column is shifted by
           before the ewm is taken, so the current row's value is not used.

    Returns:
    -------
    df: The same Pandas DataFrame with ewm features added.
    '''
    combos = [(name, weight, lag)
              for name in target_cols
              for weight in alpha
              for lag in shift]
    for name, weight, lag in combos:
        feat_name = '_'.join((name, 'lag', str(lag), 'ewm', str(weight)))
        lagged = df[name].shift(lag)
        df[feat_name] = lagged.ewm(alpha=weight).mean().values
    return df

### EDA

In [296]:
def eda(df, share_ticker, amount=10000):
    '''
    Prints a simple buy-and-hold summary for one share.

    The percent change is measured from the 'Open' of the first row of `df`
    to the 'Close' of the last row. Both messages always say "3 years",
    whatever period `df` actually covers.

    Args:
    ----
    df: Pandas DataFrame with 'Open' and 'Close' columns, at least one row.
    share_ticker: String. Name shown in the printed messages.
    amount: Number. Amount hypothetically invested at the first open.

    Returns:
    -------
    None. The results are only printed.
    '''
    # Percent change in stock value over the whole frame
    start_value = df.iloc[0]['Open']
    end_value = df.iloc[-1]['Close']
    per_change = ((end_value-start_value)*100)/(start_value)
    print('Pecent change in share {} in 3 years is: {}'.format(share_ticker, per_change))
    # Value of the invested amount at the last close
    new_amount = amount + (per_change*amount*0.01)
    # Report the amount that was actually passed in, not a fixed 10000.
    print('An amount of {} invested in {} for 3 years would have become {} \n'.format(amount, share_ticker, new_amount))

### Validation

In [297]:
# For validation
# Rows from the chosen months of the chosen year (July-September 2018 by default)
# are held out as the validation set to gauge the performance of the model.

def create_validation(df, val_weeks, val_year=2018, val_months=(7, 8, 9)):
    '''
    Labels every row of `df` as 'train' or 'val' in a 'train_or_test' column.

    Args:
    ----
    df: Pandas DataFrame with 'year' and 'month' columns (modified in place).
    val_weeks: Not used by this function; kept so existing calls keep working.
    val_year: Int. Year whose rows can become validation rows.
    val_months: Iterable of Int. Months of `val_year` used for validation.

    Returns:
    -------
    df: The same Pandas DataFrame with 'train_or_test' filled in.
    '''
    masked_series = (df.year == val_year) & (df.month.isin(list(val_months)))

    df.loc[(masked_series), 'train_or_test'] = 'val'
    df.loc[~(masked_series), 'train_or_test'] = 'train'
    print('Train shape: {}'.format(df.loc[df.train_or_test=='train',:].shape))
    print('Validation shape: {}'.format(df.loc[df.train_or_test=='val',:].shape))
    return df

### Model data preparation

In [298]:
def create_val_train_arrays(df, cols, targetCol):
    '''
    Splits `df` into feature/target arrays using its 'train_or_test' column.

    Side effect: after the validation targets have been copied out, the
    `targetCol` values of the validation rows in `df` are overwritten with NaN.

    Args:
    ----
    df: Pandas DataFrame with a 'train_or_test' column holding 'train'/'val'.
    cols: list of String. Feature column names.
    targetCol: String. Target column name.

    Returns:
    -------
    Tuple (X_train, X_val, Y_train, Y_val) of numpy arrays.
    '''
    is_train = df.train_or_test == 'train'
    is_val = df.train_or_test == 'val'

    X_train = df.loc[is_train, cols].values
    Y_train = df.loc[is_train, targetCol].values.reshape(-1)
    Y_val = df.loc[is_val, targetCol].values.reshape(-1)

    # Blank the validation targets in the frame before reading X_val.
    df.loc[is_val, targetCol] = np.nan
    X_val = df.loc[is_val, cols].values
    return (X_train, X_val, Y_train, Y_val)

def create_lgb_datasets(X_train, X_val, Y_train, Y_val, cols):
    '''
    Wraps the train and validation arrays in LightGBM Dataset objects.

    Args:
    ----
    X_train, X_val: Feature arrays for the train and validation rows.
    Y_train, Y_val: Matching target arrays.
    cols: list of String. Feature names given to both datasets.

    Returns:
    -------
    Tuple (lgbtrain, lgbval); lgbval is created with `reference=lgbtrain`.
    '''
    train_set = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
    val_set = lgb.Dataset(
        data=X_val,
        label=Y_val,
        reference=train_set,
        feature_name=cols,
    )
    return (train_set, val_set)

In [299]:
def lgb_validation(params, lgbtrain, lgbval, X_val, Y_val, df, verbose_eval):
    '''
    Trains a LightGBM model with early stopping and scores the validation rows.

    Args:
    ----
    params: dict of LightGBM parameters. Must also contain 'num_boost_round'
            and 'early_stopping_rounds', which are read from it here.
    lgbtrain, lgbval: LightGBM Datasets; both are passed as `valid_sets`.
    X_val: Validation feature array used for prediction.
    Y_val: True validation targets.
    df: Pandas DataFrame whose 'val' rows supply the 'Date' column of the result.
        Assumes those rows are in the same order as X_val / Y_val - confirm at caller.
    verbose_eval: Passed straight to `lgb.train`.

    Returns:
    -------
    (model, val_df): the trained booster and a DataFrame with columns
    'Date', 'pred_Y_val', 'true_Y_val' and 'error' (prediction minus truth).
    '''
    t0 = time.time()
    evals_result = {}
    # NOTE(review): newer LightGBM releases expect early stopping, logging and
    # result recording to be supplied as callbacks - confirm against the
    # installed version before upgrading.
    model = lgb.train(params, lgbtrain, num_boost_round=params['num_boost_round'], valid_sets=[lgbtrain, lgbval],
                      early_stopping_rounds=params['early_stopping_rounds'], evals_result=evals_result, verbose_eval=verbose_eval)
    print(model.best_iteration)
    print('Total time taken to build the model: ', (time.time()-t0)/60, 'minutes!!')
    pred_Y_val = model.predict(X_val, num_iteration=model.best_iteration)
    # Start from the validation dates; the empty placeholder frame that used to
    # be created here was overwritten immediately and has been dropped.
    val_df = df.loc[df.train_or_test=='val', ['Date']].reset_index(drop=True)
    val_df['pred_Y_val'] = pred_Y_val
    val_df['true_Y_val'] = Y_val
    val_df['error'] = (pred_Y_val - Y_val)
    print(val_df.shape)
    print(val_df.head())
    return model, val_df

In [313]:
if __name__ == '__main__':
    DATA_DIR = 'C:\\Users\\abhiawa\\aa\\DS\\trading_model\\data'
    
    lgb_params ={'task':'train', 
             'boosting_type':'gbdt', 
             'objective':'regression', 
             'metric': {'rmse'},
             'num_leaves': 31, 
             'learning_rate': 0.05, 
             'feature_fraction': 0.8, 
             'verbose': 0, 
             'num_boost_round':5000, 
             'early_stopping_rounds':10, 
             'nthread':4,}
    
    val_weeks = [34, 35, 36]
    
    targetCol = 'High'
    cols_to_lag = ['Open', 'High', 'Low', 'Last', 'VWAP', 'Volume', 'Turnover', 'Trades', 
                   'Deliverable_Volume', '%Deliverble']
    avoid_cols = ['Date', 
                  'year', 
                  'train_or_test', 
                  'day', 
                  'month',
                  'dayofyear',
                  'is_month_end', 
                  'is_month_start', 
                  'is_quarter_end', 
                  'is_quarter_start', 
                  'quarter',
                  'Close',] + cols_to_lag
    avoid_cols.append(targetCol)
    
    share_list = [fname.split('.')[0] for fname in os.listdir(DATA_DIR)]+['IDEA','EICHERMOT']
    print('Shares are: ', share_list)
    #for share_ticker in share_list[4:5]:
    share_ticker='RTNPOWER'
    print('Share is: {} \n'.format(share_ticker))

    df = read_data(share_ticker)
    #df = get_data(share_ticker)
    df = generate_date_features(df, dateCol='Date')

    df = create_sales_ewm_feats(df, 
                                target_cols=cols_to_lag, 
                                alpha=[0.95], 
                                shift=[1])
    eda(df, share_ticker, amount=10000)

    df = create_validation(df, val_weeks)
    print(df.head(2), '\n')

    cols = [col for col in df.columns if col not in avoid_cols]
    #cols = ['Open_Price', 'dayofweek','Open_Price_lag_1_ewm_0.95', 'High_Price_lag_1_ewm_0.95']
    print('No of training features: {} \nAnd they are:{}'.format(len(cols), cols))

    X_train, X_val, Y_train, Y_val = create_val_train_arrays(df, cols, targetCol)
    lgbtrain, lgbval = create_lgb_datasets(X_train, X_val, Y_train, Y_val, cols)

    model, val_df = lgb_validation(lgb_params, lgbtrain, lgbval, X_val, Y_val, df, verbose_eval=10)
    print('\n\n')

Shares are:  ['BOSCHLTD', 'DIVISLAB', 'ICICIBANK', 'RAYMOND', 'RTNPOWER', 'SUNPHARMA', 'TCS', 'TECHM', 'IDEA', 'EICHERMOT']
Share is: RTNPOWER 

Total time elapsed in making date features:  0.0021667003631591795 minutes!
Pecent change in share RTNPOWER in 3 years is: -38.70967741935484
An amount of 10000 invested in RTNPOWER for 3 years would have become 6129.032258064515 

Train shape: (687, 35)
Validation shape: (54, 35)
        Date  Prev_Close  Open  High  Low  Last  Close  VWAP   Volume  \
0 2015-09-21         6.2   6.2   6.4  6.1  6.35   6.30  6.26  1093418   
1 2015-09-22         6.3   6.4   6.4  6.1  6.10   6.15  6.20   869204   

       Turnover      ...        High_lag_1_ewm_0.95  Low_lag_1_ewm_0.95  \
0  6.847570e+11      ...                        NaN                 NaN   
1  5.387657e+11      ...                        6.4                 6.1   

   Last_lag_1_ewm_0.95  VWAP_lag_1_ewm_0.95  Volume_lag_1_ewm_0.95  \
0                  NaN                  NaN              



Training until validation scores don't improve for 10 rounds.
[10]	training's rmse: 1.27848	valid_1's rmse: 2.7322
[20]	training's rmse: 0.789088	valid_1's rmse: 1.7335
[30]	training's rmse: 0.50664	valid_1's rmse: 1.13291
[40]	training's rmse: 0.349737	valid_1's rmse: 0.786012
[50]	training's rmse: 0.267242	valid_1's rmse: 0.594023
[60]	training's rmse: 0.226034	valid_1's rmse: 0.49234
[70]	training's rmse: 0.204328	valid_1's rmse: 0.438917
[80]	training's rmse: 0.190678	valid_1's rmse: 0.414862
[90]	training's rmse: 0.180484	valid_1's rmse: 0.402362
[100]	training's rmse: 0.171928	valid_1's rmse: 0.397286
[110]	training's rmse: 0.164059	valid_1's rmse: 0.393491
[120]	training's rmse: 0.156821	valid_1's rmse: 0.388442
[130]	training's rmse: 0.15091	valid_1's rmse: 0.384737
[140]	training's rmse: 0.14547	valid_1's rmse: 0.383761
[150]	training's rmse: 0.140818	valid_1's rmse: 0.381897
[160]	training's rmse: 0.135747	valid_1's rmse: 0.37957
[170]	training's rmse: 0.131384	valid_1's rmse

In [316]:
# Show the last 10 rows of the validation results returned by lgb_validation().
val_df.tail(10)

Unnamed: 0,Date,pred_Y_val,true_Y_val,error
44,2018-09-04,3.950183,4.2,-0.249817
45,2018-09-05,3.990091,4.1,-0.109909
46,2018-09-06,3.936197,4.1,-0.163803
47,2018-09-07,3.954731,4.0,-0.045269
48,2018-09-10,4.040392,3.9,0.140392
49,2018-09-11,4.008823,4.6,-0.591177
50,2018-09-12,4.416411,5.3,-0.883589
51,2018-09-14,5.050126,4.75,0.300126
52,2018-09-17,4.559913,4.2,0.359913
53,2018-09-18,3.9732,4.0,-0.0268


In [315]:
# Rank the features by their percentage share of total gain and print
# the first 25 rows of that ranking.
print("Features importance...")
gain = model.feature_importance('gain')
feat_imp = pd.DataFrame(dict(feature=model.feature_name(),
                             split=model.feature_importance('split'),
                             gain=100 * gain / gain.sum()))
feat_imp = feat_imp.sort_values('gain', ascending=False)
print('Top 25 features:\n', feat_imp.head(25))

Features importance...
Top 25 features:
                               feature  split       gain
0                          Prev_Close    360  48.531813
6                 Last_lag_1_ewm_0.95    505  31.968149
4                 High_lag_1_ewm_0.95    335  17.005415
7                 VWAP_lag_1_ewm_0.95    144   1.251203
5                  Low_lag_1_ewm_0.95    232   0.327743
2                                week    480   0.174841
3                 Open_lag_1_ewm_0.95    289   0.174538
10              Trades_lag_1_ewm_0.95    418   0.126810
12         %Deliverble_lag_1_ewm_0.95    410   0.107771
8               Volume_lag_1_ewm_0.95    296   0.098538
9             Turnover_lag_1_ewm_0.95    332   0.083192
11  Deliverable_Volume_lag_1_ewm_0.95    322   0.077195
1                           dayofweek    192   0.072793
