In [None]:
# Mount Google Drive at /content/drive; every data and model path used below
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the TA-Lib 0.4.0 C library from source and install it under /usr,
# then install the Python wrapper that binds to it.
# NOTE: `%cd ta-lib` changes the kernel's working directory for the rest of the session.
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
# NOTE(review): the wrapper version is not pinned, so reruns may pick up a different release.
!pip install Ta-Lib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# The bare 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and
# removed in 3.8; use whichever name this matplotlib installation provides.
mpl.style.use('seaborn' if 'seaborn' in mpl.style.available else 'seaborn-v0_8')
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import tensorflow as tf
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')
import time

from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
import xgboost as xgb
from tqdm import tqdm
import talib as ta
import functools  

### 1. Load, Clean, Create Features

In [None]:
def load_data(supplement : bool, data_dir : str = '/content/drive/My Drive/Colab Notebooks/data') :
    """
    Read the raw competition tables from disk.

    Args:
        supplement (bool): when truthy, the tables found under
            ``supplemental_files`` are appended after the ``train_files`` ones.
        data_dir (str): directory that contains ``train_files``,
            ``supplemental_files`` and ``stock_list.csv``. Defaults to the
            Colab Drive location originally hard-coded here.
    Returns:
        tuple of pd.DataFrame: (stock_prices, secondary_stock_prices, options, stock_list)
    """
    def _read_group(folder : str) :
        # parse_dates=True only asks pandas to parse the *index*, so the Date
        # columns are still plain strings here; they are converted later on.
        return [
            pd.read_csv(f'{data_dir}/{folder}/{table}.csv', parse_dates=True)
            for table in ('stock_prices', 'secondary_stock_prices', 'options')
        ]

    stock_prices, secondary_stock_prices, options = _read_group('train_files')
    stock_list = pd.read_csv(f'{data_dir}/stock_list.csv')

    if supplement :
        supp_prices, supp_secondary, supp_options = _read_group('supplemental_files')
        # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
        # result (rows stacked, original index labels kept).
        stock_prices = pd.concat([stock_prices, supp_prices])
        secondary_stock_prices = pd.concat([secondary_stock_prices, supp_secondary])
        options = pd.concat([options, supp_options])

    return stock_prices, secondary_stock_prices, options, stock_list

def merge_stock_list(df : pd.DataFrame) :
    """
    Attach the '33SectorCode' column (the company's sector) to a price table.

    Only that single column is taken from the module-level `stock_list`; it is
    joined on 'SecuritiesCode' with a left join and then cast to int.

    Args:
        df (pd.DataFrame): table containing a 'SecuritiesCode' column
    Returns:
        pd.DataFrame: a new frame with the extra integer '33SectorCode' column
    """
    sector_lookup = stock_list[['SecuritiesCode','33SectorCode']]
    merged = df.merge(sector_lookup, on='SecuritiesCode', how='left')
    merged['33SectorCode'] = merged['33SectorCode'].astype(int)
    return merged

def merge_stock_list_secondary(df : pd.DataFrame) :
    """
    Attach the '33SectorCode' column (the company's sector) to the secondary price table.

    Same as `merge_stock_list`, except that rows whose sector is the
    placeholder string '-' are removed before the cast to int.

    Args:
        df (pd.DataFrame): table containing a 'SecuritiesCode' column
    Returns:
        pd.DataFrame: a new frame with the extra integer '33SectorCode' column
    """
    sector_lookup = stock_list[['SecuritiesCode','33SectorCode']]
    merged = df.merge(sector_lookup, on='SecuritiesCode', how='left')

    # Rows without a usable sector carry '-' instead of a numeric code.
    placeholder_rows = merged.index[merged['33SectorCode'] == '-']
    merged = merged.drop(placeholder_rows)

    merged['33SectorCode'] = merged['33SectorCode'].astype(int)
    return merged

def preprocess_prices(df : pd.DataFrame) :
    """
    Clean a raw price table in place and return it.

    Steps: cast the categorical columns, encode 'SupervisionFlag' as 0/1,
    drop 'RowId', fill missing 'ExpectedDividend' with 0, drop every row that
    still contains a NaN, parse 'Date' and add its integer form 'DateInt'
    (YYYYMMDD).

    Args:
        df (pd.DataFrame): raw price table; it is modified in place
    Returns:
        pd.DataFrame: the same object that was passed in
    """
    # Categorical columns
    for categorical_column in ('SecuritiesCode', 'AdjustmentFactor') :
        df[categorical_column] = df[categorical_column].astype('category')
    df['SupervisionFlag'] = df['SupervisionFlag'].map({True: 1, False: 0})

    # RowId carries no information for modelling
    df.drop(columns=['RowId'], inplace=True)

    # Missing values: a missing dividend means "no dividend"; the rows still
    # incomplete afterwards (original note: about 8200 rows with NaN-only
    # open/high/low/close values) are discarded.
    df['ExpectedDividend'] = df['ExpectedDividend'].fillna(0)
    df.dropna(inplace=True)

    # Dates as timestamps plus an integer YYYYMMDD copy
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    df['DateInt'] = parsed_dates.dt.strftime("%Y%m%d").astype(int)

    return df

def preprocess_prices_submission(df : pd.DataFrame) :
    """
    Clean a price table in place for submission and return it.

    Performs the same casts, 'RowId' removal, dividend fill and date handling
    as `preprocess_prices`, but keeps rows that contain NaN values.

    Args:
        df (pd.DataFrame): raw price table; it is modified in place
    Returns:
        pd.DataFrame: the same object that was passed in
    """
    # Categorical columns and the 0/1 supervision flag
    df['SecuritiesCode'] = df['SecuritiesCode'].astype('category')
    df['AdjustmentFactor'] = df['AdjustmentFactor'].astype('category')
    flag_encoding = {True: 1, False: 0}
    df['SupervisionFlag'] = df['SupervisionFlag'].map(flag_encoding)

    # RowId carries no information for modelling
    df.drop(columns=['RowId'], inplace=True)

    # A missing dividend means "no dividend"
    df['ExpectedDividend'] = df['ExpectedDividend'].fillna(0)

    # Dates as timestamps plus an integer YYYYMMDD copy
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    df['DateInt'] = parsed_dates.dt.strftime("%Y%m%d").astype(int)

    return df

def fillna_prices_submission(df : pd.DataFrame) :
    """
    Sort a submission price table by 'SecuritiesCode' and back-fill its NaN values.

    Each missing value is replaced by the next non-missing value found further
    down the same column in the sorted table; trailing NaNs stay NaN.
    NOTE(review): the fill is not grouped by security, so a gap can be filled
    from the row of the following security code - confirm this is intended.

    Args:
        df (pd.DataFrame): price table containing a 'SecuritiesCode' column
    Returns:
        pd.DataFrame: a new, sorted and back-filled frame (input left untouched)
    """
    # `.bfill()` replaces `fillna(method='backfill')`, which pandas has deprecated.
    df = df.sort_values(by=['SecuritiesCode']).bfill()
    return df

def add_secondary_market_features (df_prices : pd.DataFrame, df_secondary : pd.DataFrame) :
    """
    Add 'VolumeSector' to the primary price table: the total traded volume per
    (Date, 33SectorCode), summed over primary and secondary securities.

    Args:
        df_prices (pd.DataFrame): preprocessed primary-market prices
        df_secondary (pd.DataFrame): preprocessed secondary-market prices
    Returns:
        pd.DataFrame: df_prices with '33SectorCode' and 'VolumeSector' columns
    """
    df_prices = merge_stock_list(df_prices)
    df_secondary = merge_stock_list_secondary(df_secondary)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two tables.
    date_sector_volume = (
        pd.concat([df_prices, df_secondary])
        .groupby(by=['Date','33SectorCode'])['Volume'].sum()
        .reset_index()
        .rename(columns = {'Volume' : 'VolumeSector'})
    )

    # Left join: keep exactly the rows (and row order) of df_prices. An outer
    # join would add all-NaN rows for any (Date, sector) pair that only exists
    # in the secondary market and would re-sort the table by the join keys.
    df_prices = df_prices.merge(date_sector_volume, on=['Date', '33SectorCode'], how='left')
    return df_prices

def add_options_features(df_prices : pd.DataFrame, df_options : pd.DataFrame) :
    """
    Add two market-wide daily columns derived from the options table.

    'put/call' is the day's total 'TradingVolume' for Putcall == 1 divided by
    the total for Putcall == 2; 'Volatility' is the day's mean
    'ImpliedVolatility'. Both are left-joined onto df_prices by 'Date'.

    Args:
        df_prices (pd.DataFrame): price table with a datetime 'Date' column
        df_options (pd.DataFrame): options table; its 'Date' column is
            converted to datetime in place
    Returns:
        pd.DataFrame: df_prices with the 'put/call' and 'Volatility' columns
    """
    df_options['Date'] = pd.to_datetime(df_options['Date'])

    # One row per day, one volume column per option side
    volume_by_side = df_options.groupby(['Date','Putcall'])['TradingVolume'].sum().unstack()
    daily_options = volume_by_side.rename(columns={1:'Put',2:'Call'})

    daily_options['put/call'] = daily_options['Put'] / daily_options['Call']
    daily_options['Volatility'] = df_options.groupby('Date')['ImpliedVolatility'].mean()

    return df_prices.merge(daily_options[['put/call', 'Volatility']], how='left', on='Date')

In [None]:
# Load the raw tables; load_data only tests its argument for truthiness, so
# passing 1 also appends the supplemental files.
df_prices, df_secondary, df_options, stock_list = load_data(1)
# Clean both price tables (casts, NaN handling, date parsing).
df_prices = preprocess_prices(df_prices)
df_secondary = preprocess_prices(df_secondary)
# Attach the per-sector volume and the daily options-derived columns.
df_prices = add_secondary_market_features (df_prices, df_secondary)
df_prices = add_options_features(df_prices, df_options)

In [None]:
def get_talib_features(df):
    """
    Get technical features from TA-Lib

    Reads the 'Open', 'High', 'Low', 'Close' and 'Volume' columns of `df` and
    writes one new column per enabled indicator directly into `df` (the frame
    is modified in place and also returned). The commented-out lines are
    indicators that are currently disabled.

    Args:
        df (pd.DataFrame): price rows - presumably those of a single security
            in chronological order; verify against the caller
    Returns:
        pd.DataFrame: the same frame with the indicator columns added
    """
    op = df['Open']  # only referenced by the disabled BOP indicator below
    hi = df['High']
    lo = df['Low']
    cl = df['Close']
    vo = df['Volume']
    
    # Overlap Studies
    df['BBANDS_upper'], df['BBANDS_middle'], df['BBANDS_lower'] = ta.BBANDS(cl, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
    df['DEMA'] = ta.DEMA(cl, timeperiod=30)
    df['EMA_30'] = ta.EMA(cl, timeperiod=30)
    df['EMA_20'] = ta.EMA(cl, timeperiod=20)
    df['EMA_10'] = ta.EMA(cl, timeperiod=10)
    df['HT_TRENDLINE'] = ta.HT_TRENDLINE(cl)
    # df['KAMA'] = ta.KAMA(cl, timeperiod=30)
    # df['MA_50'] = ta.MA(cl, timeperiod=50, matype=0)
    # df['MA_30'] = ta.MA(cl, timeperiod=30, matype=0)
    # df['MA_15'] = ta.MA(cl, timeperiod=15, matype=0)
    # df['MIDPOINT'] = ta.MIDPOINT(cl, timeperiod=14)
    # df['SAR'] = ta.SAR(hi, lo, acceleration=0, maximum=0)
    # df['SAREXT'] = ta.SAREXT(hi, lo, startvalue=0, offsetonreverse=0, accelerationinitlong=0, accelerationlong=0, accelerationmaxlong=0, accelerationinitshort=0, accelerationshort=0, accelerationmaxshort=0)
    # df['SMA'] = ta.SMA(cl, timeperiod=30)
    # df['T3'] = ta.T3(df['Close'], timeperiod=5, vfactor=0)
    # The TEMA calls read df['Close'] directly; it is the same column as `cl`.
    df['TEMA_50'] = ta.TEMA(df['Close'], timeperiod=50)
    df['TEMA_30'] = ta.TEMA(df['Close'], timeperiod=30)
    df['TEMA_15'] = ta.TEMA(df['Close'], timeperiod=15)
    df['TEMA_10'] = ta.TEMA(df['Close'], timeperiod=10)
    # df['TRIMA'] = ta.TRIMA(df['Close'], timeperiod=30)
    # df['WMA'] = ta.WMA(df['Close'], timeperiod=30)
    
    # Momentum Indicators
    df['ADX'] = ta.ADX(hi, lo, cl, timeperiod=14)
    df['ADXR'] = ta.ADXR(hi, lo, cl, timeperiod=14)
    df['APO'] = ta.APO(cl, fastperiod=12, slowperiod=26, matype=0)
    # df['AROON_down'], df['AROON_up'] = ta.AROON(hi, lo, timeperiod=14)
    # df['AROONOSC'] = ta.AROONOSC(hi, lo, timeperiod=14)
    # df['BOP'] = ta.BOP(op, hi, lo, cl)
    # df['CCI'] = ta.CCI(hi, lo, cl, timeperiod=14)
    # df['DX'] = ta.DX(hi, lo, cl, timeperiod=14)
    df['MACD_macd'], df['MACD_macdsignal'], df['MACD_macdhist'] = ta.MACD(cl, fastperiod=12, slowperiod=26, signalperiod=9)
    # df['MFI'] = ta.MFI(hi, lo, cl, vo, timeperiod=14)
    # df['MINUS_DI'] = ta.MINUS_DI(hi, lo, cl, timeperiod=14)
    # df['MINUS_DM'] = ta.MINUS_DM(hi, lo, timeperiod=14)
    # df['MOM'] = ta.MOM(cl, timeperiod=10)
    # df['PLUS_DI'] = ta.PLUS_DI(hi, lo, cl, timeperiod=14)
    # df['PLUS_DM'] = ta.PLUS_DM(hi, lo, timeperiod=14)
    df['RSI'] = ta.RSI(cl, timeperiod=14)
    # df['STOCH_slowk'], df['STOCH_slowd'] = ta.STOCH(hi, lo, cl, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    # df['STOCHF_fastk'], df['STOCHF_fastd'] = ta.STOCHF(hi, lo, cl, fastk_period=5, fastd_period=3, fastd_matype=0)
    # df['STOCHRSI_fastk'], df['STOCHRSI_fastd'] = ta.STOCHRSI(cl, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
    # df['TRIX'] = ta.TRIX(cl, timeperiod=30)
    # df['ULTOSC'] = ta.ULTOSC(hi, lo, cl, timeperiod1=7, timeperiod2=14, timeperiod3=28)
    # df['WILLR'] = ta.WILLR(hi, lo, cl, timeperiod=14)
    
    # Volume Indicators
    df['AD'] = ta.AD(hi, lo, cl, vo)
    df['ADOSC'] = ta.ADOSC(hi, lo, cl, vo, fastperiod=3, slowperiod=10)
    df['OBV'] = ta.OBV(cl, vo)
    
    # Volatility Indicators
    df['ATR'] = ta.ATR(hi, lo, cl, timeperiod=14)
    df['NATR'] = ta.NATR(hi, lo, cl, timeperiod=14)
    df['TRANGE'] = ta.TRANGE(hi, lo, cl)
    
    # Cycle Indicators
    # df['HT_DCPERIOD'] = ta.HT_DCPERIOD(cl)
    # df['HT_DCPHASE'] = ta.HT_DCPHASE(cl)
    # df['HT_PHASOR_inphase'], df['HT_PHASOR_quadrature'] = ta.HT_PHASOR(cl)
    # df['HT_SINE_sine'], df['HT_SINE_leadsine'] = ta.HT_SINE(cl)
    # df['HT_TRENDMODE'] = ta.HT_TRENDMODE(cl)
    
    # Statistic Functions
    # df['BETA'] = ta.BETA(hi, lo, timeperiod=5)
    # df['CORREL'] = ta.CORREL(hi, lo, timeperiod=30)
    # df['LINEARREG'] = ta.LINEARREG(cl, timeperiod=14) - cl
    # df['LINEARREG_ANGLE'] = ta.LINEARREG_ANGLE(cl, timeperiod=14)
    # df['LINEARREG_INTERCEPT'] = ta.LINEARREG_INTERCEPT(cl, timeperiod=14) - cl
    # df['LINEARREG_SLOPE'] = ta.LINEARREG_SLOPE(cl, timeperiod=14)
    # df['STDDEV'] = ta.STDDEV(cl, timeperiod=5, nbdev=1)   
    
    return df

def add_features_security(df : pd.DataFrame, security : int):
    """
    Build the technical-indicator features for one security.

    Args:
        df (pd.DataFrame)  : price table containing every security
        security (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): a copy of that security's rows with
        the TA-Lib indicator columns added and no NaN / inf values left
    """
    # Work on a copy: get_talib_features writes its columns in place.
    data_security = df[df.SecuritiesCode == security].copy()

    # Adds the TA-Lib indicator columns enabled in get_talib_features
    data_security = get_talib_features(data_security)

    # Forward-fill gaps first (`.ffill()` replaces the deprecated
    # `fillna(method='ffill')`), then zero whatever is still NaN - notably the
    # warm-up rows at the start of each indicator - together with +/-inf.
    data_security = data_security.ffill()
    data_security = data_security.replace([np.inf, -np.inf, np.nan], 0)

    return data_security

def add_features_dataframe(df : pd.DataFrame) :
    """
    Compute the technical features security by security and stack the results.

    Args:
        df (pd.DataFrame): price table containing every security
    Returns:
        pd.DataFrame: concatenation of the per-security feature frames, in
        ascending 'SecuritiesCode' order
    """
    per_security_frames = [
        add_features_security(df, security_code)
        for security_code in tqdm(sorted(df["SecuritiesCode"].unique()))
    ]
    return pd.concat(per_security_frames)

def drop_null_values_lagged(df) :
  """
  Trim the leading rows of every security for which some feature is still zero.

  For each security the rows are re-indexed from 0; for every column listed in
  the module-level `features_nodate` the first row holding a non-zero value is
  located, and all rows before the latest of those positions are discarded.

  Args:
      df (pd.DataFrame): feature table containing every security
  Returns:
      pd.DataFrame: concatenation of the trimmed per-security frames
  """
  trimmed_frames = []
  for code in tqdm(sorted(df["SecuritiesCode"].unique())):
    per_security = df[df.SecuritiesCode == code].copy().reset_index(drop=True)
    # Position of the first non-zero value per column, then the latest of them.
    first_complete_row = per_security[features_nodate].ne(0).idxmax().max()
    trimmed_frames.append(per_security.iloc[first_complete_row:])

  return pd.concat(trimmed_frames)

In [None]:
# Add the TA-Lib indicator columns, one security at a time.
df_prices = add_features_dataframe(df_prices)

100%|██████████| 2000/2000 [00:39<00:00, 50.45it/s]


### 2. Training the model

In [None]:
# features_training = [
#     # OLHCV
#     "Open",
#     "High",
#     "Low",
#     "Close",
#     "Volume",
#     # New
#     "put/call",
#     "Volatility",
#     "VolumeSector",
#     # Overlap Studies
#     "BBANDS_upper",
#     "BBANDS_lower",
#     "EMA_10",
#     "EMA_20",
#     "EMA_30",
#     #"SMA",
#     #"TEMA_15",
#     #"TEMA_30",
#     #"HT_TRENDLINE",
#     # Momentum Indicators
#     "MACD_macd",
#     "RSI",
#     "APO",
#     "ADX",
#     # Volume Indicators
#     "AD",
#     "OBV",
#     # Volatility Indicators
#     "ATR",
#     # Cycle Indicators
#     #"HT_TRENDMODE",
#     "Date",
#     "Target"
# ]

In [None]:
# Columns kept for training. The order matters: "Target" must stay last,
# because windowed_dataset takes the last column of each window as the label
# and create_big_dataset reads the Target range from the last scaler slot.
features_training = [
    "SecuritiesCode",
    # OLHCV
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    #"AdjustmentFactor",
    #"ExpectedDividend",
    # New
    # Overlap Studies
    "BBANDS_upper",
    "BBANDS_lower",
    "EMA_20",
    "TEMA_15",
    "TEMA_30",
    "TEMA_50",
    "HT_TRENDLINE",
    # Momentum Indicators
    "MACD_macd",
    "RSI",
    "APO",
    "ADX",
    # Volume Indicators
    "AD",
    "OBV",
    # Volatility Indicators
    "NATR",
    # Market-wide columns (scaled with one global scaler in create_big_dataset)
    "put/call",
    "VolumeSector",
    ######
    "Date",
    "Target"
]

In [None]:
# Same columns without "Date" (used when looking for leading zero rows).
features_nodate = features_training.copy()
features_nodate.remove("Date")

In [None]:
# Model inputs plus label: neither "Date" nor "SecuritiesCode".
features_nodate_nocode = features_nodate.copy()
features_nodate_nocode.remove("SecuritiesCode")

In [None]:
# Remove, per security, the leading rows where some feature is still zero.
df_prices = drop_null_values_lagged(df_prices)

100%|██████████| 2000/2000 [00:22<00:00, 87.11it/s] 


In [None]:
# Distinct security codes, in order of first appearance in df_prices.
securities = df_prices.SecuritiesCode.astype(int).unique()

In [None]:
# Adam with default settings; in this notebook it is only referenced by the
# commented-out model.compile(...) call in the training cell.
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Save all "global" variables within the G class (G stands for global)
# NOTE: none of the attributes below has a type annotation, so @dataclass
# creates no fields for them; they act as plain class-level constants (G.NAME).
@dataclass
class G:
    TRAIN_END = "2022-04-30"       # rows with Date <= this go to the training split
    TEST_START = "2022-05-01"      # rows with Date >= this go to the test split
    WINDOW_SIZE = 30               # number of consecutive rows fed to the model per sample
    BATCH_SIZE = 32                # batch size of the tf.data pipelines
    SHUFFLE_BUFFER_SIZE = 1000     # buffer size passed to Dataset.shuffle

In [None]:
# Columns scaled per security. The two market-wide columns are excluded
# because create_big_dataset scales them with a single scaler fitted on the
# whole table. "Target" stays in the list, so it is scaled per security too.
features_to_scale = features_nodate_nocode.copy()
features_to_scale.remove("put/call")
features_to_scale.remove("VolumeSector")
# features_to_scale.remove("Target")

In [None]:
def windowed_dataset(series, window_size=G.WINDOW_SIZE, batch_size=G.BATCH_SIZE, shuffle_buffer=G.SHUFFLE_BUFFER_SIZE, shuffle=True):
    """
    Turn a 2-D table of rows into batched (features, label) training samples.

    Each sample is built from `window_size + 1` consecutive rows: the features
    are the first `window_size` rows without their last column, the label is
    the last column of the final row.

    Args:
        series: 2-D array-like (rows x columns) whose last column is the label
        window_size (int): number of rows in the feature window
        batch_size (int): number of samples per batch
        shuffle_buffer (int): buffer size used when shuffling
        shuffle (bool): shuffle the samples (default, as for training). Pass
            False when the output order must follow the input rows, e.g. to
            line predictions up with dates.
    Returns:
        tf.data.Dataset yielding (features, label) batches
    """
    # Create dataset from the series
    dataset = tf.data.Dataset.from_tensor_slices(series)

    # Slice the dataset into overlapping windows; incomplete windows are dropped
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)

    # Flatten each window (itself a dataset) into a single tensor
    dataset = dataset.flat_map(lambda window : window.batch(window_size + 1))

    # Shuffle to reduce bias - only when requested, since shuffling destroys
    # the correspondence between sample order and row order
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer)

    # Split it into the features and labels
    dataset = dataset.map(lambda window : (window[:-1, :-1], window[-1, -1]))

    # Batch it
    return dataset.batch(batch_size)

In [None]:
def create_uncompiled_model():
    """
    Build the (uncompiled) forecasting network.

    Architecture: LSTM(64, tanh, last output only) -> Dense(32, gelu)
    -> Dense(1, linear).

    Returns:
        tf.keras.Model: the Sequential model, not yet compiled
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential()
    model.add(layers.LSTM(64, activation='tanh', return_sequences=False))
    model.add(layers.Dense(32, activation='gelu'))
    # Earlier experiments (disabled): Dropout(0.1), a second LSTM(50, tanh), Dropout(0.1)
    model.add(layers.Dense(1, activation='linear'))

    return model

In [None]:
def create_big_dataset(df_prices) :
  """
  Scale every security separately and chain all of them into one train and one
  test tf.data pipeline.

  Side effects (module globals, read again at prediction time):
    - "global_scaler": MinMaxScaler fitted on the market-wide columns
    - "df_security_<code>_scaled": scaled frame of each security
    - "min_target_<code>" / "max_target_<code>": range of the last scaled
      column of each security, needed to undo the scaling of predictions

  Args:
      df_prices (pd.DataFrame): feature table containing every security
  Returns:
      (tf.data.Dataset, tf.data.Dataset): train and test datasets; both are
      None when the module-level `securities` is empty
  """
  # The market-wide columns share one scaler fitted on the whole table.
  global_variables = ['put/call', 'VolumeSector']
  scaler_global_variables = MinMaxScaler()
  scaler_global_variables.fit(df_prices[global_variables])
  globals()["global_scaler"] = scaler_global_variables

  full_dataset_train = None
  full_dataset_test = None
  for security in tqdm(securities) :
    # reset_index returns a fresh frame, so nothing is written into a slice of df_prices
    df_security = df_prices[df_prices.SecuritiesCode==security].reset_index(drop=True)

    # Min-max scaling is used because the features are not normally
    # distributed and (per the original author) contain no outliers.
    # NOTE(review): the scaler is fitted on the security's full history,
    # test period included - confirm this leakage is acceptable.
    scaler = MinMaxScaler()
    df_security_scaled = pd.DataFrame(
        data=scaler.fit_transform(df_security[features_to_scale]),
        columns=features_to_scale)

    # Both frames are indexed 0..n-1, so the columns line up row by row.
    df_security_scaled['Date'] = df_security['Date']
    df_security_scaled['SecuritiesCode'] = df_security['SecuritiesCode']

    df_global_variables = pd.DataFrame(
        data=scaler_global_variables.transform(df_security[global_variables]),
        columns=global_variables)
    df_security_scaled = pd.concat([df_global_variables, df_security_scaled], axis=1)
    globals()["df_security_{}_scaled".format(security)] = df_security_scaled

    # Keep the min and max of the last scaled column so that predictions can be
    # inverse-scaled later (assumes "Target" is the last entry of features_to_scale).
    globals()["min_target_{}".format(security)] = scaler.data_min_[-1]
    globals()["max_target_{}".format(security)] = scaler.data_max_[-1]

    # Chronological split
    df_train = df_security_scaled[df_security_scaled['Date'] <= G.TRAIN_END][features_nodate_nocode]
    df_test = df_security_scaled[df_security_scaled['Date'] >= G.TEST_START][features_nodate_nocode]

    tf_dataset_train = windowed_dataset(df_train)
    tf_dataset_test = windowed_dataset(df_test)

    # Start the chain with whichever security comes first (previously
    # hard-coded to code 1301, which crashed on None for any other first code),
    # then append the others.
    if full_dataset_train is None :
      full_dataset_train = tf_dataset_train
      full_dataset_test = tf_dataset_test
    else :
      full_dataset_train = tf.data.Dataset.concatenate(full_dataset_train, tf_dataset_train)
      full_dataset_test = tf.data.Dataset.concatenate(full_dataset_test, tf_dataset_test)

  return full_dataset_train, full_dataset_test


In [None]:
# Build the chained train / test pipelines (also stores the per-security
# scaled frames and target ranges as module globals).
full_dataset_train, full_dataset_test = create_big_dataset(df_prices)

100%|██████████| 2000/2000 [02:58<00:00, 11.20it/s]


In [None]:
# Checkpoint callback: saves the full model (not just the weights) to
# `filepath` at the end of every epoch, whether or not it improved.
filepath = "/content/drive/My Drive/Colab Notebooks/saved_models/checkpoint_4"
my_callbacks = tf.keras.callbacks.ModelCheckpoint(filepath,  
                                     verbose=0, 
                                     save_best_only=False,
                                     save_weights_only=False, 
                                     mode='auto', 
                                     save_freq='epoch')

In [None]:
# Resume from a previously saved model instead of building a new one.
model = tf.keras.models.load_model("/content/drive/My Drive/Colab Notebooks/saved_models/model_full_dataset_1epoch_targetlocal")

In [None]:
# To train from scratch instead of resuming, uncomment:
# model = create_uncompiled_model()
# model.compile(loss=tf.keras.losses.Huber(),
#               optimizer=optimizer, 
#               metrics=["mae"],
#               )
# Continue training for 3 epochs. The checkpoint callback is passed explicitly:
# it was defined above but never handed to fit(), so no checkpoint was written
# to `filepath` even though a later cell loads the model from that path.
history = model.fit(full_dataset_train, epochs=3, validation_data=full_dataset_test, callbacks=[my_callbacks])
model.save("/content/drive/My Drive/Colab Notebooks/saved_models/model_full_dataset_4epoch_targetlocal")

Epoch 1/3
Epoch 2/3
Epoch 3/3




INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/saved_models/model_full_dataset_4epoch_targetlocal/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/saved_models/model_full_dataset_4epoch_targetlocal/assets


In [None]:
# Reload the model from the checkpoint path.
model = tf.keras.models.load_model(filepath)

In [None]:
# NOTE(review): when the cells run top to bottom, this replaces both the freshly
# trained model and the checkpoint loaded just above, so the evaluation below
# uses the saved "1epoch" model - confirm that is the intended one.
model = tf.keras.models.load_model("/content/drive/My Drive/Colab Notebooks/saved_models/model_full_dataset_1epoch_targetlocal")

In [None]:
def rescale_min_max(value, min_, max_) :
    """
    Undo a min-max scaling: map `value` from the [0, 1] scale back to [min_, max_].

    Args:
        value: scaled value(s)
        min_: minimum of the original range
        max_: maximum of the original range
    Returns:
        value expressed on the original scale
    """
    original_span = max_ - min_
    return value * original_span + min_

In [None]:
def rescale_std(value, mean, std) :
  """
  Undo a standardisation: map a z-score back to the original scale.

  Args:
      value: standardised value(s)
      mean: mean of the original data
      std: standard deviation of the original data
  Returns:
      value expressed on the original scale
  """
  spread = value * std
  return spread + mean

In [None]:
# Define the function to return the SMAPE value
def calculate_smape(actual, predicted) -> float:
    """
    Symmetric mean absolute percentage error, in percent, rounded to 2 decimals.

    Each term is |predicted - actual| divided by the mean of |predicted| and
    |actual|; the terms are averaged and multiplied by 100.

    Args:
        actual: array-like of observed values
        predicted: array-like of forecast values
    Returns:
        float: SMAPE in percent
    """
    # Work on numpy arrays whatever the input containers are
    actual_values = np.asarray(actual)
    predicted_values = np.asarray(predicted)

    absolute_error = np.abs(predicted_values - actual_values)
    mean_magnitude = (np.abs(predicted_values) + np.abs(actual_values))/2

    return round(np.mean(absolute_error / mean_magnitude)*100, 2)

In [None]:
def compute_metrics(true_series, forecast):
    """
    Compute and print the regression metrics of a forecast.

    Args:
        true_series: observed values
        forecast: predicted values
    Returns:
        tuple: (mse, mae, mape, r, smape), where `r` is the full result of
        scipy's pearsonr (its first element is the correlation coefficient)
    """
    # Correlation and symmetric percentage error
    r = pearsonr(true_series, forecast)
    smape = calculate_smape(true_series, forecast)

    # scikit-learn error metrics
    mape = mean_absolute_percentage_error(true_series, forecast)
    mae = mean_absolute_error(true_series, forecast)
    mse = mean_squared_error(true_series, forecast)

    print(f"mse: {mse:.5f}, mae: {mae:.5f}, mape: {mape:.5f},  smape: {smape:.5f}, pearsoncorr: {r[0]:.3f} for forecast \n")
    return mse, mae, mape, r, smape

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return of one day: weighted return of the `portfolio_size`
        best-ranked stocks minus that of the `portfolio_size` worst-ranked ones.

        Args:
            df (pd.DataFrame): predicted results of a single day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        ranks = df['Rank']
        # Ranks must run from 0 to n-1
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # Weights are applied positionally, heaviest weight first
            leg_targets = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (leg_targets * weights).sum() / weights.mean()

        return _weighted_leg(True) - _weighted_leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
def calc_spread_average_return(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Average daily long/short spread return (no division by the standard deviation).

    Args:
        df (pd.DataFrame): predicted results with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return of one day: weighted return of the `portfolio_size`
        best-ranked stocks minus that of the `portfolio_size` worst-ranked ones.

        Args:
            df (pd.DataFrame): predicted results of a single day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        # Ranks must run from 0 to n-1
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1

        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        weight_scale = weights.mean()

        best_first = df.sort_values(by='Rank')['Target'][:portfolio_size]
        worst_first = df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]

        purchase = (best_first * weights).sum() / weight_scale
        short = (worst_first * weights).sum() / weight_scale
        return purchase - short

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    average_spread = daily_spread.mean()
    return average_spread

In [None]:
def set_rank_date(df):
    """
    Rank the rows of one day by predicted target, best prediction first.

    Args:
        df (pd.DataFrame): rows of a single Date, with 'TargetPredicted' and 'Target' columns
    Returns:
        df (pd.DataFrame): a sorted copy with a 'Rank' column (0 = highest prediction)
    """
    ordered = df.sort_values("TargetPredicted", ascending=False)
    # Ranks start at 0 and follow the sorted order
    return ordered.assign(Rank=np.arange(len(ordered["Target"])))

def set_rank_prediction(df) :
    """
    Predict 'Target' for every row with `pred_model`, then rank the rows per Date.

    NOTE(review): `pred_model` is not defined anywhere in the visible notebook,
    and set_rank_date sorts by a 'TargetPredicted' column that this function
    does not create (the predictions are written to 'Target'). This looks like
    a leftover from an earlier model - confirm before using it.

    Args:
        df (pd.DataFrame): feature rows including a 'Date' column
    Returns:
        pd.DataFrame: rows sorted per Date with a 'Rank' column
    """
        
    # Every column except 'Date' is passed to the model
    df['Target'] = pred_model.predict(df.drop('Date', axis=1))
    df = df.sort_values(["Date", "Target"], ascending=[True, False])
    df = df.groupby("Date").apply(set_rank_date)
    df = df.reset_index(drop=True)
    return df

In [None]:
def make_predictions_security(security) :
  """
  Forecast the target of one security for every date of the test period.

  Reads the module globals written by create_big_dataset
  ("df_security_<code>_scaled", "min_target_<code>", "max_target_<code>") and
  the module-level `model`.

  Args:
      security: security code
  Returns:
      pd.DataFrame with columns 'Date', 'Target' and 'TargetPredicted' (both
      targets back on their original scale), one row per test date; or the
      int 0 when the security has no test date or too little history before
      the test period to fill one window.
  """
  window_size = G.WINDOW_SIZE
  df = globals()["df_security_{}_scaled".format(security)].reset_index(drop=True)

  in_test = df['Date'] >= G.TEST_START
  if not in_test.any() :  # no date to predict in the test period
    return 0
  # Start `window_size` rows before the first test date so that the first
  # window predicts exactly that date.
  first_idx_test = in_test.idxmax() - window_size
  if first_idx_test < 0 :  # not enough history to build the first window
    return 0

  test_block = df.iloc[first_idx_test:]
  feature_matrix = test_block[features_nodate_nocode].to_numpy(dtype="float32")

  # Build the windows in chronological order. They must NOT go through the
  # shuffling training pipeline: shuffled windows would pair each forecast
  # with the wrong Date / Target row. The last column is the label and is
  # left out of the inputs, as in the training windows.
  n_windows = len(feature_matrix) - window_size
  windows = np.stack([feature_matrix[start:start + window_size, :-1] for start in range(n_windows)])
  forecast = model.predict(windows, batch_size=G.BATCH_SIZE)

  # Row i of `predictions` is the row that window i forecasts.
  predictions = test_block.iloc[window_size:][['Date', 'Target']].copy()
  predictions['TargetPredicted'] = np.asarray(forecast, dtype="float64").reshape(-1)

  # Bring both the true and the predicted target back to the original scale
  target_min = globals()["min_target_{}".format(security)]
  target_max = globals()["max_target_{}".format(security)]
  for column in ('Target', 'TargetPredicted') :
    predictions[column] = rescale_min_max(predictions[column], target_min, target_max)

  return predictions

In [None]:
# def make_predictions_security(security) :
#   string_security_scaled = "df_security_{}_scaled".format(security)
#   df = globals()[string_security_scaled]
#   df.reset_index(drop=True, inplace=True)
#   first_idx_test = (df['Date'] >= G.TEST_START).idxmax()
#   if first_idx_test==0 :  # Si il n' a pas de date à predire pour la période de test
#     return 0
#   last_idx_test = df.shape[0] # en fait il faut faire -1 pour avoir le dernier indice du dataframe

#   days = []
#   forecasts = []
#   targets = []
#   for day in range(first_idx_test, last_idx_test) :
#     df_window = df.iloc[day-30:day].drop(columns=['Date', 'Target'])
#     tf_window = tf.convert_to_tensor(df_window)
#     forecast = model.predict(tf.expand_dims(tf_window, axis=0)).astype(float)[0,0]
#     forecasts.append(forecast)
#     days.append(df.iloc[day].Date)
#     targets.append(df.iloc[day].Target)

#   # Rescale target
#   string_min_target = "min_target_{}".format(security)
#   string_max_target = "max_target_{}".format(security)
#   min = globals()[string_min_target]
#   max = globals()[string_max_target]
#   rescale = functools.partial(rescale_min_max, min_=min, max_=max)
#   forecasts = list(map(rescale, forecasts))
#   targets = list(map(rescale, targets))

#   # We output in a dataframe
#   predictions = {'Date':days,'TargetPredicted':forecasts, 'Target' : targets}
#   predictions = pd.DataFrame(predictions)

#   return predictions


In [None]:
def add_ranking(df) :
  """
  Forecast every security over the test period and rank the forecasts per date.

  Args:
      df (pd.DataFrame): table whose 'SecuritiesCode' column lists the securities to forecast
  Returns:
      pd.DataFrame: all per-security predictions stacked, with a 'Rank' column
      (0 = highest predicted target of the day)
  """
  security_codes = sorted(df.SecuritiesCode.unique())
  all_outputs = (make_predictions_security(code) for code in tqdm(security_codes))
  # make_predictions_security returns an int when a security cannot be forecast
  usable_outputs = [output for output in all_outputs if type(output) != int]

  ranked = pd.concat(usable_outputs)
  ranked = ranked.sort_values(["Date", "TargetPredicted"], ascending=[True, False])
  ranked = ranked.groupby("Date").apply(set_rank_date)
  return ranked.reset_index(drop=True)

In [None]:
# Forecast every security over the test period and rank the forecasts per date.
predictions = add_ranking(df_prices)

100%|██████████| 2000/2000 [06:08<00:00,  5.42it/s]


In [None]:
# Display the ranked predictions.
predictions

In [None]:
# Sharpe ratio of the daily spread return (default portfolio: 200 long / 200 short).
calc_spread_return_sharpe(predictions)

0.5526647957949242

In [None]:
# Mean daily spread return with the same default portfolio.
calc_spread_average_return(predictions)

0.4187879334893368

In [None]:
# Persist the ranked predictions. The frame built above is named `predictions`;
# `predicted_rank` is not defined anywhere in this notebook, so the original
# call raised a NameError on a fresh run.
predictions.to_pickle("/content/drive/My Drive/Colab Notebooks/predicted_rank.pkl")  