Source: https://www.kaggle.com/code/lancezero/simple-leakage-submission-test/notebook?scriptVersionId=95129549

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the sample submission to inspect the layout the competition expects
# (one row per Date / SecuritiesCode pair with a Rank column).
submission = pd.read_csv('data/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv')

In [6]:
submission.head(2)

Unnamed: 0,Date,SecuritiesCode,Rank
0,2021-12-06,1301,0
1,2021-12-06,1332,1


### Experiments on the Sharpe ratio

In [7]:
# Simulate two series of daily spread returns to illustrate how the Sharpe
# ratio trades off mean return against volatility:
#   experiment 1: mean 12, std 1   (lower return, lower volatility)
#   experiment 2: mean 15, std 2   (higher return, higher volatility)
# A seeded generator is used so that re-running the notebook reproduces the
# same numbers; the original unseeded draw changed on every run.
N_DAYS = 56  # one simulated return per day
rng = np.random.default_rng(42)
spread_return_1 = 12 + rng.normal(loc=0, scale=1, size=N_DAYS)
spread_return_2 = 15 + rng.normal(loc=0, scale=2, size=N_DAYS)

In [8]:
spread_return_1.mean(), spread_return_2.mean()

(11.863156113194576, 14.860402503660664)

In [9]:
spread_return_1.std(), spread_return_2.std()

(1.1121528011893278, 1.8632714716831453)

In [10]:
# Sharpe-style ratio of each experiment: mean return divided by its standard
# deviation. ndarray.std() is used with its defaults here.
sharp_ratio_1, sharp_ratio_2 = (
    returns.mean() / returns.std()
    for returns in (spread_return_1, spread_return_2)
)

In [11]:
sharp_ratio_1,sharp_ratio_2  

(10.666840114513226, 7.975436070105687)

**We observe that even though the mean spread return of experiment 1 is lower, its standard deviation is also lower, so its Sharpe ratio is better than that of experiment 2.**

In [12]:

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Score ranked predictions by the Sharpe ratio of their daily spread returns.

    Args:
        df (pd.DataFrame): predicted results; the 'Date', 'Rank' and 'Target' columns are read
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the per-day spread returns divided by their standard deviation
    """

    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return for a single day's rows.

        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): weighted return of the best-ranked stocks minus that of the worst-ranked stocks
        """
        ranks = df['Rank']
        # Ranks are expected to start at 0 and end at (number of rows - 1).
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Weights fall linearly from toprank_weight_ratio (first position) to 1 (last position).
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # Take the first portfolio_size targets in the requested rank order
            # and weight them positionally.
            leg_targets = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (leg_targets * weights).sum() / weights.mean()

        # Long leg: lowest rank numbers; short leg: highest rank numbers.
        return _weighted_leg(True) - _weighted_leg(False)

    daily_spreads = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()


### supplemental_files

In [14]:
# Load the supplemental price file, which already contains the realised
# Target for each (Date, SecuritiesCode) row. Date is parsed as datetime.
df = pd.read_csv('data/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv', parse_dates=["Date"])
df.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False,-0.003263
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False,-0.008993


In [15]:
df['Date'].nunique()

56

### Rank stocks by target, ignoring volatility: the higher the return, the better the rank

In [16]:
# Within each day, give the highest Target rank 0, the next one rank 1, and so on.
# method="first" breaks ties by order of appearance so every rank is distinct.
daily_targets = df.groupby("Date")["Target"]
df['Rank'] = daily_targets.rank(method="first", ascending=False) - 1

In [17]:
# rank() yields floats; cast to integers to match the integer Rank values
# shown in the sample submission.
df['Rank'] = df['Rank'].astype('int')
df.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Rank
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False,-0.003263,1394
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False,-0.008993,1670
2,20211206_1333,2021-12-06,1333,2368.0,2388.0,2360.0,2377.0,125900,1.0,,False,-0.009963,1712
3,20211206_1375,2021-12-06,1375,1230.0,1239.0,1224.0,1224.0,81100,1.0,,False,-0.015032,1844
4,20211206_1376,2021-12-06,1376,1339.0,1372.0,1339.0,1351.0,6200,1.0,,False,0.002867,978


**This gives the `Rank` column, which holds the rank of each stock's target among the 2000 stocks for that day.**

**However, the submission expects the table to be sorted by date and, within each date, by rank in ascending order (0 to 1999).**

In [18]:
# Order the rows by day and, within each day, from best rank (0) to worst.
df_submission = df.sort_values(by=["Date", "Rank"])
df_submission.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Rank
739,20211206_4699,2021-12-06,4699,2660.0,2660.0,2660.0,2660.0,2900,1.0,,True,0.300633,0
1278,20211206_7036,2021-12-06,7036,3800.0,3860.0,3730.0,3765.0,1500,1.0,,False,0.178344,1


Next, look at the sample submission file: its `Rank` column has to be filled with our predictions. For each date, we look up every `SecuritiesCode` of the submission in `df_submission` and take the rank stored there for that date.

In [19]:
submission

Unnamed: 0,Date,SecuritiesCode,Rank
0,2021-12-06,1301,0
1,2021-12-06,1332,1
2,2021-12-06,1333,2
3,2021-12-06,1375,3
4,2021-12-06,1376,4
...,...,...,...
111995,2022-02-28,9990,1995
111996,2022-02-28,9991,1996
111997,2022-02-28,9993,1997
111998,2022-02-28,9994,1998


### Compute the score locally first (strictly speaking this is not required, since the competition backend computes it after the file is submitted)

In [20]:
calc_spread_return_sharpe(df_submission)

5.434704919895069

### Submit to the system

In [21]:
# The competition API module is only available inside the Kaggle environment,
# so it is imported here rather than with the general-purpose imports.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for prices, _, _, _, _, sample_prediction in iter_test:
    # Select the precomputed ranks for the day currently served by the API.
    day_df = df_submission[df_submission['Date'] == prices["Date"].iloc[0]]
    leaked_rank_by_code = day_df.set_index("SecuritiesCode")["Rank"]
    looked_up = sample_prediction["SecuritiesCode"].map(leaked_rank_by_code)
    # The lookup yields NaN for any security (or whole day) that is absent from
    # df_submission, which would leave a float column with gaps. Re-ranking keeps
    # the looked-up order, pushes unknown securities to the bottom in their
    # original order, and always produces an integer permutation 0..n-1.
    # When every security is found, this is the identity on the looked-up ranks.
    sample_prediction["Rank"] = looked_up.rank(method="first", na_option="bottom").astype(int) - 1
    env.predict(sample_prediction)

ModuleNotFoundError: No module named 'jpx_tokyo_market_prediction'