In [6]:
import jpx_tokyo_market_prediction
from sklearn.tree import DecisionTreeRegressor
from tqdm.notebook import tqdm
import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
from matplotlib import cm
from IPython.core.display import display, HTML
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool, Manager, cpu_count
from IPython.display import display_html
sns.set_context("notebook")

import warnings
warnings.filterwarnings("ignore")

In [7]:
def prep_prices(price):
    
    from decimal import ROUND_HALF_UP, Decimal
    
    pcols = ["Open","High","Low","Close"]

    price.ExpectedDividend.fillna(0,inplace=True)
    
    def qround(x):
        return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
    
    def adjust_prices(df):
        df = df.sort_values("Date", ascending=False)
        df.loc[:, "CumAdjust"] = df["AdjustmentFactor"].cumprod()

        # generate adjusted prices
        for p in pcols:     
            df.loc[:, p] = (df["CumAdjust"] * df[p]).apply(qround)
        df.loc[:, "Volume"] = df["Volume"] / df["CumAdjust"]
        df.ffill(inplace=True)
        df.bfill(inplace=True)
        
        # generate and fill Targets
        #df.loc[:, "Target"] = df.Close.pct_change().shift(-2).fillna(df.Target).fillna(0)
        df.Target.fillna(0,inplace=True)

        return df

    # generate Adjusted
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(adjust_prices).reset_index(drop=True)
    price = price.sort_values("RowId")
    return price

In [8]:
options = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/options.csv")
options.head(3)

Unnamed: 0,DateCode,Date,OptionsCode,WholeDayOpen,WholeDayHigh,WholeDayLow,WholeDayClose,NightSessionOpen,NightSessionHigh,NightSessionLow,...,Putcall,LastTradingDay,SpecialQuotationDay,SettlementPrice,TheoreticalPrice,BaseVolatility,ImpliedVolatility,InterestRate,DividendRate,Dividend
0,20170104_132010018,2017-01-04,132010018,650.0,650.0,480.0,480.0,0.0,0.0,0.0,...,1,20170112,20170113,480.0,478.4587,17.4736,17.5865,0.0091,0.0,0.0
1,20170104_132010118,2017-01-04,132010118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,20170112,20170113,575.0,571.1385,17.4736,16.5,0.0091,0.0,0.0
2,20170104_132010218,2017-01-04,132010218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,20170112,20170113,680.0,677.371,17.4736,15.8644,0.0091,0.0,0.0


In [9]:
stock_fin_spec = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv")
stock_fin_spec

Unnamed: 0,Column,Sample value,Type,Addendum,Remarks
0,DisclosureNumber,20161025419878,Int64,,Unique ID for disclosure documents.
1,DateCode,20170106_7888,string,,combination of TradeDate and LocalCode (this i...
2,Date,2017-01-06 0:00:00,date,,Trade date. This column is used to align with ...
3,SecuritiesCode,7888,Int64,,Local Securities Code
4,DisclosedDate,2017-01-06 0:00:00,date,,Date on which the document disclosed.
5,DisclosedTime,15:30:00,time,,Time on which the document disclosed.
6,DisclosedUnixTime,1483684200,Int64,,Unix time of the datetime on which the documen...
7,TypeOfDocument,2QFinancialStatements_Consolidated_JP,string,,Document type indicated by ID.
8,CurrentPeriodEndDate,2016-11-30,date,,End date of the current accounting period.
9,TypeOfCurrentPeriod,2Q,date,"[Normal] 1Q (1st Quarter), 2Q (2nd Quarter), 3...",Type of the current accounting period.\n\n[Not...


In [10]:
stock_list = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
stock_list.head(3)

Unnamed: 0,SecuritiesCode,EffectiveDate,Name,Section/Products,NewMarketSegment,33SectorCode,33SectorName,17SectorCode,17SectorName,NewIndexSeriesSizeCode,NewIndexSeriesSize,TradeDate,Close,IssuedShares,MarketCapitalization,Universe0
0,1301,20211230,"KYOKUYO CO.,LTD.",First Section (Domestic),Prime Market,50,"Fishery, Agriculture and Forestry",1,FOODS,7,TOPIX Small 2,20211230.0,3080.0,10928280.0,33659110000.0,True
1,1305,20211230,Daiwa ETF-TOPIX,ETFs/ ETNs,,-,-,-,-,-,-,20211230.0,2097.0,3634636000.0,7621831000000.0,False
2,1306,20211230,NEXT FUNDS TOPIX Exchange Traded Fund,ETFs/ ETNs,,-,-,-,-,-,-,20211230.0,2073.5,7917718000.0,16417390000000.0,False


In [11]:
options_spec = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/data_specifications/options_spec.csv")
options_spec

Unnamed: 0,Column,Sample value,Type,Addendum,Remarks
0,DateCode,20170104_144122718,string,,Unique ID for option price records
1,Date,2017-01-04 0:00:00,date,,Trade date and time
2,OptionsCode,144122718,string,,Local Securities Code (link to https://www.jpx...
3,WholeDayOpen,0,float,,Opening Price for Whole Trading Day
4,WholeDayHigh,0,float,,High Price for Whole Trading Day
5,WholeDayLow,0,float,,Low Price for Whole Trading Day
6,WholeDayClose,0,float,,Closing Price for Whole Trading Day
7,NightSessionOpen,0,float,,Opening Price for Night Session
8,NightSessionHigh,0,float,,High Price for Night Session
9,NightSessionLow,0,float,,Low Price for Night Session


In [12]:
stock_list_spec = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_list_spec.csv")
stock_list_spec

Unnamed: 0,Column,Sample value,Type,Addendum,Remarks
0,SecuritiesCode,1301,Int64,,Local Securities Code
1,EffectiveDate,20211230,date,,the effective date
2,Name,"KYOKUYO CO.,LTD.",string,,Name of security
3,Section/Products,First Section (Domestic),string,,Section/Product
4,NewMarketSegment,Prime Market,string,,New market segment effective from 2022-04-04 (...
5,33SectorCode,50,Int64,,33 Sector Name\n\nref. https://www.jpx.co.jp/e...
6,33SectorName,"Fishery, Agriculture and Forestry",string,,33 Sector Name\n\nref. https://www.jpx.co.jp/e...
7,17SectorCode,1,Int64,,17 Sector Code\nref. https://www.jpx.co.jp/eng...
8,17SectorName,FOODS,string,,17 Sector Name\nref. https://www.jpx.co.jp/eng...
9,NewIndexSeriesSizeCode,7,Int64,,TOPIX New Index Series code\n\nref. https://ww...


In [13]:
financials = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv")
financials.head(3)

Unnamed: 0,DisclosureNumber,DateCode,Date,SecuritiesCode,DisclosedDate,DisclosedTime,DisclosedUnixTime,TypeOfDocument,CurrentPeriodEndDate,TypeOfCurrentPeriod,...,ForecastEarningsPerShare,ApplyingOfSpecificAccountingOfTheQuarterlyFinancialStatements,MaterialChangesInSubsidiaries,ChangesBasedOnRevisionsOfAccountingStandard,ChangesOtherThanOnesBasedOnRevisionsOfAccountingStandard,ChangesInAccountingEstimates,RetrospectiveRestatement,NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock,NumberOfTreasuryStockAtTheEndOfFiscalYear,AverageNumberOfShares
0,20161210000000.0,20170104_2753,2017-01-04,2753.0,2017-01-04,07:30:00,1483483000.0,3QFinancialStatements_Consolidated_JP,2016-12-31,3Q,...,319.76,,False,True,False,False,False,6848800.0,－,6848800.0
1,20170100000000.0,20170104_3353,2017-01-04,3353.0,2017-01-04,15:00:00,1483510000.0,3QFinancialStatements_Consolidated_JP,2016-11-30,3Q,...,485.36,,False,True,False,False,False,2035000.0,118917,1916083.0
2,20161230000000.0,20170104_4575,2017-01-04,4575.0,2017-01-04,12:00:00,1483499000.0,ForecastRevision,2016-12-31,2Q,...,-93.11,,,,,,,,,


In [15]:
%%time
# Loading Stock Prices
path = "../input/jpx-tokyo-stock-exchange-prediction/"

df_train = pd.read_csv(f"{path}train_files/stock_prices.csv", parse_dates=["Date"])
df_train = df_train[df_train.Date>"2020-10-02"] #Targets not Nulls and 2000 secutities data
df_train = prep_prices(df_train)

df_test = pd.read_csv(f"{path}supplemental_files/stock_prices.csv", parse_dates=["Date"])
df_test = prep_prices(df_test)

CPU times: user 47 s, sys: 842 ms, total: 47.8 s
Wall time: 47.8 s


In [16]:
def fill_nans(prices):
    prices.set_index(["SecuritiesCode", "Date"], inplace=True)
    prices.ExpectedDividend.fillna(0,inplace=True)
    prices.ffill(inplace=True)
    prices.fillna(0,inplace=True)
    prices.reset_index(inplace=True)
    return prices
# ffill은 가장 많이 쓰는 방법으로 전기의 값을 쭉 채워주는 것.(이성적으로도 맞는 방법) -> 기본적으로 ffill 적용 후에 
# parameter 조정할 때 혹은 feature을 추가 시킬 때, 보간법(선형, 비선형 등)을 적용한다. 

In [17]:
%%time


df_train= fill_nans(df_train)

CPU times: user 204 ms, sys: 28 µs, total: 204 ms
Wall time: 203 ms


In [18]:
# create new dataframe with just closing price for each stock
df = df_train.pivot(index='Date', columns='SecuritiesCode', values='Close')
df

SecuritiesCode,1301,1332,1333,1375,1376,1377,1379,1381,1407,1413,...,9982,9983,9984,9987,9989,9990,9991,9993,9994,9997
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-05,2819.0,453.0,2412.0,2062.0,1615.0,3775.0,2238.0,3330.0,2133.8,1978.0,...,1799.0,66790.0,6750.0,3980.0,4010.0,571.0,951.0,1794.0,2169.0,934.0
2020-10-06,2824.0,450.0,2398.0,2044.0,1588.0,3785.0,2236.0,3355.0,2203.1,1988.0,...,1829.0,66700.0,6913.0,4060.0,3985.0,573.0,952.0,1788.0,2185.0,943.0
2020-10-07,2780.0,446.0,2345.0,2035.0,1541.0,3730.0,2243.0,3300.0,2237.7,1990.0,...,1868.0,66490.0,6955.0,3980.0,3940.0,569.0,932.0,1808.0,2185.0,960.0
2020-10-08,2786.0,443.0,2345.0,2035.0,1502.0,3800.0,2254.0,3250.0,2330.8,1976.0,...,1862.0,67290.0,7085.0,4025.0,4045.0,568.0,944.0,1824.0,2195.0,953.0
2020-10-09,2799.0,436.0,2336.0,2099.0,1510.0,3745.0,2244.0,3300.0,2469.2,1989.0,...,1711.0,69220.0,6997.0,4020.0,4080.0,569.0,934.0,1840.0,2169.0,958.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-29,2951.0,567.0,2269.0,1248.0,1381.0,3125.0,1889.0,3160.0,6430.0,2126.0,...,1725.0,70500.0,6208.0,3070.0,3055.0,528.0,773.0,1680.0,2358.0,668.0
2021-11-30,2900.0,573.0,2277.0,1217.0,1348.0,3125.0,1842.0,3135.0,6550.0,2118.0,...,1690.0,67400.0,6030.0,3085.0,2989.0,520.0,768.0,1680.0,2328.0,667.0
2021-12-01,2911.0,574.0,2299.0,1231.0,1353.0,3135.0,1889.0,3185.0,6290.0,2123.0,...,1722.0,67080.0,5900.0,3090.0,2937.0,522.0,770.0,1671.0,2340.0,685.0
2021-12-02,2933.0,573.0,2303.0,1212.0,1327.0,3110.0,1867.0,3150.0,6070.0,2112.0,...,1680.0,67140.0,5599.0,3105.0,2947.0,507.0,778.0,1650.0,2362.0,684.0


In [20]:
from fbprophet import Prophet
from fbprophet.make_holidays import make_holidays_df

# **creat prophet prediction value(target)**

In [82]:
Code = df.columns[0]  
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SecuritiesCode', 'Target']
STOCK = df_train[df_train.SecuritiesCode==Code][cols].set_index("Date")
TEST = df_test[df_test.SecuritiesCode==Code].set_index("Date")

year_list = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
holidays = make_holidays_df(year_list=year_list, country='JP')
fut = TEST[:]
d = 365 # days to plot
w = 3 # windows size

LDAYS = STOCK.tail(d)
fut = pd.concat([STOCK,TEST])
xd = (LDAYS.index).append(TEST.index)

fut['Cl_lr'] = fut.Close.rolling(window=w,closed="left").apply(lambda y: 
               np.poly1d(np.polyfit(np.array(range(w)),y,1))(w),raw=True)

fut['Cl2'] = fut.Close.rolling(window=2,closed="left").apply(lambda y: 
               np.poly1d(np.polyfit([0,1],y,1))(2),raw=True)

rho = np.corrcoef(fut.Close[-len(xd):],fut.Cl_lr[-len(xd):])
ph_df = fut[["Close","Cl_lr",'Cl2']][w:len(STOCK)].reset_index()
ph_df.rename(columns={'Close': 'y', 'Date': 'ds'}, inplace=True)

m = Prophet(holidays=holidays,
        daily_seasonality=False,
        changepoint_prior_scale=0.015)
m.add_seasonality(name='monthly', period=25.5, fourier_order=5)
m.add_regressor('Cl_lr')
m.add_regressor('Cl2')
m.fit(ph_df)
future_prices = m.make_future_dataframe(periods=175, freq='d')
future_prices = future_prices[future_prices.ds.dt.dayofweek < 5]
future_prices = future_prices.set_index("ds").join(fut[['Cl_lr','Cl2']][w:], how='left')

future_prices = future_prices.reset_index()   
forecast = m.predict(future_prices)
result = TEST[['Close']].join(forecast.set_index("ds"),how='left')
result=result[['yhat']]
result=result.rename(columns = {"yhat": str(df.columns[0])}, inplace=False)
prophet_result=pd.DataFrame(result)
prophet_result=prophet_result.reset_index()
prophet_result

Initial log joint probability = -2.26114
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       1134.72   0.000341394       299.122           1           1      127   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       1135.06   2.00406e-05       240.792           1           1      252   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     264       1135.26   1.01197e-09       220.213    0.002294      0.2388      334   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Unnamed: 0,Date,1301
0,2021-12-06,3039.315357
1,2021-12-07,2999.969707
2,2021-12-08,3090.926833
3,2021-12-09,3094.448575
4,2021-12-10,3098.764267
...,...,...
110,2022-05-23,3288.093367
111,2022-05-24,3315.788791
112,2022-05-25,3281.496079
113,2022-05-26,3275.915056


In [85]:
for i in range(df.columns[1],2000):
    # Select Security Code
        Code = df.columns[i]  
        cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SecuritiesCode', 'Target']
        STOCK = df_train[df_train.SecuritiesCode==Code][cols].set_index("Date")
        TEST = df_test[df_test.SecuritiesCode==Code].set_index("Date")

        year_list = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
        holidays = make_holidays_df(year_list=year_list, country='JP')
        fut = TEST[:]
        d = 365 # days to plot
        w = 3 # windows size

        LDAYS = STOCK.tail(d)
        fut = pd.concat([STOCK,TEST])
        xd = (LDAYS.index).append(TEST.index)

        fut['Cl_lr'] = fut.Close.rolling(window=w,closed="left").apply(lambda y: 
                           np.poly1d(np.polyfit(np.array(range(w)),y,1))(w),raw=True)

        fut['Cl2'] = fut.Close.rolling(window=2,closed="left").apply(lambda y: 
                           np.poly1d(np.polyfit([0,1],y,1))(2),raw=True)

        rho = np.corrcoef(fut.Close[-len(xd):],fut.Cl_lr[-len(xd):])
        ph_df = fut[["Close","Cl_lr",'Cl2']][w:len(STOCK)].reset_index()
        ph_df.rename(columns={'Close': 'y', 'Date': 'ds'}, inplace=True)

        m = Prophet(holidays=holidays,
                    daily_seasonality=False,
                    changepoint_prior_scale=0.015)
        m.add_seasonality(name='monthly', period=25.5, fourier_order=5)
        m.add_regressor('Cl_lr')
        m.add_regressor('Cl2')
        m.fit(ph_df)
        # Predict Prices
        future_prices = m.make_future_dataframe(periods=175, freq='d')
        future_prices = future_prices[future_prices.ds.dt.dayofweek < 5]
        future_prices = future_prices.set_index("ds").join(fut[['Cl_lr','Cl2']][w:], how='left')
        
        future_prices = future_prices.reset_index()   
        forecast = m.predict(future_prices)
        result = TEST[['Close']].join(forecast.set_index("ds"),how='left')
        result=result[['yhat']]
        result=result.rename(columns = {"yhat": df.columns[i]}, inplace=True)
        result=result.reset_index()
        prophet_result=pd.merge(prophet_result,result)
        print(f'----------------{df.columns[i]}is done------------------')

Initial log joint probability = -2.29617
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       993.267    0.00082545        320.53      0.3791      0.8459      121   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     176       994.748   0.000197923        290.66   6.732e-07       0.001      246  LS failed, Hessian reset 
     199       994.795   2.80017e-06       251.298      0.8731      0.8731      275   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     247       994.798     1.389e-08       210.769      0.4133           1      335   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance
--------------
Initial log joint probability = -3.36734
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      78       981.257   0.000374309        314.61   1.17

In [86]:
prophet_result

Unnamed: 0,Date,1301,7246pred,7250pred,7254pred,7259pred,7261pred,7267pred,7269pred,7270pred,...,9982pred,9983pred,9984pred,9987pred,9989pred,9990pred,9991pred,9993pred,9994pred,9997pred
0,2021-12-06,3039.315357,388.971374,1150.256153,483.756237,4255.126205,956.860926,3241.868636,4676.799071,2095.537872,...,1761.039850,67612.613238,5401.693048,3209.389293,3090.121002,534.528979,819.332362,1658.143663,2428.942012,693.257410
1,2021-12-07,2999.969707,388.726765,1133.642008,504.338266,4190.859796,931.753005,3216.766014,4573.594646,2069.955791,...,1800.393830,69091.843795,5036.841236,3171.588472,3175.690259,524.236303,802.690790,1648.866363,2448.384130,704.894301
2,2021-12-08,3090.926833,392.259961,1167.932325,527.934930,4410.821225,977.438904,3298.267796,4732.740331,2154.563967,...,1796.107566,68820.340665,5383.196695,3238.957288,3192.440906,540.372279,809.437379,1644.312397,2465.893620,721.143824
3,2021-12-09,3094.448575,395.163714,1171.441020,535.462482,4503.355634,983.456225,3273.346373,4803.113716,2181.679295,...,1780.014804,69380.046135,5767.983444,3254.022608,3198.266384,548.553747,816.768738,1669.404354,2446.819518,745.129151
4,2021-12-10,3098.764267,394.779020,1128.202669,516.370867,4342.298392,950.609226,3212.969027,4670.771149,2150.378033,...,1771.492686,68693.891260,5588.329291,3324.483538,3116.613476,549.436388,808.091652,1675.006425,2431.391192,728.293278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,2022-04-19,3151.567827,372.678769,941.338641,488.431952,3765.923338,915.954936,3220.829675,4141.514253,1914.368404,...,1316.335304,66146.146321,5448.737948,3597.143801,2835.788260,478.592282,775.528511,1559.189814,2441.829081,681.513630
91,2022-04-20,3145.792102,379.640403,947.502848,494.992112,3887.606363,970.865454,3275.485616,4232.363104,1968.368297,...,1300.686298,62157.925126,5233.552613,3588.623214,2759.679956,491.912527,775.154426,1564.208038,2451.806414,677.746362
92,2022-04-21,3216.566367,383.129738,978.490908,494.284008,4047.066414,1004.324417,3411.781458,4362.297330,2068.868486,...,1294.885367,64740.925458,5355.915880,3693.989821,2916.203812,490.320409,778.845735,1592.817458,2457.090922,696.905411
93,2022-04-22,3258.423068,387.201397,976.966934,502.914152,4023.024931,980.043085,3397.284700,4399.309423,2084.206003,...,1292.086174,67151.551265,5405.771262,3696.849768,2995.236381,499.975026,786.704129,1608.696826,2475.692373,696.160524


In [None]:
def absHighPass(df, absThresh):
    df.loc[:, ]
    passed = set()
    for (r,c) in combinations(df.columns, 2):
        if (abs(df.loc[r,c]) >= absThresh):
            passed.add(r)
            passed.add(c)
    passed = sorted(passed)
    return df.loc[passed,passed]

corr = df.corr()

In [None]:
mat = absHighPass(corr,0.978)
mask = np.triu(np.ones_like(mat))
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(mat, annot=True, mask=mask, cmap="viridis")
plt.show()

In [None]:
fig = sns.PairGrid(df[[9101, 9104, 9107, 9110, 6532]].dropna())
fig.map_upper(plt.scatter, color='blue')
fig.map_lower(sns.kdeplot, cmap='cool_d')
fig.map_diag(sns.distplot, bins=30);

In [None]:
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SecuritiesCode', 'Target']
STOCK = df_train[df_train.SecuritiesCode==Code][cols].set_index("Date")
TEST = df_test[df_test.SecuritiesCode==Code].set_index("Date")
display_html(STOCK)

In [None]:
STOCK

In [None]:
TEST

In [None]:
plt.figure(figsize=(15,7))
plt.title("Plot a Histogram of the Daily Closing Price - TRAIN set")
sns.distplot(STOCK['Close'].dropna(), bins=50, color='blue');

In [None]:
plt.figure(figsize=(15,7))
plt.title("Plot a Histogram of the Daily Closing Price - TEST set")
sns.distplot(TEST['Close'].dropna(), bins=50, color='magenta');

In [None]:
STOCK=STOCK.reset_index()
STOCK['Date']=pd.to_datetime(STOCK['Date'])
STOCK

In [None]:
STOCK=STOCK.set_index('Date')
STOCK

In [None]:
TEST

In [None]:
plt.figure(figsize=(15,7))
top = plt.subplot2grid((4,4), (0, 0), rowspan=3, colspan=4)
bottom = plt.subplot2grid((4,4), (3,0), rowspan=1, colspan=4)
top.plot(STOCK.index, STOCK.Close, label="Train set")
top.plot(TEST.index,TEST.Close, color="magenta", label="Test set")
bottom.bar(STOCK.index, STOCK.Volume)
bottom.bar(TEST.index, TEST.Volume, color="magenta")
top.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
 
# set the labels
top.axes.xaxis.set_ticklabels([])
top.axvline(TEST.index[0], color='red', linestyle='--')
bottom.axvline(TEST.index[0], color='red', linestyle='--')
top.set_title(Code)
top.grid(True)
top.set_ylabel('Closing Price')
bottom.set_ylabel('Volume')
bottom.grid(True);

In [None]:
df_test

In [None]:
d = 287
# days to plot
e = 15  # days to extrapolate
g = 2  # max degree of regression

width = .8
width2 = .1
col1 = 'green'
col2 = 'red'

LDAYS = STOCK.head(d)
up = LDAYS[LDAYS.Close>=LDAYS.Open]
down = LDAYS[LDAYS.Close<LDAYS.Open]
fut = TEST[:]


xd = (LDAYS.index).append(TEST.head(e).index)

x = [x for x in range(d)]
p = {}
for i in range(g):
    z = np.polyfit(x, LDAYS.Close, i+1)
    p[i] = np.poly1d(z)
x = np.array(range(d+e))

plt.figure(figsize=(15,7))
top = plt.subplot2grid((4,4), (0, 0), rowspan=3, colspan=4)
top.set_title(f"CandleStick Chart of last {d} days with {e} extrapolated days by polynomials of degrees until {g}")
top.axes.xaxis.set_ticklabels([])
for i in range(g):
    top.plot(xd, p[i](x), linewidth=1, label=f'Degree {i+1}')
top.plot(fut.index,fut.Close,  marker="o", markersize=5, color="magenta", linewidth=0, label="Test Close")
top.bar(up.index,up.Close-up.Open,width,bottom=up.Open,color=col1)
top.bar(up.index,up.High-up.Close,width2,bottom=up.Close,color=col1)
top.bar(up.index,up.Low-up.Open,width2,bottom=up.Open,color=col1)
top.bar(down.index,down.Close-down.Open,width,bottom=down.Open,color=col2)
top.bar(down.index,down.High-down.Open,width2,bottom=down.Open,color=col2)
top.bar(down.index,down.Low-down.Close,width2,bottom=down.Close,color=col2)
top.axvline(TEST.index[0], color='red', linestyle='--')
top.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
top.grid(True)

bottom = plt.subplot2grid((4,4), (3,0), rowspan=1, colspan=4)
bottom.bar(xd, np.append(LDAYS['Volume'].values,np.zeros(e))) 
bottom.bar(fut.index, fut.Volume, color="magenta") 
bottom.axvline(TEST.index[0], color='red', linestyle='--')

bottom.grid(True)
plt.show();


In [None]:
fut

In [None]:
TEST

In [None]:
d = 365 # days to plot
w = 3 # windows size

LDAYS = STOCK.tail(d)
fut = pd.concat([STOCK,TEST])
xd = (LDAYS.index).append(TEST.index)

fut['Cl_lr'] = fut.Close.rolling(window=w,closed="left").apply(lambda y: 
                   np.poly1d(np.polyfit(np.array(range(w)),y,1))(w),raw=True)

fut['Cl2'] = fut.Close.rolling(window=2,closed="left").apply(lambda y: 
                   np.poly1d(np.polyfit([0,1],y,1))(2),raw=True)

rho = np.corrcoef(fut.Close[-len(xd):],fut.Cl_lr[-len(xd):])

plt.figure(figsize=(15,7))
plt.plot(LDAYS.index, LDAYS.Close, label="Train set")
plt.plot(TEST.index,TEST.Close, color="magenta", label="Test set")
plt.plot(xd,fut.Cl_lr[-len(xd):], color="black", label="Rolling LR")
plt.axvline(TEST.index[0], color='red', linestyle='--')
plt.title(f"Rolling Liner Regression with {w}-days window - Pearson Correlation = {rho[0,1]:.3f}")
plt.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
plt.show();

In [None]:
# Generate diagonal line to plot.
fig, ax = plt.subplots(figsize=(8,8))
d_x = np.linspace(start=TEST.Close.min() - 1, stop=TEST.Close.max() + 1, num=100)
sns.regplot(x=TEST.Close, y=fut.Cl_lr[-len(TEST):], color='magenta', label='test', ax=ax)
sns.lineplot(x=d_x, y=d_x, dashes={'linestyle': ''}, color='blue', ax=ax)
ax.lines[1].set_linestyle('--')
ax.set(title=f'Test Data vs Predictions - Corr = {np.corrcoef(TEST.Close,fut.Cl_lr[-len(TEST):])[0,1]:.3f}');

In [None]:

# Generate diagonal line to plot.
rho = np.corrcoef(result.Close,result.yhat)
fig, ax = plt.subplots(figsize=(8,8))
d_x = np.linspace(start=TEST.Close.min() - 1, stop=TEST.Close.max() + 1, num=175)
sns.regplot(x=result.Close, y=result.yhat, color='magenta', label='test', ax=ax)
sns.lineplot(x=d_x, y=d_x, dashes={'linestyle': ''}, color='blue', ax=ax)
ax.lines[1].set_linestyle('--')
ax.legend(loc='upper left')
ax.set(title=f'Test Data vs Predictions - Corr = {np.corrcoef(result.Close,result.yhat)[0,1]:.3f}');

In [None]:

fig = m.plot(forecast)
plt.title(f"{Code} Stock Price Forecast", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Close Price", fontsize=12)
plt.axvline(TEST.index[0], color='red', linestyle='--')
plt.plot(TEST.index, TEST.Close,  marker="o", markersize=3, color="magenta", linewidth=0, label="Test Close")
plt.show()

In [None]:

fig2 = m.plot_components(forecast)
plt.show()

In [None]:
forecast.columns

In [87]:
df

SecuritiesCode,1301,1332,1333,1375,1376,1377,1379,1381,1407,1413,...,9982,9983,9984,9987,9989,9990,9991,9993,9994,9997
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-05,2819.0,453.0,2412.0,2062.0,1615.0,3775.0,2238.0,3330.0,2133.8,1978.0,...,1799.0,66790.0,6750.0,3980.0,4010.0,571.0,951.0,1794.0,2169.0,934.0
2020-10-06,2824.0,450.0,2398.0,2044.0,1588.0,3785.0,2236.0,3355.0,2203.1,1988.0,...,1829.0,66700.0,6913.0,4060.0,3985.0,573.0,952.0,1788.0,2185.0,943.0
2020-10-07,2780.0,446.0,2345.0,2035.0,1541.0,3730.0,2243.0,3300.0,2237.7,1990.0,...,1868.0,66490.0,6955.0,3980.0,3940.0,569.0,932.0,1808.0,2185.0,960.0
2020-10-08,2786.0,443.0,2345.0,2035.0,1502.0,3800.0,2254.0,3250.0,2330.8,1976.0,...,1862.0,67290.0,7085.0,4025.0,4045.0,568.0,944.0,1824.0,2195.0,953.0
2020-10-09,2799.0,436.0,2336.0,2099.0,1510.0,3745.0,2244.0,3300.0,2469.2,1989.0,...,1711.0,69220.0,6997.0,4020.0,4080.0,569.0,934.0,1840.0,2169.0,958.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-29,2951.0,567.0,2269.0,1248.0,1381.0,3125.0,1889.0,3160.0,6430.0,2126.0,...,1725.0,70500.0,6208.0,3070.0,3055.0,528.0,773.0,1680.0,2358.0,668.0
2021-11-30,2900.0,573.0,2277.0,1217.0,1348.0,3125.0,1842.0,3135.0,6550.0,2118.0,...,1690.0,67400.0,6030.0,3085.0,2989.0,520.0,768.0,1680.0,2328.0,667.0
2021-12-01,2911.0,574.0,2299.0,1231.0,1353.0,3135.0,1889.0,3185.0,6290.0,2123.0,...,1722.0,67080.0,5900.0,3090.0,2937.0,522.0,770.0,1671.0,2340.0,685.0
2021-12-02,2933.0,573.0,2303.0,1212.0,1327.0,3110.0,1867.0,3150.0,6070.0,2112.0,...,1680.0,67140.0,5599.0,3105.0,2947.0,507.0,778.0,1650.0,2362.0,684.0


In [103]:
pd.concat(df[1301],prophet_result.set_index('Date')['1301'])

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "Series"

In [None]:
prophet_result = prophet_result.pivot(index='Date', columns='SecuritiesCode', values='Close')
prophet_result

In [None]:
# Utilities 

def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    weights_mean = weights.mean()
    df = df.sort_values(by='Rank')
    purchase = (df['Target'][:portfolio_size]  * weights).sum() / weights_mean
    short    = (df['Target'][-portfolio_size:] * weights[::-1]).sum() / weights_mean
    return purchase - short

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    grp = df.groupby('Date')
    min_size = grp["Target"].count().min()
    if min_size<2*portfolio_size:
        portfolio_size=min_size//2
        if portfolio_size<1:
            return 0, None
    buf = grp.apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio, buf
def add_rank(df, col_name="pred"):
    df["Rank"] = df.groupby("Date")[col_name].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

In [None]:

# sns.set_style("whitegrid")
# sns.set(rc={'figure.figsize':(15.7,6)})

# def getadvance(x):
#     ret = 0
#     if x > 0:
#         ret = 1
#     return(ret)

# def get_month(dt):
#     x = dt.strftime("%m")
#     return(x)

# def RSI(series, period):
#     delta = series.diff().dropna()
#     u = delta * 0
#     d = u.copy()
#     u[delta > 0] = delta[delta > 0]
#     d[delta < 0] = -delta[delta < 0]
#     u[u.index[period-1]] = np.mean( u[:period] ) #first value is sum of avg gains
#     u = u.drop(u.index[:(period-1)])
#     d[d.index[period-1]] = np.mean( d[:period] ) #first value is sum of avg losses
#     d = d.drop(d.index[:(period-1)])
#     rs = pd.DataFrame.ewm(u, com=period-1, adjust=False).mean() / \
#          pd.DataFrame.ewm(d, com=period-1, adjust=False).mean()
#     return 100 - 100 / (1 + rs)

# def rsi_class(x):
#     ret = "low"
#     if x < 50:
#         ret = "low"
#     if x > 50:
#         ret = "med"
#     if x > 70:
#         ret = "hi"
#     return(ret)
# # os.listdir('../input/jpx-tokyo-stock-exchange-prediction/train_files/')
# def display_dataframe(df, title = ""):
#     #tdstring = f'<td style="text-align: left; vertical-align: middle; font-size:1.2em;">{v}</td>'
#     if (title != ""):
#         text = f'<h2>{title}</h2><table><tr>'
#     else:
#         text = '<table><tr>'
#     text += ''.join([f'<td style="text-align: left; vertical-align: middle; font-size:1.2em;"><b>{col}</b></td>' for col in df.columns.values]) + '</tr>'
#     for row in df.itertuples():
#         #text +=  '<tr>' + ''.join([f'<td valign="top">{v}</td>' for v in row[1:]]) + '</tr>'
#         text +=  '<tr>' + ''.join([ f'<td style="text-align: left; vertical-align: middle; font-size:1.1em;">{v}</td>' for v in row[1:]]) + '</tr>'
#     text += '</table>'
#     display(HTML(text))
    
# def prep_prices(price, test = False):
#     from decimal import ROUND_HALF_UP, Decimal
#     pcols = ["Open","High","Low","Close"]
#     price.ExpectedDividend.fillna(0,inplace=True)
#     def qround(x):
#         return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
    
#     def adjust_prices(df):
#         df = df.sort_values("Date", ascending=False)
#         df.loc[:, "CumAdjust"] = df["AdjustmentFactor"].cumprod()

#         # generate adjusted prices
#         for p in pcols:     
#             df.loc[:, p] = (df["CumAdjust"] * df[p]).apply(qround)
#         df.loc[:, "Volume"] = df["Volume"] / df["CumAdjust"]
#         df.ffill(inplace=True)
#         df.bfill(inplace=True)
        
#         # generate and fill Targets
#         #df.loc[:, "Target"] = ((df.Close.shift(-2)/df.Close.shift(-1) - 1)).fillna(df.Target)
#         if (not test):
#             df.Target.fillna(0,inplace=True)

#         return df

#     # generate Adjusted
#     price = price.sort_values(["SecuritiesCode", "Date"])
#     price = price.groupby("SecuritiesCode").apply(adjust_prices).reset_index(drop=True)
#     price = price.sort_values("RowId")
#     return price





In [None]:
def add_feat(df):
    df['C4'] = df.Close.rolling(window=4,closed="left").apply(lambda y: 
               np.poly1d(np.polyfit([0,1,2,3],y,1))(4),raw=True)     
    df['C3'] = df.Close.rolling(window=3,closed="left").apply(lambda y: 
               np.poly1d(np.polyfit([0,1,2],y,1))(3),raw=True)        
    df['C2'] = df.Close.rolling(window=2,closed="left").apply(lambda y: 
               np.poly1d(np.polyfit([0,1],y,1))(2),raw=True)
    return df

def run_prophet(tr):
    m = Prophet(holidays=holidays,
                daily_seasonality='auto',
                yearly_seasonality='auto',
                weekly_seasonality='auto',
                changepoint_prior_scale=0.1)
    m.add_regressor('C4')
    m.add_regressor('C3')
    m.add_regressor('C2')
    m.fit(tr)
    pred = m.predict(tr[-2:])
    return (pred.yhat[1]/pred.yhat[0] - 1)

def run_reg(tr):
    pred = list(0.6*tr.C2[-2:]+0.4*tr.C3[-2:])
    return (pred[1]/pred[0] - 1)
def proc_cod(cod, tr):
    tr = tr[tr.SecuritiesCode==cod][["Date","Close"]]
    x = [0,1]
    for _ in x:
        tr = tr.append(
        pd.DataFrame({'Date': pd.date_range(start=tr.Date.iloc[-1], 
                                            periods=2, freq='B', 
                                            closed='right'),
                      'Close': np.poly1d(np.polyfit(x,tr.Close[-2:],1))(2)
                     })
        )
    tr = add_feat(tr)
    tr = tr[6:]
    target = run_reg(tr)
    return target


In [100]:
prices

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False,-0.003263
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False,-0.008993
2,20211206_1333,2021-12-06,1333,2368.0,2388.0,2360.0,2377.0,125900,1.0,,False,-0.009963
3,20211206_1375,2021-12-06,1375,1230.0,1239.0,1224.0,1224.0,81100,1.0,,False,-0.015032
4,20211206_1376,2021-12-06,1376,1339.0,1372.0,1339.0,1351.0,6200,1.0,,False,0.002867
...,...,...,...,...,...,...,...,...,...,...,...,...
229953,20220527_9990,2022-05-27,9990,560.0,577.0,559.0,577.0,153200,1.0,,False,0.003378
229954,20220527_9991,2022-05-27,9991,819.0,819.0,804.0,819.0,18100,1.0,,False,-0.005995
229955,20220527_9993,2022-05-27,9993,1505.0,1513.0,1505.0,1513.0,2000,1.0,,False,0.009315
229956,20220527_9994,2022-05-27,9994,2430.0,2496.0,2430.0,2483.0,15700,1.0,,False,-0.013540


In [None]:
add_feat(df_test)

In [None]:

tr = df_test[df_test.Date==prices.Date.iat[0]].copy()
tr
# run_prophet(tr)

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
supplemental_stock_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")
supplemental_stock_prices

In [None]:
supplemental_stock_prices["Rank"] = supplemental_stock_prices.groupby("Date")["Target"].rank(ascending=False,method="first") -1
supplemental_stock_prices.tail(3)

In [None]:
finday = supplemental_stock_prices[supplemental_stock_prices["Date"]=="2022-05-27"].reset_index(drop=True)
finday

In [None]:
finday[finday["Rank"]==finday["Rank"].iloc[0]]
finday["Rank"] = finday["Rank"].astype("int")

In [None]:
counter = 0
# The API will deliver six dataframes in this specific order:
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    if counter == 0:
        print(prices.head())
        print(options.head())
        print(financials.head())
        print(trades.head())
        print(secondary_prices.head())
        print(sample_prediction.head())
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    env.predict(sample_prediction)
    counter += 1

In [None]:
findaydict = dict(zip(finday["SecuritiesCode"],finday["Rank"]))
findaydict

In [None]:
sample_prediction.head(3)

In [None]:
sample_prediction["Rank"]  = sample_prediction["SecuritiesCode"].map(findaydict)
sample_prediction

In [None]:
df_train

In [None]:
%%time


trgts = {}
for (prices,options, financials, trades, sample_prediction) in iter_test:
    cods = prices.SecuritiesCode.unique()
    df_train = pd.concat([df_train, prices])
    df_train = df_train.sort_values(["SecuritiesCode", "Date"])
    df_train.ffill(inplace=True)
    for cod in tqdm(cods):
        trgts[cod] = proc_cod(cod, df_train)
    tr = df_train[df_train.Date==prices.Date.iat[0]].copy()
    tr.Target=tr["SecuritiesCode"].map(trgts) 
    tr = add_rank(tr, "Target")
    score = calc_spread_return_per_day(tr,200,2)
    print(f"Score: {score}")
    pred = tr.set_index("SecuritiesCode")["Rank"]
    sample_prediction['Rank'] = sample_prediction["SecuritiesCode"].map(pred)
    env.predict(sample_prediction)

In [None]:
tr

In [None]:
trgts

In [None]:
df_train

In [None]:
# # advance to decline ratio in a month
# sns.set(rc={'figure.figsize':(14.7,3)})
# sns.set_style("whitegrid")
# seclist = [1301, 1332, 1333, 1376, 1377]
# for SECURITY in seclist:
# #seclist = [1301]
# #SECURITY = 1301
#     df_c = df_prices.copy()
#     df_stock = df_c[df_c['SecuritiesCode']== SECURITY].reset_index()
#     df_stock = df_stock.sort_values(by = "Date", ascending = False)
#     df_stock['pClose'] = df_stock['Close'].shift(-1)
#     df_stock['delta'] = df_stock['Close'] - df_stock['pClose']
#     df_stock['advance'] = list(map(getadvance, df_stock['delta']))
#     df_stock['Date'] = pd.to_datetime(df_stock['Date'], format = "%Y-%m-%d")
#     df_stock['Month'] =  list(map(get_month, df_stock['Date']))
#     df_stats = df_stock.groupby(["Month"]).agg(
#                         advances = ("advance", "sum"), total = ("advance",  "count")).reset_index()
#     df_stats['advance_to_decline'] = df_stats['advances'] / (df_stats['total'] - df_stats['advances'])
#     plt.title(f"Examining advance to decline ratio for:{SECURITY}")
#     ax  = sns.barplot(x="Month", y="advance_to_decline", data=df_stats, palette="Blues_d")
#     plt.show()

In [None]:
# sns.set_style("whitegrid")
# SECURITY = 1301
# df_stock = df_prices[df_prices['SecuritiesCode'] == SECURITY].reset_index(drop = True)
# df_stock = df_stock.sort_values(by = "Date").reset_index()
# df_stock['rsi'] = RSI( df_stock['Close'], 14 )
# df_stock['rsicat'] = list(map(rsi_class, df_stock['rsi']))
# fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize = (12,8))
# fig.subplots_adjust(hspace=0.5)
# ax1.plot(df_stock.index, df_stock['Close']);
# labels = [item.get_text() for item in ax1.get_xticklabels()]
# ax1.set_xticklabels(labels)
# ax2.plot(df_stock.index, df_stock['Volume']);
# labels = [item.get_text() for item in ax2.get_xticklabels()]
# ax2.set_xticklabels(labels)
# ax3.plot(df_stock.index, df_stock['rsi']);
# labels = [item.get_text() for item in ax3.get_xticklabels()]
# ax3.set_xticklabels(labels)
# plt.suptitle(f"Comparing Price Close, Volume and RSI for security:{SECURITY}")
# plt.tight_layout()

In [None]:
# # Utilities 

# def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
#     weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
#     weights_mean = weights.mean()
#     df = df.sort_values(by='Rank')
#     purchase = (df['Target'][:portfolio_size]  * weights).sum() / weights_mean
#     short    = (df['Target'][-portfolio_size:] * weights[::-1]).sum() / weights_mean
#     return purchase - short

# def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
#     grp = df.groupby('Date')
#     min_size = grp["Target"].count().min()
#     if min_size<2*portfolio_size:
#         portfolio_size=min_size//2
#         if portfolio_size<1:
#             return 0, None
#     buf = grp.apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
#     sharpe_ratio = buf.mean() / buf.std()
#     return sharpe_ratio, buf

# def add_rank(df, col_name="pred"):
#     df["Rank"] = df.groupby("Date")[col_name].rank(ascending=False, method="first") - 1 
#     df["Rank"] = df["Rank"].astype("int")
#     return df

In [None]:
# ## By Yuike - https://www.kaggle.com/code/ikeppyo/examples-of-higher-scores-than-perfect-predictions

# # This function adjusts the predictions so that the daily spread return approaches a certain value.
# # 측정을 위한 단위 만드는 함수        
# def adjuster(df):
#     def calc_pred(df, x, y, z):
        
#         return df['Target'].where(df['Target'].abs() < x, df['Target'] * y + np.sign(df['Target']) * z)

#     def objective(trial, df):
#         x = trial.suggest_uniform('x', 0, 0.2)
#         y = trial.suggest_uniform('y', 0, 0.05)
#         z = trial.suggest_uniform('z', 0, 1e-3)
#         df["Rank"] = calc_pred(df, x, y, z).rank(ascending=False, method="first") - 1 
#         return calc_spread_return_per_day(df, 200, 2)

#     def predictor_per_day(df):
#         study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SD))#5187
#         study.optimize(lambda trial: abs(objective(trial, df) - 3), 3)
#         return calc_pred(df, *study.best_params.values())

#     return df.groupby("Date").apply(predictor_per_day).reset_index(level=0, drop=True)

# def _predictor_base(feature_df):
#     return model.predict(feature_df[feats])

# def _predictor_with_adjuster(feature_df):
#     df_pred = feature_df.copy()
#     df_pred["Target"] = model.predict(feature_df[feats])
#     return adjuster(df_pred).values.T

In [None]:
# # Close는 종가가격/ Target은 수익율
# np.random.seed(0)
# feats = ["Close"]
# max_score = 0
# max_depth = 0
# for md in tqdm(range(3,40)):
#     model = DecisionTreeRegressor( max_depth=md ) # Controlling the overfit with max_depth parameter
#     model.fit(df_prices[feats],df_prices["Target"])
#     predictor = _predictor_base
#     prices["pred"] = predictor(prices)
#     score, buf = calc_spread_return_sharpe(add_rank(prices))
#     if score>max_score:
#         max_score = score
#         max_depth = md
        
# model = DecisionTreeRegressor( max_depth=max_depth )
# model.fit(df_prices[feats],df_prices["Target"])
# print(f'Max_deph={max_depth} : Sharpe Ratio Score base -> {max_score}')

In [None]:
# # Controlling the Sharpe Ratio Score (≃3)
# predictor = _predictor_with_adjuster
# err = 1
# maxSD = 3770
# for SD in tqdm(range(maxSD,5000)):
#     prices["pred"] = predictor(prices)
#     score, buf = calc_spread_return_sharpe(add_rank(prices))
#     if abs(score-3)<=err and score<3:
#         err=abs(score-3)
#         maxSD = SD
#         print(f'{maxSD} Sharpe Ratio Score with adjuster -> {score}')
        
# SD = maxSD

In [None]:
# %%time
# env = jpx_tokyo_market_prediction.make_env()
# iter_test = env.iter_test()

# for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
#     prices = fill_nans(prices)
#     prices.loc[:,"pred"] = predictor(prices)
#     prices = add_rank(prices)
#     rank = prices.set_index('SecuritiesCode')['Rank'].to_dict()
#     sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(rank)
#     env.predict(sample_prediction)