# FIAM Hackathon

## Importing the Necessary Libraries

In [1]:
import pandas as pd 
import pandas_datareader.data as web
import numpy as np 
import yfinance as yf 
import matplotlib.pyplot as plt 
import seaborn as sns 
import scipy as sp 
from sklearn import preprocessing, decomposition, model_selection, linear_model, neighbors, feature_selection, metrics  
# np.random.seed() returns None, so `rs = np.random.seed(123)` left `rs` set to None.
# Store the seed value itself in `rs` and seed NumPy's global generator with it.
rs = 123
np.random.seed(rs)
# pd.set_option("display.max_rows", None)  
# pd.set_option("display.max_columns", None) 
# Render floats with four decimal places in pandas output.
pd.set_option("display.float_format", "{:.4f}".format) 
import warnings 
# NOTE(review): this silences every warning category for the whole session.
warnings.filterwarnings("ignore") 

In [2]:
# Ask the inline matplotlib backend to emit figures in the "svg" format.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg") 

## Loading the Data

In [3]:
# Read the cleaned CSV, order rows by date then ticker (with a fresh 0..n-1 index),
# and drop the "shrcd" and "rf" columns.
# NOTE(review): the path below is an absolute, machine-specific location.
ticker_data = (
    pd.read_csv(
        "C:/Users/khail/OneDrive/Desktop/Github Projects/clean_hackathon_data.csv",
        parse_dates = ["date"],
    )
    .sort_values(by = ["date", "stock_ticker"], ignore_index = True)
    .drop(columns = ["shrcd", "rf"])
)
# Replace the parsed timestamps with "YYYY-MM" strings.
ticker_data["date"] = ticker_data["date"].dt.strftime("%Y-%m")
# ticker_data.set_index(["date", "stock_ticker"], inplace = True)

In [4]:
# Split by date: rows from "2000-01" up to (but excluding) "2010-01" form the
# training set; rows from "2010-01" onward form the test set.
date_col = ticker_data["date"]
train_mask = (date_col >= "2000-01") & (date_col < "2010-01")
test_mask = date_col >= "2010-01"

# Independent copies, so later edits do not write back into ticker_data.
ticker_train = ticker_data.loc[train_mask].copy( )
ticker_test = ticker_data.loc[test_mask].copy( )

In [5]:
# Show the training subset as this cell's output.
ticker_train 

Unnamed: 0,date,exchcd,mspread,year,month,stock_exret,stock_ticker,eps_medest,eps_meanest,prc,...,betadown_252d,bidaskhl_21d,corr_1260d,betabab_1260d,rmax5_rvol_21d,age,qmj,qmj_prof,qmj_growth,qmj_safety
0,2000-01,1,0.0083,2000,1,-0.1645,AA,1.6300,1.4000,0.7186,...,-0.8435,-0.1793,-0.4296,-0.4097,0.4194,0.0552,0.7280,0.1597,0.8241,0.4226
1,2000-01,3,0.0163,2000,1,0.0050,AAPL,0.0100,0.0100,0.8241,...,0.9032,0.4087,-0.5181,0.0722,-0.0355,-0.6442,0.6339,0.5790,0.6176,0.0468
2,2000-01,1,0.0102,2000,1,-0.1149,ABF,0.5500,0.4900,-0.6156,...,-0.6548,-0.5574,0.5505,0.5866,0.1806,-0.3006,0.6687,0.5984,0.5481,0.2000
3,2000-01,3,0.0190,2000,1,-0.0079,ABGX,-0.0200,-0.0200,0.9146,...,0.5387,0.9418,0.0000,0.0000,0.9984,-0.9509,0.0000,-0.9952,0.0000,0.2048
4,2000-01,3,0.0284,2000,1,0.0636,ABIZ,-1.1000,-1.0500,0.1508,...,0.3097,0.7819,0.0000,0.0000,0.8145,-0.9264,0.0000,-0.9597,0.0000,-0.9935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114377,2009-12,3,0.0063,2009,12,0.1208,YHOO,0.1100,0.1100,-0.7345,...,-0.1091,-0.5909,-0.8407,-0.5550,-0.9977,-0.7956,0.2625,0.0739,-0.3835,0.8273
114378,2009-12,1,0.0039,2009,12,-0.0086,YUM,0.4700,0.4800,0.0889,...,-0.4159,-0.5455,0.3911,-0.2623,0.2205,-0.7956,0.3341,0.6951,0.6679,-0.5364
114379,2009-12,3,0.0056,2009,12,0.0653,ZBRA,0.2500,0.2500,-0.2667,...,-0.3591,-0.9455,0.1803,-0.3747,0.3091,-0.6934,0.1408,0.1490,-0.4122,0.6136
114380,2009-12,3,0.0076,2009,12,-0.0244,ZION,-1.6200,-1.6100,-0.8002,...,0.8909,0.9545,-0.2881,0.9251,-0.2068,-0.3139,-0.9117,-0.9477,-0.7778,-0.6841


### Time Series CV : Rolling Window 

In [6]:
class RollingTimeSeriesCV :
    """Rolling-window time-series splitter that yields fold boundary dates.

    Folds are built backwards from the most recent value in ``X["date"]``:
    fold 0 tests on the latest ``test_duration`` unique dates, fold 1 on the
    ``test_duration`` unique dates before those, and so on. Each training
    window spans ``train_duration`` unique dates and is separated from its
    test window by ``lookahead`` skipped unique dates.

    Note that ``split`` yields date labels (the values found in ``X["date"]``),
    not positional index arrays.

    Parameters
    ----------
    train_duration : int
        Number of unique dates in each training window.
    test_duration : int
        Number of unique dates in each test window.
    lookahead : int
        Number of unique dates left out between the end of the training
        window and the start of the test window.
    n_splits : int
        Number of folds to generate.
    """

    def __init__(self, train_duration, test_duration, lookahead, n_splits) :
        self.test_duration = test_duration
        self.train_duration = train_duration
        self.lookahead = lookahead
        self.n_splits = n_splits

    def split(self, X, y = None, groups = None) :
        """Yield ``(train_start, train_end, test_start, test_end)`` for each fold.

        Parameters
        ----------
        X : mapping-like with a ``"date"`` entry exposing ``.unique()``
            Only ``X["date"]`` is read.
        y, groups : ignored
            Accepted but never used.

        Yields
        ------
        tuple
            Four date labels, most recent fold first. All four bounds are
            inclusive positions in the sorted list of unique dates. A bound
            that would lie beyond the oldest available date is yielded as
            ``None``.
        """
        # Position 0 is the most recent date, so larger positions are older dates.
        days = sorted(X["date"].unique( ), reverse = True)
        n_days = len(days)

        def date_at(position) :
            # Translate a position into a date; positions past the oldest date give None.
            return days[position] if position < n_days else None

        for fold in range(self.n_splits) :
            # Test window: `test_duration` consecutive dates, newest at test_end_idx.
            test_end_idx = fold * self.test_duration
            test_start_idx = test_end_idx + self.test_duration - 1

            # Skip `lookahead` dates, then take `train_duration` consecutive dates.
            train_end_idx = test_start_idx + 1 + self.lookahead
            train_start_idx = train_end_idx + self.train_duration - 1

            yield (
                date_at(train_start_idx),
                date_at(train_end_idx),
                date_at(test_start_idx),
                date_at(test_end_idx),
            )

    def get_n_splits(self, X = None, y = None, groups = None) :
        """Return the configured number of folds.

        All arguments are ignored; they default to ``None`` so the method can
        be called with no arguments as well as with ``(X, y, groups)``.
        """
        return self.n_splits

### Test Time Series CV 

**Key Consideration** :
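> $\text{train_duration} + \text{test_duration} \times \text{n_splits} + \text{lookahead} \le \text{total number of months in training set}$
>
> If this does not hold, `split` returns `None` for any boundary date that would fall before the earliest month in the data.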

In [7]:
# This configuration needs 70 + 6 * 8 + 1 = 119 unique months in the frame passed to split().
cv = RollingTimeSeriesCV(train_duration = 70, test_duration = 6, lookahead = 1, n_splits = 8)

In [8]:
# Print the boundary dates of every fold produced by the splitter.
for fold_dates in cv.split(X = ticker_train) :
    train_start_date, train_end_date, test_start_date, test_end_date = fold_dates

    print(f"Train : {train_start_date} to {train_end_date} , Test : {test_start_date} to {test_end_date}")
    print("")

    # Disabled slicing, kept for reference.
    # NOTE(review): the end dates printed above are the last month of each window,
    # so the strict `<` comparisons below would leave that month out if re-enabled.
    # train_df = ticker_data[(ticker_data["date"] >= train_start_date) & (ticker_data["date"] < train_end_date)]
    # test_df = ticker_data[(ticker_data["date"] >= test_start_date) & (ticker_data["date"] < test_end_date)]

    # Reset the index and concatenate the train and test sets
    #split_df = pd.concat(objs = [train_df.reset_index(drop = True), test_df.reset_index(drop = True)])
    #print(split_df.index)
    #print("")

Train : 2003-08 to 2009-05 , Test : 2009-07 to 2009-12

Train : 2003-02 to 2008-11 , Test : 2009-01 to 2009-06

Train : 2002-08 to 2008-05 , Test : 2008-07 to 2008-12

Train : 2002-02 to 2007-11 , Test : 2008-01 to 2008-06

Train : 2001-08 to 2007-05 , Test : 2007-07 to 2007-12

Train : 2001-02 to 2006-11 , Test : 2007-01 to 2007-06

Train : 2000-08 to 2006-05 , Test : 2006-07 to 2006-12

Train : 2000-02 to 2005-11 , Test : 2006-01 to 2006-06

