In [3]:
import os
# Resolve the data files relative to the current working directory.
baseDir = os.getcwd()
dataLocation = 'data/short_clean_closepx.csv'
dataFilePath = os.path.join(baseDir, dataLocation)
# NOTE(review): `longData` is a file path here, but the same name is later
# bound to the loaded DataFrame; a distinct name for the path would be clearer.
longData = os.path.join(baseDir, 'data/Master_Data.csv')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load both price tables, using the first CSV column as the index.
shortData = pd.read_csv(dataFilePath, index_col=0)
# The path is built inline instead of being read from `longData`: that name is
# rebound to the DataFrame on this very line, so reading the path from it
# would break on a second run of this cell.
longData = pd.read_csv(os.path.join(baseDir, 'data/Master_Data.csv'), index_col=0)

In [5]:
# Preview the first rows of the short price table.
shortData.head()

Unnamed: 0_level_0,MSFT,AAPL,AMZN,JPM,GOOG,GOOGL,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-03,26.74,4.520714,44.52,39.150002,100.700043,101.456459,50.09
2005-01-04,26.84,4.567143,42.139999,38.41,96.621567,97.347351,49.75
2005-01-05,26.780001,4.607143,41.77,38.490002,96.129768,96.851852,49.490002
2005-01-06,26.75,4.610714,41.049999,38.709999,93.665794,94.36937,50.119999
2005-01-07,26.67,4.946429,42.32,38.400002,96.298668,97.022018,49.790001


In [8]:
# Drop every column that contains at least one missing value, then preview.
longData = longData.dropna(axis=1)
longData.head()

Unnamed: 0_level_0,MSFT,AAPL,AMZN,JPM,GOOG,GOOGL,XOM,BAC,PG,CSCO,...,GPS,NKTR,ROL,HRB,LEG,AIZ,JWN,MAC,FLR,MAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,26.74,4.520714,44.52,39.150002,100.700043,101.456459,50.09,46.459999,55.189999,19.32,...,20.969999,19.76,3.426502,24.344999,27.719999,30.700001,23.200001,57.543472,26.525,19.17
2005-01-04,26.84,4.567143,42.139999,38.41,96.621567,97.347351,49.75,45.82,54.5,18.559999,...,20.65,19.23,3.361975,24.049999,27.709999,30.1,22.809999,56.617592,26.120001,19.190001
2005-01-05,26.780001,4.607143,41.77,38.490002,96.129768,96.851852,49.490002,45.290001,55.07,18.57,...,20.74,18.799999,3.37251,23.455,27.51,29.85,23.530001,53.886242,25.42,18.889999
2005-01-06,26.75,4.610714,41.049999,38.709999,93.665794,94.36937,50.119999,45.220001,55.34,18.85,...,20.950001,19.09,3.394897,23.895,27.959999,30.6,23.84,53.738102,25.535,18.879999
2005-01-07,26.67,4.946429,42.32,38.400002,96.298668,97.022018,49.790001,44.73,55.919998,18.719999,...,20.719999,19.190001,3.344856,23.360001,27.780001,30.379999,23.924999,53.701069,25.405001,18.77


In [9]:
def YearsContained(data):
    """
    Purpose: Return the calendar years found in the index of the data.
    Input: pd.DataFrame or pd.Series whose index holds dates, either as a
        DatetimeIndex or as values that pd.to_datetime can parse.
    Output: sorted list of the distinct years (as int) found in the index.
    Note: the input is left untouched; a non-datetime index is converted on
        a local variable only.
    """
    index = data.index
    if not isinstance(index, pd.DatetimeIndex):
        index = pd.to_datetime(index)
    # int() guarantees plain Python ints regardless of the index dtype.
    return sorted({int(year) for year in index.year})

In [10]:
def ByYear(data, year):
    '''
    Purpose: filter date-indexed data down to one or more calendar years.
    Input:
      data: pd.DataFrame or pd.Series whose index holds dates (a
        DatetimeIndex, or values that pd.to_datetime can parse).
      year: an int, or a list/tuple of ints.
    Returns:
      If year is an int, the rows of data whose index falls in that year,
      indexed by date.
      If year is a list or tuple, a dictionary keyed by each year (int).
      Each value holds that year's rows with the date index replaced by
      1..N, the position of the row within the year.
    Raises:
      TypeError if the index can not be converted to datetimes.
      AssertionError if a list/tuple of years contains a non-int.
    Note: the object passed in by the caller is never modified.
    '''
    if not isinstance(data.index, pd.DatetimeIndex):
        try:
            converted = pd.to_datetime(data.index)
        except Exception as exception_object:
            # Fail here with a clear message instead of printing and then
            # crashing later on the `.year` lookup.
            raise TypeError(
                'Can not convert index type to Datetime') from exception_object
        # Shallow copy: shares the values but lets the index be replaced
        # without touching the caller's object.
        data = data.copy(deep=False)
        data.index = converted
    if isinstance(year, (list, tuple)):
        request = {}
        for n in year:
            assert type(n) == int, 'Year must contain type int.'
            cut = data[data.index.year == n]
            # replaces the datetime index with an int range.
            cut.index = range(1, len(cut) + 1)
            request[n] = cut
        return request
    return data[data.index.year == year]

In [11]:
def ByStockAndYear(data):
    '''
    Purpose: take a table of stock data and return it broken down by
        stock and year.
    Input: pd.DataFrame with one column per ticker, or a pd.Series for a
        single stock. The index holds dates (a DatetimeIndex, or values
        that pd.to_datetime can parse).
    Returns:
        For a DataFrame: dictionary where keys=tickers and
        values=DataFrame. Within the values, the columns are calendar
        years and the index is the number of the day in the year.
        For a Series: the dictionary produced by ByYear, keyed by year.
    Note: the object passed in by the caller is never modified.
    '''
    if not isinstance(data.index, pd.DatetimeIndex):
        converted = pd.to_datetime(data.index)
        # Shallow copy so the caller's index is left as it was.
        data = data.copy(deep=False)
        data.index = converted

    # The years depend only on the index, so work them out once.
    years = YearsContained(data)

    # if data for one stock is entered
    if isinstance(data, pd.Series):
        return ByYear(data, years)

    # if data for multiple stocks is entered
    return {
        ticker: pd.DataFrame(ByYear(data[ticker], years))
        for ticker in data.columns
    }

In [12]:
def RollCorr(data, period):
    '''
    Purpose: rolling pairwise correlation between the columns of every
        DataFrame in a dictionary.
    input: data=dictionary containing dataframes (designed for the output
               of ByStockAndYear),
           period=window to use for the rolling periods.
    output: dictionary.
            Key=same key as the input (ticker symbol).
            Value=pd.DataFrame with a multilevel index;
                  columns=the columns of the input frame,
                  outer index level=the row the window ends on,
                  inner index level=the input column being correlated.
                  Rows containing any NaN are dropped.
    '''
    return {
        ticker: frame.rolling(window=period).corr().dropna()
        for ticker, frame in data.items()
    }

In [13]:
def SeasonCorrTest(dataDict, dropnum, n):
    '''
    Purpose: keep only the strong correlations from rolling-correlation
        tables (designed to take the RollCorr output).
    Input:
        dataDict: dictionary of DataFrames with a two-level index.
        dropnum: passed to dropna(thresh=...); after index level 0 is
            moved into the columns, a column is kept only if it holds at
            least this many non-NaN values.
        n: the minimum absolute correlation level. Values of 0.99 or
            above are always discarded.
    Output: DICTIONARY with the same keys as dataDict. Each value is a
        pd.Series of the surviving correlations with a three-level index
        (original column, original index level 0, original index level 1).
    '''
    request = {}
    for stock in dataDict:
        corr = dataDict[stock]
        strong = (corr >= n) | (corr <= -n)
        below_cutoff = corr < 0.99
        # Everything that fails the test becomes NaN.
        masked = corr[strong & below_cutoff]
        wide = masked.unstack(level=0)
        kept = wide.dropna(axis=1, thresh=dropnum)
        request[stock] = kept.unstack().dropna()
    return request

In [14]:
def HighCorrDays(data):
    '''
    Purpose: list the days that appear in the filtered correlations.
    Input: returned item from func: SeasonCorrTest(), i.e. a dictionary of
        objects whose index entries are 3-tuples.
    Output: dictionary. Key=stock, value=sorted list of the distinct
        middle elements of the index tuples (the days that had high
        correlation, as established in SeasonCorrTest()).
    '''
    request = {}
    for stock, flagged in data.items():
        labels = flagged.index
        unique_days = set()
        for position in range(len(labels)):
            _, day, _ = labels[position]
            unique_days.add(day)
        request[stock] = sorted(unique_days)
    return request

In [60]:
def PctReturnForDays(data, pxData, periods):
    '''
    Purpose: to extract 2 items from days that had high correlation.
             1: avg return for the period that generated a high corr.
             2: return details by calendar year for period of high corr.
    input: data = output from func HighCorrDays
                  (dictionary: ticker -> iterable of day numbers),
           pxData = price data from which to pull the %returns,
                  preferably from output of func ByStockAndYear
                  (dictionary: ticker -> DataFrame indexed by day number),
           periods = rolling time frame used in data.
    return: 3 level dictionary with the average return for the rolling
                period and all the percent returns by year for the period.
            level 1 key = 'ticker'
            level 2 key = 'DayN' where N=int() of the day analyzed
            level 3 key = 2 keys: key1='AvgReturn', key2='ReturnDetails'
            Both values are percentages rounded to 2 decimals.
    Note: the return runs from day (day - periods) to day. When that start
          would fall before day 1, day 1 is used instead.
    '''
    request = {}
    for stock, days in data.items():
        requestValue = {}
        for day in days:
            # px at the day at which the high correlation occurred
            end = pxData[stock].loc[day]
            # px N days prior to end day, clamped to the first day
            start = pxData[stock].loc[max(day - periods, 1)]
            pctChange = (end - start) / start
            requestValue[f'Day{day}'] = {
                'AvgReturn': round(pctChange.mean() * 100, 2),
                'ReturnDetails': round(pctChange * 100, 2),
            }
        request[stock] = requestValue
    return request

In [88]:
def ExecSummaryCorr(data, printupdate=False):
    '''
    Purpose: summarise the per-year returns of every high-correlation day.
    input: data = returned item from func PctReturnForDays
    output: 3 level dictionary
        level 1 keys = ticker (tickers with no days are left out)
        level 1 value = dict
        level 2 keys = 'DayN', the day with results
        level 2 value = dict
        level 3 keys = 'TotalTrades', 'NumPos', 'NumNeg',
                       'AvgReturnOnPos', 'AvgReturnOnNeg'
        level 3 value = results
    Note: 'NumNeg' is TotalTrades - NumPos, so a return of exactly 0 is
        counted there, while 'AvgReturnOnNeg' averages only returns < 0.
    kwargs: printupdate = when truthy, prints 'done/total' progress after
        each ticker is reached, followed by a final newline.
    '''
    request = {}
    outOf = len(data)
    # Counting from 1 makes the last progress entry read total/total.
    for status, (stock, days) in enumerate(data.items(), start=1):
        if printupdate:
            print(f'{status}/{outOf}', end=' | ')
        if len(days) < 1:
            continue
        requestValue = {}
        for day, details in days.items():
            returns = details['ReturnDetails']
            posTest = returns > 0
            totalTrades = returns.count()
            daysPos = returns[posTest].count()
            requestValue[day] = {
                'TotalTrades': totalTrades,
                'NumPos': daysPos,
                'NumNeg': totalTrades - daysPos,
                'AvgReturnOnPos': round(returns[posTest].mean(), 2),
                'AvgReturnOnNeg': round(returns[returns < 0].mean(), 2),
            }
        request[stock] = requestValue
    if printupdate:
        print()
    return request

In [41]:
# Short-universe pipeline. The same window must be used for the rolling
# correlation and for the return calculation, so it is named once.
window = 60
# The 2019 column is removed before analysis (presumably because that year
# is incomplete; TODO confirm). errors='ignore' keeps this working for data
# that has no 2019 rows.
A = {
    ticker: frame.drop(columns=[2019], errors='ignore')
    for ticker, frame in ByStockAndYear(shortData).items()
}
B = RollCorr(A, period=window)
C = SeasonCorrTest(B, dropnum=10, n=0.75)
D = HighCorrDays(C)
E = PctReturnForDays(D, A, periods=window)

In [25]:
# Long-universe split by ticker and year. The 2019 column is removed
# (presumably because that year is incomplete; TODO confirm).
# errors='ignore' keeps this working for data that has no 2019 rows.
a = {
    ticker: frame.drop(columns=[2019], errors='ignore')
    for ticker, frame in ByStockAndYear(longData).items()
}

<class 'dict'> <class 'dict'>


In [23]:
# Rolling 60-row correlations between the columns of every frame in `a`.
b = RollCorr(a, period=60)

In [37]:
# Keep correlations of at least 0.75 in absolute value; dropnum=10 is the
# non-NaN threshold SeasonCorrTest passes to dropna(thresh=...).
c = SeasonCorrTest(b, dropnum=10, n=0.75)

In [38]:
# Per ticker, the sorted days that survived the correlation filter.
d = HighCorrDays(c)

In [61]:
# Percent returns leading into each of those days; periods matches the
# window of 60 used in the RollCorr call.
e = PctReturnForDays(d, a, periods=60)

In [67]:
# Inspect the short-universe result for AMZN.
E['AMZN']

{}

In [None]:
import CorrHelper

In [89]:
# Summarise trade counts and average returns per ticker and day.
f = ExecSummaryCorr(e)

In [91]:
# Compare the number of tickers in the summary with the number of tickers
# in `a`, then list the tickers that made it into the summary.
print(len(f.keys()))
print(len(a.keys()))
print(f.keys())
    

58
414
dict_keys(['MSFT', 'AAPL', 'GOOG', 'GOOGL', 'NFLX', 'CRM', 'AMGN', 'NKE', 'MMM', 'QCOM', 'LMT', 'BKNG', 'AXP', 'BLK', 'SPGI', 'GD', 'OXY', 'AON', 'ATVI', 'ROST', 'ETN', 'RHT', 'MCO', 'EA', 'PAYX', 'DLTR', 'PXD', 'ALGN', 'MCK', 'GPN', 'CCL', 'KLAC', 'HRS', 'IDXX', 'VRSN', 'BLL', 'TSS', 'FCX', 'EFX', 'IFF', 'AKAM', 'WCG', 'VAR', 'KSS', 'AES', 'LNT', 'COG', 'JKHY', 'UHS', 'JNPR', 'GRMN', 'FFIV', 'WHR', 'CPB', 'SEE', 'XEC', 'HP', 'MAT'])


In [65]:
def ExecSummaryCorr(data=None, printupdate=False):
    '''
    Purpose: summarise the per-year returns of every high-correlation day.
    NOTE(review): ExecSummaryCorr is defined twice in this notebook; on a
        top-to-bottom run this later definition is the one that stays bound.
    input: data = returned item from func PctReturnForDays. When omitted,
        the notebook-level variable `e` is used, looked up at call time.
    output: 3 level dictionary
        level 1 keys = ticker (tickers with no days are left out)
        level 2 keys = 'DayN', the day with results
        level 3 keys = 'TotalTrades', 'NumPos', 'NumNeg',
                       'AvgReturnOnPos', 'AvgReturnOnNeg'
    Note: 'NumNeg' is TotalTrades - NumPos, so a return of exactly 0 is
        counted there, while 'AvgReturnOnNeg' averages only returns < 0.
    kwargs: printupdate = when truthy, prints 'done/total' progress after
        each ticker is reached, followed by a final newline.
    '''
    if data is None:
        # Late binding: a default of `data=e` would capture whatever `e`
        # was when this cell ran, and fail outright if `e` did not exist.
        data = e
    request = {}
    outOf = len(data)
    # Counting from 1 makes the last progress entry read total/total.
    for status, (stock, days) in enumerate(data.items(), start=1):
        if printupdate:
            print(f'{status}/{outOf}', end=' | ')
        if len(days) == 0:
            continue
        requestValue = {}
        for day, details in days.items():
            returns = details['ReturnDetails']
            posTest = returns > 0
            totalTrades = returns.count()
            daysPos = returns[posTest].count()
            requestValue[day] = {
                'TotalTrades': totalTrades,
                'NumPos': daysPos,
                'NumNeg': totalTrades - daysPos,
                'AvgReturnOnPos': round(returns[posTest].mean(), 2),
                'AvgReturnOnNeg': round(returns[returns < 0].mean(), 2),
            }
        request[stock] = requestValue
    if printupdate:
        print()
    return request

{'MSFT': {'Day112': {'TotalTrades': 14,
   'NumPos': 8,
   'NumNeg': 6,
   'AvgReturnOnPos': 12.65,
   'AvgReturnOnNeg': -10.21},
  'Day113': {'TotalTrades': 14,
   'NumPos': 8,
   'NumNeg': 6,
   'AvgReturnOnPos': 12.89,
   'AvgReturnOnNeg': -9.07},
  'Day115': {'TotalTrades': 14,
   'NumPos': 8,
   'NumNeg': 6,
   'AvgReturnOnPos': 11.62,
   'AvgReturnOnNeg': -8.18},
  'Day116': {'TotalTrades': 14,
   'NumPos': 8,
   'NumNeg': 6,
   'AvgReturnOnPos': 12.56,
   'AvgReturnOnNeg': -7.83},
  'Day117': {'TotalTrades': 14,
   'NumPos': 8,
   'NumNeg': 6,
   'AvgReturnOnPos': 13.07,
   'AvgReturnOnNeg': -8.21},
  'Day244': {'TotalTrades': 14,
   'NumPos': 11,
   'NumNeg': 3,
   'AvgReturnOnPos': 11.92,
   'AvgReturnOnNeg': -14.12},
  'Day245': {'TotalTrades': 14,
   'NumPos': 11,
   'NumNeg': 3,
   'AvgReturnOnPos': 12.8,
   'AvgReturnOnNeg': -15.31}},
 'AAPL': {'Day172': {'TotalTrades': 14,
   'NumPos': 12,
   'NumNeg': 2,
   'AvgReturnOnPos': 16.36,
   'AvgReturnOnNeg': -11.55},
  'Day173

In [32]:
# Inspect the summary for MSFT.
f['MSFT']

{'Day112': {'TotalTrades': 14,
  'NumPos': 8,
  'NumNeg': 6,
  'AvgReturnOnPos': 12.65,
  'AvgReturnOnNeg': -10.21},
 'Day113': {'TotalTrades': 14,
  'NumPos': 8,
  'NumNeg': 6,
  'AvgReturnOnPos': 12.89,
  'AvgReturnOnNeg': -9.07},
 'Day115': {'TotalTrades': 14,
  'NumPos': 8,
  'NumNeg': 6,
  'AvgReturnOnPos': 11.62,
  'AvgReturnOnNeg': -8.18},
 'Day116': {'TotalTrades': 14,
  'NumPos': 8,
  'NumNeg': 6,
  'AvgReturnOnPos': 12.56,
  'AvgReturnOnNeg': -7.83},
 'Day117': {'TotalTrades': 14,
  'NumPos': 8,
  'NumNeg': 6,
  'AvgReturnOnPos': 13.07,
  'AvgReturnOnNeg': -8.21},
 'Day244': {'TotalTrades': 14,
  'NumPos': 11,
  'NumNeg': 3,
  'AvgReturnOnPos': 11.92,
  'AvgReturnOnNeg': -14.12},
 'Day245': {'TotalTrades': 14,
  'NumPos': 11,
  'NumNeg': 3,
  'AvgReturnOnPos': 12.8,
  'AvgReturnOnNeg': -15.31}}