In [None]:
'''
Code for running my analysis on pct change of the SPX underlying positions over a 30, 60, 90 day periods.
Analysis starts with the daily close price (closepx) of all SPY underlying.  
This assumes you have an excel or csv file saved in the directory with the historical closing prices.
'''

In [None]:
import os

In [None]:
os.listdir()

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# run this cell if you want to charts to appear in a pop-up window.
# this method is recommended
%matplotlib auto

Using matplotlib backend: Qt5Agg


<h2>Import the data</h2>

In [3]:
rough_data = pd.read_excel('small_closepx_data.xls', index_col=0)

In [None]:
# rough_data.head()

<h2>Clean the Data</h2>

In [4]:
# this sections removes the columns with NaN values.
# I remove the columns as I would like a full history to analyze

clean_data = rough_data.dropna(axis=1)


In [None]:
'''
print('Change in columns: ', len(clean_data.columns) - len(rough_data.columns))
for ticker in rough_data.columns:
    if ticker not in clean_data.columns:
        print(ticker)
'''

In [None]:
# Save the clean data to file
#clean_data.to_csv('data/short_clean_closepx.csv')


<h2>Review Clean Data</h2>

In [5]:
# pandas describe summary of the closing px for each ticker

px_summary = clean_data.describe()

In [None]:
print(px_summary.transpose())

<h4>Manually calculate variance and std dev</h4>

In [6]:
variance = (((clean_data-clean_data.mean())**2).sum(axis=0))/\
              (len(clean_data)-1)

variance2 = clean_data.var()

In [None]:
variance == variance2

In [None]:
'''
px_variance = {}
for ticker in clean_data:
    variance = sum(((clean_data[ticker] - clean_data[ticker].mean())**2) / 
              (len(clean_data[ticker] - 1)))
    px_variance[ticker] = variance
    
px_variance
'''

In [None]:
# Std Dev method 1
from math import sqrt

px_std1 = {}
for ticker in clean_data:
    std = sqrt(px_variance[ticker])
    px_std1[ticker] = std

print(px_std1)


px_std = clean_data.std()
print(px_std)

<h4>Rolling Std Dev</h4>

In [7]:
px_rolling_std = clean_data.rolling(20).std()
px_rolling_std = px_rolling_std.dropna()


In [None]:
clean_data.rolling(20).std()

In [None]:
# print(type(px_rolling_std))
# print(px_rolling_std.tail(10))

In [None]:
# px_rolling_std.plot()

<h4>Plot rolling std against price for each position.</h4>
<ol><li><h4>Select specific chart to run</h4></li></ol>

In [None]:
# if you want to chart a specific symbol, use this cell to run your chart.
# otherwise you can cycle through the charts using 1 of the 2 funcs below.
fig, ax1 = plt.subplots()

color = 'tab:orange'
ax1.set_xlabel('Date')
ax1.set_ylabel('Std Dev', color=color)
ax1.plot(px_rolling_std['MSFT'], color=color)

ax2 = ax1.twinx()

color = 'tab:blue'
ax2.set_ylabel('Price', color=color)
plt.plot(clean_data['MSFT'], color=color)


plt.show()

<ol start='2'><li><h4>Cycle through charts with Std Dev and Px in same Pane</h4></li></ol>

In [None]:
# Plots Std Dev and Price in same chart (share x-axis)
def cycle_charts():
    for ticker in clean_data.columns:

        fig, ax1 = plt.subplots()
        color = 'tab:orange'
        ax1.set_title(ticker)
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Std Dev', color=color)
        ax1.plot(px_rolling_std[ticker], color=color)

        ax2 = ax1.twinx()
        color = 'tab:blue'
        ax2.set_ylabel('Price', color=color)
        plt.plot(clean_data[ticker], color=color)

        yield plt.show()
    

In [None]:
cycle = cycle_charts()

In [None]:
# re-run this cell to cycle through viewing each chart
next(cycle)

<ol start='3'><li><h4>
  Cycle through charts with Std Dev and Px on different Panes
</h4></li></ol>

In [None]:
# Plots Price in chart's pane 1 and Std Dev in chart's pane 2

def cycle_charts2():
    for ticker in clean_data.columns:
        avg_std = px_rolling_std[ticker].mean()
        
        fig, axes = plt.subplots(2,1)
        axes[0].set_title(ticker)
        axes[0].set_xlabel('Date')
        # Top Pane
        axes[0].plot(clean_data[ticker], color='blue')
        axes[0].set_ylabel('Price')
        # Bottom Pane
        axes[1].set_xlabel('Date')
        axes[1].plot(px_rolling_std[ticker], color='orange', label='Rolling Std Dev')
        axes[1].axhline(y=avg_std, label='Std Dev Mean')
        axes[1].set_ylabel('Std Dev')
        axes[1].legend()
        yield plt.show()  
        
        
    

In [None]:
# run this cell to begin cycling through viewing each chart
cycle = cycle_charts2()

In [None]:
# run this cell when ready to move onto next chart
next(cycle)

<h2>Note:</h2>
<p>Keep in mind that the std dev is increasing over time because the price of the underlying
is increasing.</p>
<p>So we are finding the std dev of larger numbers hence the increase in std dev</p>
<p>We will now examine the % return and accompaniyng stats.</p>

<h2>Calculating % Change over N periods</h2>

In [8]:
# 20 day rolling return
pctChange_20 = round((((clean_data - clean_data.shift(20)) / clean_data.shift(20)) * 100), 0)
pctChange_20 = pctChange_20.dropna()

# 40 day rolling return
pctChange_40 = round((((clean_data - clean_data.shift(40)) / clean_data.shift(40)) * 100), 0)
pctChange_40 = pctChange_40.dropna()

# 60 day rolling return
pctChange_60 = round((((clean_data - clean_data.shift(60)) / clean_data.shift(60)) * 100), 0)
pctChange_60 = pctChange_60.dropna()


pctChangeList = [pctChange_20, pctChange_40, pctChange_60]

In [None]:
pctChange_20.head()

<h4>Save the data to a file</h4>

In [None]:
'''
pctChange_20.to_csv('data/PctChange20.csv')
pctChange_40.to_csv('data/PctChange40.csv')
pctChange_60.to_csv('data/PctChange60.csv')
'''

<h3>Create Summary Data Frame for Pct Change Stats</h3>
<h4><p>This method is inefficient.  I used it prior to creating a helper function.</p>
<p>As i have learned in programming, if you are repeating yourself, there is a better way</p>
</h4>

<h4>Method 1</h4>

In [None]:
# Create the base dataframe
# pctChange_summary = pd.DataFrame()

In [None]:
# Begin adding the desired data
'''
pctChange_summary['Std Dev 20'] = pctchange_20.std().transpose()
pctChange_summary['Std Dev 40'] = pctchange_40.std().transpose()
pctChange_summary['Std Dev 60'] = pctchange_60.std().transpose()
'''

In [None]:
'''
pctChange_summary['20 Min %Change'] = pctchange_20.min().transpose()
pctChange_summary['40 Min %Change'] = pctchange_40.min().transpose()
pctChange_summary['60 Min %Change'] = pctchange_60.min().transpose()

pctChange_summary['20 Min Date'] = pctchange_20.idxmin().transpose()
pctChange_summary['40 Min Date'] = pctchange_40.idxmin().transpose()
pctChange_summary['60 Min Date'] = pctchange_60.idxmin().transpose()

pctChange_summary['20 Max %Change'] = pctchange_20.max().transpose()
pctChange_summary['40 Max %Change'] = pctchange_40.max().transpose()
pctChange_summary['60 Max %Change'] = pctchange_60.max().transpose()

pctChange_summary['20 Max Date'] = pctchange_20.idxmax().transpose()
pctChange_summary['40 Max Date'] = pctchange_40.idxmax().transpose()
pctChange_summary['60 Max Date'] = pctchange_60.idxmax().transpose()

pctChange_summary['20 Mean'] = pctchange_20.mean().transpose()
pctChange_summary['40 Mean'] = pctchange_40.mean().transpose()
pctChange_summary['60 Mean'] = pctchange_60.mean().transpose()

pctChange_summary['20 Median'] = pctchange_20.median().transpose()
pctChange_summary['40 Median'] = pctchange_40.median().transpose()
pctChange_summary['60 Median'] = pctchange_60.median().transpose()

pctChange_summary['20 Mode'] = pctchange_20.mode().transpose()
pctChange_summary['40 Mode'] = pctchange_40.mode().transpose()
pctChange_summary['60 Mode'] = pctchange_60.mode().transpose()
'''

In [None]:
#pctChange_summary

<h4>
<p>Method 2</p>
</h4>

In [None]:
from my_helpers import my_summary

In [None]:
my_summary(pctChange_20)

In [None]:
my_summary(pctChange_40)

In [None]:
my_summary(pctChange_60)

<h2>Plotting Pct Change</h2>

In [None]:
# the variable periods will equal one of the below variables.
pctChangeSummary_20 =my_summary(pctChange_20)
pctChangeSummary_40 = my_summary(pctChange_40)
pctChangeSummary_60 = my_summary(pctChange_60)

<h4>Method 1
<p>Modify the ticer variable and periods veriable for desired plot</p>
</h4>

In [None]:
# Simple Chart, enter specific symbol and rolling periods
# ticker = the symbol you want to study. Must be type string.
# periods = enter the dataframe that contains the specific rolling time period desired.

ticker = 'MSFT'
periods = pctChangeSummary_20
fig, axes = plt.subplots()
axes.set_title('Pct Change')
axes.set_ylabel('Pct Change')
axes.set_xlabel('Date')
axes.axhline(y=(periods['mean'].loc[ticker]))
axes.axhline(y=(periods['mean'].loc[ticker] + periods['std'].loc[ticker]), color='red')
axes.axhline(y=(periods['mean'].loc[ticker] - periods['std'].loc[ticker]), color='red')
axes.plot(pctChange_20['MSFT'])
plt.show()

In [None]:
# Simple Histogram

pctchange_20['MSFT'].hist(bins=50, color='orange')
#pctchange_40['MSFT'].hist(bins=100, color='blue')
#pctchange_60['MSFT'].hist(bins=100, color='green')

In [None]:
# Histogram with Line

data = pctchange_20['MSFT']
density = stats.gaussian_kde(data)
xs = 50
density.covariance_factor = lambda : .25
density._compute_covariance()
plt.plot(xs,density(xs))
plt.show()

In [None]:
# Multi Pane Charting

density = stats.gaussian_kde(pctchange_20['MSFT'])
fig, axes = plt.subplots(3,1)
axes[0].hist(pctchange_20['MSFT'], bins=100)
axes[0].plot()

<h2>Collecting thoughts on data</h2>

<h4>Looking at the 1 standard deviation bounds</h4>
<h4> Code to create a table that contains the upper and lower 1-standard dev bounds for each ticker</h4>
<h3>Method 1</h3>

In [None]:
ticker = 'MSFT'
periods = pctChangeSummary_20

upper_1std = periods['mean'].loc[ticker] + periods['std'].loc[ticker]
lower_1std = periods['mean'].loc[ticker] - periods['std'].loc[ticker]


In [None]:
# Code to automate finding the upper and lower %change 1stddev bounds

UpperLower_20 = pd.DataFrame()
UpperLower_40 = pd.DataFrame()
UpperLower_60 = pd.DataFrame()

In [None]:
UpperLower_20['Upper_1std_20'] = pctChangeSummary_20['mean'] + pctChangeSummary_20['std']
UpperLower_20['Lower_1std_20'] = pctChangeSummary_20['mean'] - pctChangeSummary_20['std']

UpperLower_40['Upper_1std_40'] = pctChangeSummary_40['mean'] + pctChangeSummary_40['std']
UpperLower_40['Lower_1std_40'] = pctChangeSummary_40['mean'] - pctChangeSummary_40['std']

UpperLower_60['Upper_1std_60'] = pctChangeSummary_60['mean'] + pctChangeSummary_60['std']
UpperLower_60['Lower_1std_60'] = pctChangeSummary_60['mean'] - pctChangeSummary_60['std']

In [None]:
print(UpperLower_20)
print(UpperLower_40)
print(UpperLower_60)

In [None]:
UpperLower_list = [UpperLower_20, UpperLower_40, UpperLower_60]

In [None]:
'''
for item in UpperLower_list:
    for index, column in zip(item.index, item.columns):
        print(column)
'''     

In [None]:
'''
for item in UpperLower_list:
    print(item.index)
    print(item.columns)
'''

In [None]:
print(UpperLower_list)

<h3>Method 2</h3>
<h4>Attempt to automate the creation of the UpperLower bounds dataframe
   <br> <p>My Preferred Method</p>
</h4>


In [9]:
def UpperLowerBounds(df):
    '''
    Takes in a DataFrame and returnds the 1 standard deviation bounds
    of the data. The return object is type pandas.DataFrame.
    Can also take in a list or tuple of DataFrames.
    '''
    if type(df) == list or type(df) == tuple:
        assert type(df[0]) == type(pd.DataFrame()), 'List does not contain type DataFrame'
        bounds = []
        for item in df:
            bounds.append(UpperLowerBounds(item))
        return bounds
    else:
        assert type(df)==type(pd.DataFrame()), 'Input not type Pandas DataFrame.'
        upper = df.mean() + df.std()
        lower = df.mean() - df.std()
        bounds = pd.DataFrame([upper, lower], index=['UpperBounds', 'LowerBounds'])
        return bounds
    

In [10]:
pctBounds_20 = UpperLowerBounds(pctChange_20)
pctBounds_40 = UpperLowerBounds(pctChange_40)
pctBounds_60 = UpperLowerBounds(pctChange_60)
pctBoundsList = UpperLowerBounds(pctChangeList)
pctBounds_20

Unnamed: 0,MSFT,AAPL,AMZN,JPM,GOOG,GOOGL,XOM
UpperBounds,7.239287,11.827653,13.095637,9.045707,9.965487,9.962457,5.664039
LowerBounds,-5.104175,-6.773441,-7.778984,-7.165807,-6.47202,-6.477886,-4.901737


In [None]:
print(type(test))
type(test) == list or type(test) == tuple

<h3>Method 3</h3>
<h4>I created the following code in an attempt to find a way to automate creating the UpperLower data</h4>


In [None]:
'''
UpperLower_data = [UpperLower_20, UpperLower_40, UpperLower_60]
pctChangeSummary_list = [pctChangeSummary_20, pctChangeSummary_40, pctChangeSummary_60]
for item, data in zip(UpperLower_data, pctChangeSummary_list):
    item['Upper_1std'] = data['mean'] + data['std']
    item['Lower_1std'] = data['mean'] - data['std']
    item['PctRange'] = (data['mean'] + data['std']) - (data['mean'] - data['std'])
''' 

In [None]:
for item in UpperLower_list:
    print(item)

In [None]:
'''
for position, value in enumerate(UpperLower_list):
    print(position)
    print(value)
    print()
'''

<h2>Number of times a stock exceeded it upper and lower 1std dev bounds</h2>

<h3>Method 1</h3>

In [None]:
tickers = list(clean_data.columns)
pctChange_list = [pctChange_20, pctChange_40, pctChange_60]
timePeriods = ['20', '40', '60']
UpperLower_data = [UpperLower_20, UpperLower_40, UpperLower_60]

In [None]:
UpperLower_data[0].loc['MSFT'].iloc[0]

In [None]:
# Count will the to total of times the pct return was greater or less than value (mean +- stddev).

countBoundsExceeded = pd.DataFrame(0, index=tickers, columns=['Exceeds_20', 'Exceeds_40', 'Exceeds_60'])
counter = 1
steps = 0
for ticker in tickers:
    #print('Step: ', counter, ' / ', len(tickers))
    counter += 1
    location = 0
    for data, bench, position in zip(pctChange_list, UpperLower_data,\
                                     ['Exceeds_20', 'Exceeds_40', 'Exceeds_60']):
        upper = bench.loc[ticker].iloc[0]
        lower = bench.loc[ticker].iloc[1]
        for value in data[ticker]:
            steps +=1
            if value > upper or value < lower:
                countBoundsExceeded[position].loc[ticker] += 1
        location += 1
print(steps)

In [None]:
countBoundsExceeded

In [None]:
countBoundsExceeded['Total'] = countBoundsExceeded.sum(axis=1)

In [None]:
countBoundsExceeded

In [None]:
print(len(pctChange_20))
pctChange_20.shape

<h3>Method 2</h3>
<h4>My preferred method</h4>

In [11]:
def TimesBoundsExceeded(data):
    '''
    Takes in a DataFrame and returnds the 1 standard deviation bounds
    of the data. The return object is type pandas.DataFrame.
    Can also take in a list or tuple of DataFrames.
    If a list in entered, the returned item is a list of DataFrames. 
    In this list, each index = the result of DataFrame in respective order of how they were contained 
    in the list.
    '''
    if type(data) == list or type(data) == tuple:
        assert type(data[0]) == type(pd.DataFrame()), 'List does not contain type DataFrame'
        bounds = []
        for item in data:
            bounds.append(TimesBoundsExceeded(item))
        request = pd.DataFrame(bounds)
        return request
    else:
        assert (type(data)==type(pd.DataFrame())), 'Input not type Pandas DataFrame.'
        upper = data > UpperLowerBounds(data).loc['UpperBounds']
        lower = data < UpperLowerBounds(data).loc['LowerBounds']
        count = data[upper].count() + data[lower].count()
        return count

In [12]:
countBoundsExceeded = TimesBoundsExceeded([pctChange_20, pctChange_40, pctChange_60]).transpose()
countBoundsExceeded.columns = ['20 Day', '40 Day', '60 Day']
countBoundsExceeded

Unnamed: 0,20 Day,40 Day,60 Day
MSFT,922,1109,1056
AAPL,1030,942,1030
AMZN,910,912,894
JPM,721,692,795
GOOG,982,983,885
GOOGL,998,966,889
XOM,1164,1093,1140


<h2> Converting the number of times to percentage of time</h2>
<h3>Method 1</h3>


In [13]:
pctBoundsExceeded = round(countBoundsExceeded / len(pctChange_20), 3)
pctBoundsExceeded

Unnamed: 0,20 Day,40 Day,60 Day
MSFT,0.256,0.308,0.294
AAPL,0.286,0.262,0.286
AMZN,0.253,0.254,0.249
JPM,0.2,0.192,0.221
GOOG,0.273,0.273,0.246
GOOGL,0.277,0.269,0.247
XOM,0.324,0.304,0.317


<h1>The next cells became obsolete after I revisited the above items.
<p>Go to 'Continue Here.'</h1>
<h2>Create a benchmark dataframe with the same shape as pctChange</h2>

<h3>Method 1</h3>
<h4>Non-efficient method</h4>
<h4>I was able to build a better system. Go to method 2</h4>

In [None]:
UpperLower_20.transpose()

In [None]:
UpperLower_20.transpose().loc['Upper_1std_20']

In [None]:
msft = pd.DataFrame(7.23, index=range(0,3597), columns=['MSFT'])
aapl = pd.DataFrame(11.82, index=range(0,3597), columns=['AAPL'])
amzn = pd.DataFrame(13.0956, index=range(0,3597), columns=['AMZN'])
jpm = pd.DataFrame(9.04, index=range(0,3597), columns=['JPM'])
goog = pd.DataFrame(9.96, index=range(0,3597), columns=['GOOG'])
googl = pd.DataFrame(9.96, index=range(0,3597), columns=['GOOGL'])
xom = pd.DataFrame(5.66, index=range(0,3597), columns=['XOM'])

In [None]:
upperBench = pd.concat([msft, aapl, amzn, jpm, goog, googl, xom], axis=1)

In [None]:
upperBench.head()

In [None]:
print(pctChange_20.shape)
print(bench.shape)

In [None]:
UpperLower_20.transpose().loc['Lower_1std_20']

In [None]:
msft = pd.DataFrame(-5.104175, index=range(0,3597), columns=['MSFT'])
aapl = pd.DataFrame(-6.77, index=range(0,3597), columns=['AAPL'])
amzn = pd.DataFrame(-7.77, index=range(0,3597), columns=['AMZN'])
jpm = pd.DataFrame(-7.16, index=range(0,3597), columns=['JPM'])
goog = pd.DataFrame(-6.47, index=range(0,3597), columns=['GOOG'])
googl = pd.DataFrame(-6.47, index=range(0,3597), columns=['GOOGL'])
xom = pd.DataFrame(-4.90, index=range(0,3597), columns=['XOM'])

In [None]:
lowerBench = pd.concat([msft, aapl, amzn, jpm, goog, googl, xom], axis=1)

In [None]:
# Match the indexs / convert to datetime
upperBench.index = pctChange_20.index
lowerBench.index = pctChange_20.index

In [None]:
outsideBoundsTest = (pctChange_20 > bench) | (pctChange_20 < lowerBench)
datesOutsideBounds = pctChange_20[uppertest]

In [None]:
datesOutsideBounds['JPM'].dropna().shape

In [None]:
countBoundsExceeded

In [None]:
datesOutsideBounds['MSFT'].dropna().plot.bar()

<h3> Method 2 </h3>
<p>attempt to build efficient code for method 1<p>

In [None]:
def BenchBuilder(df, data):
    # df should be the UpperLower for the timeframe ie: UpperLower_20
    # data should be the % Change df for the timeframe ie: pctChange_20
    upperBench = pd.DataFrame(0, index=np.arange(len(data)), columns=tickers)
    lowerBench = pd.DataFrame(0, index=np.arange(len(data)), columns=tickers)
    for col1, col2 in df.iteritems():
        for ticker, bench in col2.iteritems():
            name = col1.split('_')
            if 'Upper' in name:
                upperBench[ticker] = bench
            else:
                lowerBench[ticker] = bench
    upperBench.index = data.index
    lowerBench.index = data.index
    outsideBoundsTest = (data > upperBench) | (data < lowerBench)
    datesOutsideBounds = pctChange_20[outsideBoundsTest]
    return datesOutsideBounds

In [None]:
print(len(pctChange_20.index))
bench = BenchBuilder(UpperLower_20, pctChange_20)
outsideBoundsTest = (pctChange_20 > bench['Upper']) | (pctChange_20 < bench['Lower'])
datesOutsideBounds = pctChange_20[outsideBoundsTest]

<h1><font color='red'>Continue Here</font><h1>

<h3>Break down by year</h3>
<h4>I now want to search for a seasonality in the data by slicing the data in yearly periods</h4>

In [14]:
def ByYear(data, year):
    '''
    Input:
      pd.DataFrame or list/tuple. If list/tuple, values must be
      type pd.DataFrame as well
    
    If list or tuple is passed in for year, data must be type int. 
    The return item will be a dictionary where the keys are the str of
    the years that where input.
    
    If an int is passed in for year, a data will be returned filtered by 
    the year.
    '''
    if (type(year) == list) or (type(year) == tuple):
        request = {}
        for n in year:
            assert type(n) == int, 'Year must contain type int.'
            cut = ByYear(data, n)
            # replaces timedate index with int range.
            cut.index = range(1,(len(cut)+1))
            request[n] = cut
        return request
    else:
        choose = data.index.year == year
        return data[choose]

In [15]:
def YearsContained(data):
    years = []
    for year in data.index.year:
        if year in years:
            pass
        else:
            years.append(year)
    return years

In [21]:
YearsContained(clean_data)
print(type(clean_data))
clean_data.index

<class 'pandas.core.frame.DataFrame'>


DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07', '2005-01-10', '2005-01-11', '2005-01-12',
               '2005-01-13', '2005-01-14',
               ...
               '2019-05-03', '2019-05-06', '2019-05-07', '2019-05-08',
               '2019-05-09', '2019-05-10', '2019-05-13', '2019-05-14',
               '2019-05-15', '2019-05-16'],
              dtype='datetime64[ns]', name='Date', length=3617, freq=None)

In [16]:
a = ByYear(pctChange_20['MSFT'], YearsContained(pctChange_20))
a = pd.DataFrame(a)

<h3>Break down by year and stock</h3>

In [18]:
def ByStockAndYear(data):
    '''
    Input: pd.DataFrame 
    Returns: dictionary where keys=tickers and values=DataFrame.
        Within the Values, the columns are calendar years and 
        the index are the number day in the year.
    '''
    # if data for one stock is entered
    if type(data) == pd.Series:
        request = ByYear(data, YearsContained(data))
        request = pd.DataFrame(request)
        return ByYear(data, YearsContained(data))
    
    # if data for multiple stocks is entered
    else:
        tickers = data.columns
        request = {}
        for ticker in tickers:
            getStockYears = ByYear(data[ticker], YearsContained(data))
            getStockYears = pd.DataFrame(getStockYears)
            request[ticker] = getStockYears
        return request
        
        
        
    

In [19]:
byStockAndYear_20 = ByStockAndYear(pctChange_20)
byStockAndYear_40 = ByStockAndYear(pctChange_40)
byStockAndYear_60 = ByStockAndYear(pctChange_60)
byStockAndYearList = [byStockAndYear_20, byStockAndYear_40, byStockAndYear_60]

<h3> Lets look at the correlation by year</h3>

In [None]:
def CorrSlideShow(data):
    '''
    Input a dictionary whose values are dataframes.
    Output: yields correlation matrix by years.
    Note: yield is being used.
    '''
    assert type(data) == dict, 'Input must be type dictionary.'
    for year in data:
        yield print(str(year),'\n',data[year].corr())
        

In [None]:
study = CorrSlideShow(byStockAndYear_20)

In [None]:
# Since the slide show yields the results, this line must be run to cycle 
# forward through the data.
next(study)

<h2>Method 1 of looking at correlation</h2>
<h3>Not using pandas broadcasting</h3>

In [None]:
def CorrOverN(data, n): 
    '''
    input: data frame
    output: correlation table where the True values are those 
        with a correlation <= or >= to n.
    '''
    if type(n) == list:
        Ns = []
        for item in n:
            Ns.appen(item)
        test = (data>Ns[0]) | (data<Ns[1])
        return data[test]
    else:
        test = (data > n) | (data < -n)
        return data[test]

In [None]:
CorrOverN(byStockAndYear_20['MSFT'].corr(), 0.5)

<h2>Method 2</h2>
<h3>Using Pandas broadcasting</h3>

In [23]:
def CorrelationTest(data, r):
    '''
    Input Type: pd.DataFrame
    Input: DataFrame that was the output of the an output of the byStockAndYear function.
        DataFrame must be for a single stock. 
    Output: The instances in which the correlation was >= or <= to r.
        All corr values of 1 are excluded.
    Purpose: a test to see if there is any correlation in the data by year.
    '''
    data = data.corr() 
    test = ((data >= r) | (data <= -r)) & (data != 1)
    request = data[test].unstack().sort_values().dropna()
    if len(request) == 0:
        return print('There were no instances that met your criteria')
    return request.drop_duplicates()

In [24]:
trial = CorrelationTest(byStockAndYear_20['AAPL'], 0.7)

There were no instances that met your criteria


In [25]:
for stock in byStockAndYear_20:
    print(stock)
    print(CorrelationTest(byStockAndYear_20[stock], 0.6))

MSFT
2007  2015    0.620206
      2017    0.671419
dtype: float64
AAPL
2005  2019   -0.648047
dtype: float64
AMZN
There were no instances that met your criteria
None
JPM
2014  2019   -0.753404
dtype: float64
GOOG
2005  2014   -0.716644
2011  2015    0.632413
dtype: float64
GOOGL
2005  2014   -0.706365
2011  2015    0.624912
dtype: float64
XOM
2005  2017   -0.609238
dtype: float64


<h2>Lets look at Rolling Correlation by year</h2>

<h2> Lets look for any potential seasonality in the stocks
using Charts</h2>

In [None]:
# byStockAndYear = dict
# keys: ['MSFT', 'AAPL', 'AMZN', 'JPM', 'GOOG', 'GOOGL', 'XOM']
type(byStockAndYear_20['MSFT'])

In [None]:
'''
I left of attempting to create the below function. I am
cycling through the charts in order.  I need to start adding the labels
to the charts.
'''

In [126]:
def CycleRollingCharts(data):
    '''
    Input: byStockAndYear[ticker]
    Output: 3 pane chart where the charts cycle in the pattern abc, bcd, cde...
    Area for Improvement: a better method for showing the next chart. Clear() works ok.
        Labeling and add key.
        Change the color of the lines.
        Must change chart dimensions to better fit the window.
    '''
    for stock, value in data.items():
        for i, year in enumerate(value.columns):
            # axes = pane; 0=top, 1=middle, 2=bottom
            fig, axes = plt.subplots(3,1)
            axes[0].set(ylabel=year+2)
            axes[1].set(ylabel=year+1)
            axes[2].set(ylabel=year)
            axes[0].plot(value[year+2])
            axes[1].plot(value[year+1])
            axes[2].plot(value[year])
            print(i)
            yield plt.show()      

In [128]:
trial = CycleRollingCharts(byStockAndYear_20)

In [129]:
next(trial)

<generator object CycleRollingCharts at 0x00000274FA7D1D68>

In [199]:
def cycle_charts2():
    for ticker in clean_data.columns:
        avg_std = px_rolling_std[ticker].mean()
        
        fig, axes = plt.subplots(2,1)
        axes[0].set_title(ticker)
        axes[0].set_xlabel('Date')
        # Top Pane
        axes[0].plot(clean_data[ticker], color='blue')
        axes[0].set_ylabel('Price')
        # Bottom Pane
        axes[1].set_xlabel('Date')
        axes[1].plot(px_rolling_std[ticker], color='orange', label='Rolling Std Dev')
        axes[1].axhline(y=avg_std, label='Std Dev Mean')
        axes[1].set_ylabel('Std Dev')
        axes[1].legend()
        yield plt.show() 