In [1]:
import math
import yfinance as yf
import pandas_datareader as web
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
# Apply matplotlib's built-in "fivethirtyeight" style sheet to all figures drawn after this point.
plt.style.use("fivethirtyeight")

In [7]:
# Sector name -> five ticker symbols picked to represent that sector (as of 2000).
# Insertion order matters: it fixes the order of `all_symbols` below.
top_companies_2000 = {
    "Information Technology": ["IBM", "MSFT", "INTC", "ORCL", "CSCO"],
    "Health Care": ["JNJ", "PFE", "MRK", "ABT", "LLY"],
    "Financials": ["JPM", "BAC", "C", "WFC", "AXP"],
    "Consumer Discretionary": ["F", "NKE", "HD", "MCD", "DIS"],
    "Communication Services": ["T", "VZ", "BLS", "CMCSA", "EA"],
    "Consumer Staples": ["PG", "KO", "PEP", "MO", "CL"],
    "Energy": ["XOM", "CVX", "COP", "HAL", "SLB"],
    "Industrials": ["GE", "BA", "MMM", "HON", "CAT"],
    "Materials": ["PPG", "DD", "AA", "IP", "NEM"],
    "Real Estate": ["SPG", "PSA", "EQR", "VNO", "BXP"],
    "Utilities": ["DUK", "SO", "EXC", "ED", "AEP"],
}

# One flat list of tickers, sector by sector, in the order written above.
all_symbols = [
    ticker
    for sector_tickers in top_companies_2000.values()
    for ticker in sector_tickers
]

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2000-11-07 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2000-05-07"
end_date = "2000-11-07"

# Fetch daily price data for every ticker in a single request.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data[<ticker>].
data = yf.download(
    all_symbols,
    group_by='ticker',
    start=start_date,
    end=end_date,
)

[*********************100%***********************]  55 of 55 completed


In [8]:
# Long-format table for the 2000 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols` order.
stock_2000 = pd.concat(
    [data[ticker].assign(Symbol=ticker) for ticker in all_symbols],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2000.to_csv('stock_2000.csv')

In [9]:
# Plain-text preview of the first five rows of the combined 2000 table.
print(stock_2000.head(n=5))

Price                            Open        High         Low       Close  \
Date                                                                        
2000-05-08 00:00:00+00:00  103.310226  105.521034  102.891968  104.923515   
2000-05-09 00:00:00+00:00  105.640533  105.760040  103.489487  104.206497   
2000-05-10 00:00:00+00:00  100.860420  101.039673   97.574089   98.470360   
2000-05-11 00:00:00+00:00   99.426384  102.772469   98.649620   99.844643   
2000-05-12 00:00:00+00:00   99.187378  101.099426   98.709366   99.844643   

Price                      Adj Close    Volume Symbol  
Date                                                   
2000-05-08 00:00:00+00:00  56.289772   5048937    IBM  
2000-05-09 00:00:00+00:00  55.905148   5088685    IBM  
2000-05-10 00:00:00+00:00  52.827805  10473284    IBM  
2000-05-11 00:00:00+00:00  53.565060   8479085    IBM  
2000-05-12 00:00:00+00:00  53.565060   5857286    IBM  


In [12]:
# Sector name -> five ticker symbols picked to represent that sector for the 2004 window.
# Insertion order matters: it fixes the order of `all_symbols2` below.
top_companies_2004 = {
    "Information Technology": ["MSFT", "INTC", "CSCO", "ORCL", "IBM"],
    "Health Care": ["JNJ", "PFE", "MRK", "ABT", "LLY"],
    "Financials": ["JPM", "BAC", "C", "WFC", "AXP"],
    "Consumer Discretionary": ["F", "NKE", "HD", "MCD", "DIS"],
    "Communication Services": ["T", "VZ", "CMCSA", "BLS", "EA"],
    "Consumer Staples": ["PG", "KO", "PEP", "MO", "CL"],
    "Energy": ["XOM", "CVX", "COP", "HAL", "SLB"],
    "Industrials": ["GE", "BA", "MMM", "CAT", "HON"],
    "Materials": ["PPG", "DD", "AA", "IP", "NEM"],
    "Real Estate": ["SPG", "EQR", "AMT", "VNO", "BXP"],
    "Utilities": ["DUK", "SO", "EXC", "D", "AEP"],
}

# One flat list of tickers, sector by sector, in the order written above.
all_symbols2 = [
    ticker
    for sector_tickers in top_companies_2004.values()
    for ticker in sector_tickers
]

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2004-11-02 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2004-05-02"
end_date = "2004-11-02"

# Fetch daily price data for every ticker in a single request.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2004[<ticker>].
data2004 = yf.download(
    all_symbols2,
    group_by='ticker',
    start=start_date,
    end=end_date,
)

[*********************100%***********************]  55 of 55 completed


In [17]:
# Long-format table for the 2004 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols2` order.
stock_2004 = pd.concat(
    [data2004[ticker].assign(Symbol=ticker) for ticker in all_symbols2],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2004.to_csv("stock_2004.csv")

In [18]:
# Plain-text preview of the first five rows of the combined 2004 table.
print(stock_2004.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2004-05-03 00:00:00+00:00  26.190001  26.520000  26.190001  26.350000   
2004-05-04 00:00:00+00:00  26.350000  26.540001  26.020000  26.330000   
2004-05-05 00:00:00+00:00  26.320000  26.600000  26.250000  26.299999   
2004-05-06 00:00:00+00:00  26.160000  26.340000  26.030001  26.120001   
2004-05-07 00:00:00+00:00  26.030001  26.379999  25.750000  25.780001   

Price                      Adj Close    Volume Symbol  
Date                                                   
2004-05-03 00:00:00+00:00  16.390930  65916200   MSFT  
2004-05-04 00:00:00+00:00  16.378490  55496400   MSFT  
2004-05-05 00:00:00+00:00  16.359835  51841700   MSFT  
2004-05-06 00:00:00+00:00  16.247864  62693900   MSFT  
2004-05-07 00:00:00+00:00  16.036366  68290200   MSFT  


In [22]:
# Sector name -> five ticker symbols picked to represent that sector for the 2008 window.
# Insertion order matters: it fixes the order of `all_symbols3` below.
top_companies_2008 = {
    "Information Technology": ["MSFT", "INTC", "CSCO", "ORCL", "IBM"],
    "Health Care": ["JNJ", "PFE", "MRK", "ABT", "LLY"],
    "Financials": ["JPM", "BAC", "C", "WFC", "AXP"],
    "Consumer Discretionary": ["F", "NKE", "HD", "MCD", "DIS"],
    "Communication Services": ["T", "VZ", "CMCSA", "BLS", "EA"],
    "Consumer Staples": ["PG", "KO", "PEP", "MO", "CL"],
    "Energy": ["XOM", "CVX", "COP", "HAL", "SLB"],
    "Industrials": ["GE", "BA", "MMM", "CAT", "HON"],
    "Materials": ["PPG", "DD", "AA", "IP", "NEM"],
    "Real Estate": ["SPG", "EQR", "AMT", "VNO", "BXP"],
    "Utilities": ["DUK", "SO", "EXC", "D", "AEP"],
}

# One flat list of tickers, sector by sector, in the order written above.
all_symbols3 = [
    ticker
    for sector_tickers in top_companies_2008.values()
    for ticker in sector_tickers
]

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2008-11-04 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2008-05-04"
end_date = "2008-11-04"

# Fetch daily price data for every ticker in a single request.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2008[<ticker>].
data2008 = yf.download(
    all_symbols3,
    group_by='ticker',
    start=start_date,
    end=end_date,
)

[*********************100%***********************]  55 of 55 completed


In [30]:
# Long-format table for the 2008 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols3` order.
stock_2008 = pd.concat(
    [data2008[ticker].assign(Symbol=ticker) for ticker in all_symbols3],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2008.to_csv("stock_2008.csv")

In [31]:
# Plain-text preview of the first five rows of the combined 2008 table.
print(stock_2008.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2008-05-05 00:00:00+00:00  29.930000  30.230000  28.990000  29.080000   
2008-05-06 00:00:00+00:00  29.000000  29.860001  28.930000  29.700001   
2008-05-07 00:00:00+00:00  29.690001  30.139999  29.080000  29.209999   
2008-05-08 00:00:00+00:00  29.280001  29.389999  29.000000  29.270000   
2008-05-09 00:00:00+00:00  29.209999  29.549999  28.950001  29.389999   

Price                      Adj Close     Volume Symbol  
Date                                                    
2008-05-05 00:00:00+00:00  21.127420  119687700   MSFT  
2008-05-06 00:00:00+00:00  21.577864   93582000   MSFT  
2008-05-07 00:00:00+00:00  21.221876   88775000   MSFT  
2008-05-08 00:00:00+00:00  21.265467   69589900   MSFT  
2008-05-09 00:00:00+00:00  21.352640   51621200   MSFT  


In [35]:
# Sector name -> five ticker symbols picked to represent that sector for the 2012 window.
# Insertion order matters: it fixes the order of `all_symbols4` below.
# "GOOGL" and "GOOG" are separate tickers, so the flattened list has no repeats.
top_companies_2012 = {
    "Information Technology": ["AAPL", "MSFT", "IBM", "GOOGL", "INTC"],
    "Health Care": ["JNJ", "PFE", "MRK", "ABT", "UNH"],
    "Financials": ["JPM", "BAC", "C", "WFC", "GS"],
    "Consumer Discretionary": ["AMZN", "HD", "MCD", "DIS", "NKE"],
    "Communication Services": ["T", "VZ", "GOOG", "CMCSA", "TWX"],
    "Consumer Staples": ["PG", "KO", "PEP", "MO", "WMT"],
    "Energy": ["XOM", "CVX", "COP", "SLB", "HAL"],
    "Industrials": ["GE", "BA", "MMM", "CAT", "HON"],
    "Materials": ["PPG", "DD", "FCX", "IP", "NEM"],
    "Real Estate": ["SPG", "PLD", "EQR", "AVB", "VNO"],
    "Utilities": ["DUK", "SO", "EXC", "D", "AEP"],
}

# One flat list of tickers, sector by sector, in the order written above.
all_symbols4 = [
    ticker
    for sector_tickers in top_companies_2012.values()
    for ticker in sector_tickers
]

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2012-11-06 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2012-05-06"
end_date = "2012-11-06"

# Fetch daily price data for every ticker in a single request.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2012[<ticker>].
data2012 = yf.download(
    all_symbols4,
    group_by='ticker',
    start=start_date,
    end=end_date,
)

[*********************100%***********************]  55 of 55 completed


In [36]:
# Long-format table for the 2012 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols4` order.
stock_2012 = pd.concat(
    [data2012[ticker].assign(Symbol=ticker) for ticker in all_symbols4],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2012.to_csv("stock_2012.csv")

In [54]:
# Plain-text preview of the first five rows of the combined 2012 table.
print(stock_2012.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2012-05-07 00:00:00+00:00  20.053572  20.456072  20.043928  20.338572   
2012-05-08 00:00:00+00:00  20.342142  20.410713  19.954643  20.292143   
2012-05-09 00:00:00+00:00  20.132143  20.499287  20.030357  20.327856   
2012-05-10 00:00:00+00:00  20.520714  20.567142  20.301430  20.375713   
2012-05-11 00:00:00+00:00  20.178572  20.516787  20.155357  20.239643   

Price                      Adj Close     Volume Symbol  
Date                                                    
2012-05-07 00:00:00+00:00  17.156542  460118400   AAPL  
2012-05-08 00:00:00+00:00  17.117378  497252000   AAPL  
2012-05-09 00:00:00+00:00  17.147512  480704000   AAPL  
2012-05-10 00:00:00+00:00  17.187872  333200000   AAPL  
2012-05-11 00:00:00+00:00  17.073095  399546000   AAPL  


In [39]:
# Sector name -> five ticker symbols picked to represent that sector for the 2016 window.
# Note that "GOOGL" is listed under both Information Technology and
# Communication Services.
top_companies_2016 = {
    "Information Technology": ["AAPL", "MSFT", "GOOGL", "INTC", "CSCO"],
    "Health Care": ["JNJ", "PFE", "MRK", "UNH", "ABBV"],
    "Financials": ["JPM", "BAC", "WFC", "C", "GS"],
    "Consumer Discretionary": ["AMZN", "HD", "DIS", "MCD", "NKE"],
    "Communication Services": ["T", "VZ", "GOOGL", "CMCSA", "TWX"],
    "Consumer Staples": ["PG", "KO", "PEP", "WMT", "MO"],
    "Energy": ["XOM", "CVX", "SLB", "COP", "PSX"],
    "Industrials": ["GE", "BA", "MMM", "CAT", "HON"],
    "Materials": ["PPG", "DD", "FCX", "IP", "APD"],
    "Real Estate": ["SPG", "PLD", "EQR", "AVB", "PSA"],
    "Utilities": ["DUK", "SO", "EXC", "D", "NEE"]
}

# Flatten the sector lists into one list of symbols, keeping first-seen order and
# dropping repeats. Without the de-duplication "GOOGL" would appear twice, and any
# per-symbol loop over this list (e.g. building the combined table) would emit
# GOOGL's rows twice. dict.fromkeys keeps insertion order and discards later repeats.
all_symbols5 = list(dict.fromkeys(
    symbol for sector in top_companies_2016.values() for symbol in sector
))
assert len(all_symbols5) == len(set(all_symbols5)), "ticker list must be unique"

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2016-11-08 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2016-05-08"
end_date = "2016-11-08"

# Download the data and store it in a DataFrame.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2016[<ticker>].
data2016 = yf.download(all_symbols5, start=start_date, end=end_date, group_by='ticker')

[*********************100%***********************]  54 of 54 completed


In [40]:
# Long-format table for the 2016 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols5` order.
stock_2016 = pd.concat(
    [data2016[ticker].assign(Symbol=ticker) for ticker in all_symbols5],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2016.to_csv("stock_2016.csv")

In [55]:
# Plain-text preview of the first five rows of the combined 2016 table.
print(stock_2016.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2016-05-09 00:00:00+00:00  23.250000  23.442499  23.147499  23.197500   
2016-05-10 00:00:00+00:00  23.332500  23.392500  23.027500  23.355000   
2016-05-11 00:00:00+00:00  23.370001  23.392500  23.115000  23.127501   
2016-05-12 00:00:00+00:00  23.180000  23.195000  22.367500  22.584999   
2016-05-13 00:00:00+00:00  22.500000  22.917500  22.500000  22.629999   

Price                      Adj Close     Volume Symbol  
Date                                                    
2016-05-09 00:00:00+00:00  21.258570  131745600   AAPL  
2016-05-10 00:00:00+00:00  21.402908  134747200   AAPL  
2016-05-11 00:00:00+00:00  21.194424  114876400   AAPL  
2016-05-12 00:00:00+00:00  20.697254  305258800   AAPL  
2016-05-13 00:00:00+00:00  20.738499  177571200   AAPL  


In [45]:
# Sector name -> five ticker symbols picked to represent that sector for the 2020 window.
# Note that "GOOGL" is listed under both Information Technology and
# Communication Services.
top_companies_2020 = {
    "Information Technology": ["AAPL", "MSFT", "GOOGL", "NVDA", "ADBE"],
    "Health Care": ["JNJ", "PFE", "UNH", "ABBV", "MRK"],
    "Financials": ["JPM", "BAC", "WFC", "GS", "MS"],
    "Consumer Discretionary": ["AMZN", "HD", "NKE", "MCD", "LOW"],
    "Communication Services": ["GOOGL", "T", "VZ", "CMCSA", "DIS"],
    "Consumer Staples": ["PG", "KO", "PEP", "WMT", "COST"],
    "Energy": ["XOM", "CVX", "COP", "SLB", "PSX"],
    "Industrials": ["HON", "UNP", "CAT", "RTX", "LMT"],
    "Materials": ["LIN", "APD", "NEM", "DD", "SHW"],
    "Real Estate": ["AMT", "PLD", "SPG", "EQIX", "PSA"],
    "Utilities": ["NEE", "DUK", "SO", "D", "AEP"]
}

# Flatten the sector lists into one list of symbols, keeping first-seen order and
# dropping repeats. Without the de-duplication "GOOGL" would appear twice, and any
# per-symbol loop over this list (e.g. building the combined table) would emit
# GOOGL's rows twice. dict.fromkeys keeps insertion order and discards later repeats.
all_symbols6 = list(dict.fromkeys(
    symbol for sector in top_companies_2020.values() for symbol in sector
))
assert len(all_symbols6) == len(set(all_symbols6)), "ticker list must be unique"

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2020-11-03 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2020-05-03"
end_date = "2020-11-03"

# Download the data and store it in a DataFrame.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2020[<ticker>].
data2020 = yf.download(all_symbols6, start=start_date, end=end_date, group_by='ticker')

[*********************100%***********************]  54 of 54 completed


In [46]:
# Long-format table for the 2020 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols6` order.
stock_2020 = pd.concat(
    [data2020[ticker].assign(Symbol=ticker) for ticker in all_symbols6],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2020.to_csv("stock_2020.csv")

In [56]:
# Plain-text preview of the first five rows of the combined 2020 table.
print(stock_2020.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2020-05-04 00:00:00+00:00  72.292503  73.422501  71.580002  73.290001   
2020-05-05 00:00:00+00:00  73.764999  75.250000  73.614998  74.389999   
2020-05-06 00:00:00+00:00  75.114998  75.809998  74.717499  75.157501   
2020-05-07 00:00:00+00:00  75.805000  76.292503  75.492500  75.934998   
2020-05-08 00:00:00+00:00  76.410004  77.587502  76.072502  77.532501   

Price                      Adj Close     Volume Symbol  
Date                                                    
2020-05-04 00:00:00+00:00  71.222008  133568000   AAPL  
2020-05-05 00:00:00+00:00  72.290977  147751200   AAPL  
2020-05-06 00:00:00+00:00  73.036827  142333600   AAPL  
2020-05-07 00:00:00+00:00  73.792351  115215200   AAPL  
2020-05-08 00:00:00+00:00  75.548752  133838400   AAPL  


In [52]:
# Sector name -> five ticker symbols picked to represent that sector for the 2024 window.
# Note that "GOOGL" is listed under both Information Technology and
# Communication Services.
top_companies_2024 = {
    "Information Technology": ["NVDA", "AAPL", "MSFT", "GOOGL", "TSM"],
    "Health Care": ["JNJ", "PFE", "UNH", "MRK", "ABBV"],
    "Financials": ["JPM", "BAC", "WFC", "C", "GS"],
    "Consumer Discretionary": ["AMZN", "HD", "MCD", "NKE", "DIS"],
    "Communication Services": ["GOOGL", "META", "VZ", "T", "CMCSA"],
    "Consumer Staples": ["PG", "KO", "PEP", "WMT", "COST"],
    "Energy": ["XOM", "CVX", "COP", "SLB", "PSX"],
    "Industrials": ["HON", "BA", "CAT", "GE", "UPS"],
    "Materials": ["LIN", "APD", "NEM", "DD", "SHW"],
    "Real Estate": ["AMT", "PLD", "SPG", "EQIX", "PSA"],
    "Utilities": ["NEE", "DUK", "SO", "D", "AEP"]
}

# Flatten the sector lists into one list of symbols, keeping first-seen order and
# dropping repeats. Without the de-duplication "GOOGL" would appear twice, and any
# per-symbol loop over this list (e.g. building the combined table) would emit
# GOOGL's rows twice. dict.fromkeys keeps insertion order and discards later repeats.
all_symbols7 = list(dict.fromkeys(
    symbol for sector in top_companies_2024.values() for symbol in sector
))
assert len(all_symbols7) == len(set(all_symbols7)), "ticker list must be unique"

# Six-month download window.
# NOTE(review): yfinance documents `end` as exclusive, and 2024-11-05 looks like
# the US general election date -- confirm that is the intended cut-off.
start_date = "2024-05-05"
end_date = "2024-11-05"

# Download the data and store it in a DataFrame.
# group_by='ticker' puts the ticker on the outer column level, so one company's
# table can be selected as data2024[<ticker>].
data2024 = yf.download(all_symbols7, start=start_date, end=end_date, group_by='ticker')

[*********************100%***********************]  54 of 54 completed


In [53]:
# Long-format table for the 2024 window: take each ticker's sub-table, tag it with
# a `Symbol` column, and stack the pieces row-wise in `all_symbols7` order.
stock_2024 = pd.concat(
    [data2024[ticker].assign(Symbol=ticker) for ticker in all_symbols7],
    axis=0,
)

# Write the combined table (index included) to a CSV in the working directory.
stock_2024.to_csv("stock_2024.csv")

In [57]:
# Plain-text preview of the first five rows of the combined 2024 table.
print(stock_2024.head(n=5))

Price                           Open       High        Low      Close  \
Date                                                                    
2024-05-06 00:00:00+00:00  89.389999  92.220001  89.055000  92.139999   
2024-05-07 00:00:00+00:00  91.098000  91.780998  89.011002  90.554001   
2024-05-08 00:00:00+00:00  89.483002  91.194000  89.419998  90.412003   
2024-05-09 00:00:00+00:00  90.528999  91.071999  88.231003  88.747002   
2024-05-10 00:00:00+00:00  90.305000  91.401001  89.226997  89.877998   

Price                      Adj Close     Volume Symbol  
Date                                                    
2024-05-06 00:00:00+00:00  92.124550  376203000   NVDA  
2024-05-07 00:00:00+00:00  90.538818  437342000   NVDA  
2024-05-08 00:00:00+00:00  90.396843  325721000   NVDA  
2024-05-09 00:00:00+00:00  88.732124  378013000   NVDA  
2024-05-10 00:00:00+00:00  89.862923  335325000   NVDA  
