In [1]:
import os
import csv
import yfinance as yf
import pandas as pd

In [2]:
# Read the data from html link
# Open the link and download S&P company details in a table
wiki_data=pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 

# All data is stored in first cell
data = wiki_data[0] 
display(data.head())

# Sort the dataframe on ticker in alphabetical ascending order
snp500_info_df = data.sort_values(by=["Symbol"], ascending=True) 

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981


In [3]:
# Create new DataFrame taking only needed columns
snp500_info_df = pd.DataFrame(snp500_info_df, columns=["Symbol", 
                                                       "Security", 
                                                       "GICS Sector", 
                                                       "GICS Sub-Industry", 
                                                       "Headquarters Location",
                                                       "Date first added"])
snp500_info_df

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added
13,A,Agilent Technologies,Health Care,Health Care Equipment,"Santa Clara, California",2000-06-05
31,AAL,American Airlines Group,Industrials,Airlines,"Fort Worth, Texas",2015-03-23
10,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,"Raleigh, North Carolina",2015-07-09
46,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31
...,...,...,...,...,...,...
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07
500,ZBRA,Zebra,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23
502,ZION,Zions Bancorp,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22


In [4]:
# Rename DataFrame Column name
snp500_info_df.rename({"Symbol":"Ticker",
                       "Security":"Company",
                       "GICS Sector":"GICS_Sector",
                       "GICS Sub-Industry":"GICS_Sub-Industry",
                       "Headquarters Location":"Headquarters",
                       "Date first added": "Date_added"}, axis="columns", inplace=True)
snp500_info_df

Unnamed: 0,Ticker,Company,GICS_Sector,GICS_Sub-Industry,Headquarters,Date_added
13,A,Agilent Technologies,Health Care,Health Care Equipment,"Santa Clara, California",2000-06-05
31,AAL,American Airlines Group,Industrials,Airlines,"Fort Worth, Texas",2015-03-23
10,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,"Raleigh, North Carolina",2015-07-09
46,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31
...,...,...,...,...,...,...
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07
500,ZBRA,Zebra,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23
502,ZION,Zions Bancorp,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22


In [5]:
# Get the columns and the rows that are not null
snp500_info_df.count()

Ticker               504
Company              504
GICS_Sector          504
GICS_Sub-Industry    504
Headquarters         504
Date_added           459
dtype: int64

In [6]:
# Get the columns and the rows that are not null.
snp500_info_df.isnull().sum()

Ticker                0
Company               0
GICS_Sector           0
GICS_Sub-Industry     0
Headquarters          0
Date_added           45
dtype: int64

In [7]:
# Drop null values
snp500_info_clean_df = snp500_info_df.dropna()
snp500_info_clean_df

Unnamed: 0,Ticker,Company,GICS_Sector,GICS_Sub-Industry,Headquarters,Date_added
13,A,Agilent Technologies,Health Care,Health Care Equipment,"Santa Clara, California",2000-06-05
31,AAL,American Airlines Group,Industrials,Airlines,"Fort Worth, Texas",2015-03-23
10,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,"Raleigh, North Carolina",2015-07-09
46,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31
...,...,...,...,...,...,...
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07
500,ZBRA,Zebra,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23
502,ZION,Zions Bancorp,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22


In [8]:
# Check again columns and the rows that are not null
snp500_info_clean_df.count()

Ticker               459
Company              459
GICS_Sector          459
GICS_Sub-Industry    459
Headquarters         459
Date_added           459
dtype: int64

In [9]:
# Check again columns and the rows that are not null
snp500_info_clean_df.isnull().sum()

Ticker               0
Company              0
GICS_Sector          0
GICS_Sub-Industry    0
Headquarters         0
Date_added           0
dtype: int64

In [10]:
# Get in yfinance all_sectors performance 10 years period

tickers = ["ATVI","CHTR", "CMCSA", "DIS", "DISH", "EA", "FB", "FOX", "FOXA", "GOOG", "GOOGL", "IPG", "LUMN", "LYV", "MTCH", "NFLX", "NWS", "NWSA", "OMC", "PARA", "T", "TMUS", "TTWO", "TWTR", "VZ","WBD",
 "AAP","AMZN", "APTV", "AZO", "BBWI", "BBY", "BKNG", "BWA", "CCL", "CMG", "CZR", "DG", "DHI", "DLTR", "DPZ", "DRI", "EBAY", "ETSY", "EXPE", "F", "GM", "GPC", "GRMN", "HAS", "HD", "HLT", "KMX", "LEN", "LKQ", "LOW", "LVS", "MAR", "MCD", "MGM", "MHK", "NCLH", "NKE", "NVR", "NWL", "ORLY", "PENN", "PHM", "POOL", "PVH", "RCL", "RL", "ROST", "SBUX", "TGT", "TJX", "TPR", "TSCO", "TSLA", "UA", "UAA", "ULTA", "VFC", "WHR", "WYNN","YUM",
 "ADM","CAG", "CHD", "CL", "CLX", "COST", "CPB", "EL", "GIS", "HRL", "HSY", "K", "KHC", "KMB", "KO", "KR", "LW", "MDLZ", "MKC", "MNST", "MO", "PEP", "PG", "PM", "SJM", "STZ", "SYY", "TAP", "TSN", "WBA","WMT",
 "APA","BKR", "COP", "CTRA", "CVX", "DVN", "EOG", "FANG", "HAL", "HES", "KMI", "MPC", "MRO", "OKE", "OXY", "PSX", "PXD", "SLB", "VLO", "WMB", "XOM",
 "AFL","AIG", "AIZ", "AJG", "ALL", "AMP", "AON", "AXP", "BAC", "BEN", "BK ", "BLK", "BRO", "C", "CB", "CBOE", "CFG", "CINF", "CMA", "CME", "COF", "DFS", "FDS", "FITB", "FRC", "GL", "GS", "HBAN", "HIG", "ICE", "IVZ", "JPM", "KEY", "L", "LNC", "MCO", "MET", "MKTX", "MMC", "MS", "MSCI", "MTB", "NDAQ", "NTRS", "PFG", "PGR", "PNC", "PRU", "RE", "RF", "RJF", "SBNY", "SCHW", "SIVB", "SPGI", "STT", "SYF", "TFC", "TROW", "TRV", "USB", "WFC", "WRB", "WTW","ZION",
 "A","ABBV", "ABC", "ABMD", "ABT", "ALGN", "AMGN", "ANTM", "BAX", "BDX", "BIIB", "BIO", "BMY", "BSX", "CAH", "CERN", "CI", "CNC", "COO", "CRL", "CTLT", "CVS", "DGX", "DHR", "DVA", "DXCM", "EW", "GILD", "HCA", "HOLX", "HSIC", "HUM", "IDXX", "ILMN", "INCY", "IQV", "ISRG", "JNJ", "LH", "LLY", "MCK", "MDT", "MOH", "MRK", "MRNA", "MTD", "OGN", "PFE", "PKI", "REGN", "RMD", "STE", "SYK", "TECH", "TFX", "TMO", "UHS", "UNH", "VRTX", "VTRS", "WAT", "WST", "XRAY", "ZBH","ZTS",
 "AAL","ALK", "ALLE", "AME", "AOS", "BA", "CARR", "CAT", "CHRW", "CMI", "CPRT", "CSX", "CTAS", "DAL", "DE", "DOV", "EFX", "EMR", "ETN", "EXPD", "FAST", "FBHS", "FDX", "FTV", "GD", "GE", "GNRC", "GWW", "HII", "HON", "HWM", "IEX", "IR", "ITW", "J", "JBHT", "JCI", "LDOS", "LHX", "LMT", "LUV", "MAS", "MMM", "NDSN", "NLSN", "NOC", "NSC", "ODFL", "OTIS", "PCAR", "PH", "PNR", "PWR", "RHI", "ROK", "ROL", "ROP", "RSG", "RTX", "SNA", "SWK", "TDG", "TDY", "TT", "TXT", "UAL", "UNP", "UPS", "URI", "VRSK", "WAB", "WM","XYL",
 "AAPL","ACN", "ADBE", "ADI", "ADP", "ADSK", "AKAM", "AMAT", "AMD", "ANET", "ANSS", "APH", "AVGO", "BR", "CDAY", "CDNS", "CDW", "CRM", "CSCO", "CTSH", "CTXS", "DXC", "ENPH", "EPAM", "FFIV", "FIS", "FISV", "FLT", "FTNT", "GLW", "GPN", "HPE", "HPQ", "IBM", "INTC", "INTU", "IPGP", "IT", "JKHY", "JNPR", "KEYS", "KLAC", "LRCX", "MA", "MCHP", "MPWR", "MSFT", "MSI", "MU", "NLOK", "NOW", "NTAP", "NVDA", "NXPI", "ORCL", "PAYC", "PAYX", "PTC", "PYPL", "QCOM", "QRVO", "SEDG", "SNPS", "STX", "SWKS", "TEL", "TER", "TRMB", "TXN", "TYL", "V", "VRSN", "WDC","ZBRA",
 "ALB","AMCR", "APD", "AVY", "BALL", "CE", "CF", "CTVA", "DD", "DOW", "ECL", "EMN", "FCX", "FMC", "IFF", "IP", "LIN", "LYB", "MLM", "MOS", "NEM", "NUE", "PKG", "PPG", "SEE", "SHW", "VMC","WRK",
 "AMT","ARE", "AVB", "BXP", "CBRE", "CCI", "CPT", "DLR", "DRE", "EQIX", "EQR", "ESS", "EXR", "FRT", "HST", "IRM", "KIM", "MAA", "O", "PEAK", "PLD", "PSA", "REG", "SBAC", "SPG", "UDR", "VNO", "VTR", "WELL","WY",
 "AEE","AEP", "AES", "ATO", "AWK", "CEG", "CMS", "CNP", "D", "DTE", "DUK", "ED", "EIX", "ES", "ETR", "EVRG", "EXC", "FE", "LNT", "NEE", "NI", "NRG", "PEG", "PNW", "PPL", "SO", "SRE", "WEC","XEL"]

# Create a list
df_list = list()

for ticker in tickers:
    data = yf.download(ticker, group_by="Ticker", period="10y")
    
    # Add this column because the dataframe doesn't contain a column with the ticker
    data["Ticker"] = ticker  
    df_list.append(data)

# Combine all Ticker into a dataframe
snp500_perf_data_df = pd.concat(df_list)
snp500_perf_data_df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-25,12.090000,12.280000,12.030000,12.240000,11.394887,4928300.0,ATVI
2012-05-29,12.250000,12.290000,11.910000,11.960000,11.134218,10601200.0,ATVI
2012-05-30,11.870000,12.010000,11.810000,11.910000,11.087672,9295300.0,ATVI
2012-05-31,11.910000,11.950000,11.630000,11.740000,10.929410,11647000.0,ATVI
2012-06-01,11.630000,11.860000,11.520000,11.650000,10.845624,8971200.0,ATVI
...,...,...,...,...,...,...,...
2022-05-18,75.480003,75.709999,74.650002,74.889999,74.889999,3151900.0,XEL
2022-05-19,74.800003,75.470001,73.839996,75.010002,75.010002,2954900.0,XEL
2022-05-20,75.040001,75.150002,73.620003,74.220001,74.220001,4574000.0,XEL
2022-05-23,75.269997,75.269997,73.959999,74.870003,74.870003,2722700.0,XEL


In [11]:
# Rename DataFrame Column name
snp500_perf_data_df.rename({"Adj Close":"Adj_Close"}, axis="columns", inplace=True)
snp500_perf_data_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj_Close,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-25,12.090000,12.280000,12.030000,12.240000,11.394887,4928300.0,ATVI
2012-05-29,12.250000,12.290000,11.910000,11.960000,11.134218,10601200.0,ATVI
2012-05-30,11.870000,12.010000,11.810000,11.910000,11.087672,9295300.0,ATVI
2012-05-31,11.910000,11.950000,11.630000,11.740000,10.929410,11647000.0,ATVI
2012-06-01,11.630000,11.860000,11.520000,11.650000,10.845624,8971200.0,ATVI
...,...,...,...,...,...,...,...
2022-05-18,75.480003,75.709999,74.650002,74.889999,74.889999,3151900.0,XEL
2022-05-19,74.800003,75.470001,73.839996,75.010002,75.010002,2954900.0,XEL
2022-05-20,75.040001,75.150002,73.620003,74.220001,74.220001,4574000.0,XEL
2022-05-23,75.269997,75.269997,73.959999,74.870003,74.870003,2722700.0,XEL


In [12]:
# Get the columns and the rows that are not null
snp500_perf_data_df.count()

Open         1226975
High         1226975
Low          1226975
Close        1226975
Adj_Close    1226975
Volume       1226975
Ticker       1226988
dtype: int64

In [13]:
# Get the columns and the rows that are not null.
snp500_perf_data_df.isnull().sum()

Open         13
High         13
Low          13
Close        13
Adj_Close    13
Volume       13
Ticker        0
dtype: int64

In [14]:
# Drop null values
snp500_perf_clean_data_df = snp500_perf_data_df.dropna()
snp500_perf_clean_data_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj_Close,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-25,12.090000,12.280000,12.030000,12.240000,11.394887,4928300.0,ATVI
2012-05-29,12.250000,12.290000,11.910000,11.960000,11.134218,10601200.0,ATVI
2012-05-30,11.870000,12.010000,11.810000,11.910000,11.087672,9295300.0,ATVI
2012-05-31,11.910000,11.950000,11.630000,11.740000,10.929410,11647000.0,ATVI
2012-06-01,11.630000,11.860000,11.520000,11.650000,10.845624,8971200.0,ATVI
...,...,...,...,...,...,...,...
2022-05-18,75.480003,75.709999,74.650002,74.889999,74.889999,3151900.0,XEL
2022-05-19,74.800003,75.470001,73.839996,75.010002,75.010002,2954900.0,XEL
2022-05-20,75.040001,75.150002,73.620003,74.220001,74.220001,4574000.0,XEL
2022-05-23,75.269997,75.269997,73.959999,74.870003,74.870003,2722700.0,XEL


In [15]:
# Check again columns and the rows that are not null
snp500_perf_clean_data_df.count()

Open         1226975
High         1226975
Low          1226975
Close        1226975
Adj_Close    1226975
Volume       1226975
Ticker       1226975
dtype: int64

In [16]:
# Check again columns and the rows that are not null
snp500_perf_clean_data_df.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj_Close    0
Volume       0
Ticker       0
dtype: int64

In [17]:
# Combine the data into a single dataset
snp500_data_complete_df = pd.merge(snp500_info_clean_df, snp500_perf_clean_data_df, how="right", on=["Ticker","Ticker"])
snp500_data_complete_df

Unnamed: 0,Ticker,Company,GICS_Sector,GICS_Sub-Industry,Headquarters,Date_added,Open,High,Low,Close,Adj_Close,Volume
0,ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,12.090000,12.280000,12.030000,12.240000,11.394887,4928300.0
1,ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,12.250000,12.290000,11.910000,11.960000,11.134218,10601200.0
2,ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,11.870000,12.010000,11.810000,11.910000,11.087672,9295300.0
3,ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,11.910000,11.950000,11.630000,11.740000,10.929410,11647000.0
4,ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,11.630000,11.860000,11.520000,11.650000,10.845624,8971200.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1226970,XEL,Xcel Energy,Utilities,Multi-Utilities,"Minneapolis, Minnesota",1957-03-04,75.480003,75.709999,74.650002,74.889999,74.889999,3151900.0
1226971,XEL,Xcel Energy,Utilities,Multi-Utilities,"Minneapolis, Minnesota",1957-03-04,74.800003,75.470001,73.839996,75.010002,75.010002,2954900.0
1226972,XEL,Xcel Energy,Utilities,Multi-Utilities,"Minneapolis, Minnesota",1957-03-04,75.040001,75.150002,73.620003,74.220001,74.220001,4574000.0
1226973,XEL,Xcel Energy,Utilities,Multi-Utilities,"Minneapolis, Minnesota",1957-03-04,75.269997,75.269997,73.959999,74.870003,74.870003,2722700.0


In [18]:
# Convert the dataframe to csv file
# Index is False as we don't want to write index in csv file
snp500_data_complete_df.to_csv("snp500_data_complete.csv", mode="w", index=False) 

In [19]:
# Check dtype
snp500_data_complete_df.dtypes

Ticker                object
Company               object
GICS_Sector           object
GICS_Sub-Industry     object
Headquarters          object
Date_added            object
Open                 float64
High                 float64
Low                  float64
Close                float64
Adj_Close            float64
Volume               float64
dtype: object