# Project 9: Web Scraping, APIs & Wrappers (US Stocks)

## Web Scraping - the Dow Jones Constituents

In [31]:
import pandas as pd

In [32]:
# Scrape every HTML table on the Wikipedia DJIA page and keep the constituents
# table. The table is picked by its column labels instead of a hard-coded
# position, so the cell keeps working if tables are added to or reordered on
# the page.
djia_tables = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")
expected_columns = {"Company", "Exchange", "Symbol", "Industry", "Date added"}
const = next(
    (table for table in djia_tables if expected_columns.issubset(table.columns)),
    None,
)
if const is None:
    # Fail loudly here rather than with a confusing error in a later cell.
    raise ValueError("No table with the expected constituent columns was found on the page.")

In [33]:
# Show the scraped table to check its columns before trimming it.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,2.41%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,3.02%
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31,,5.48%
3,Apple,NASDAQ,AAPL,Information technology,2015-03-19,,2.84%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,3.36%
5,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06,,4.52%
6,Chevron,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,3.50%
7,Cisco,NASDAQ,CSCO,Information technology,2009-06-08,,0.96%
8,Coca-Cola,NYSE,KO,Drink industry,1987-03-12,Also 1932-05-26 to 1935-11-20,1.22%
9,Disney,NYSE,DIS,Broadcasting and entertainment,1991-05-06,,1.89%


In [34]:
# Keep only the first five columns of the scraped table (the trailing columns
# are dropped). .copy() gives an independent frame, so the column edits in the
# following cells do not act on a slice of the original.
const = const.loc[:, const.columns[:5]].copy()
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31
3,Apple,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12
5,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06
6,Chevron,NYSE,CVX,Petroleum industry,2008-02-19
7,Cisco,NASDAQ,CSCO,Information technology,2009-06-08
8,Coca-Cola,NYSE,KO,Drink industry,1987-03-12
9,Disney,NYSE,DIS,Broadcasting and entertainment,1991-05-06


In [35]:
# Rename "Date added" to a valid Python identifier so the column can also be
# reached with attribute access. Reassigning the result (instead of
# inplace=True) avoids mutating the frame in place.
const = const.rename(columns={"Date added": "Date_Added"})

In [36]:
# Parse the Date_Added strings into datetime64 values.
const["Date_Added"] = pd.to_datetime(const["Date_Added"])

In [37]:
# Summarize column dtypes and non-null counts of the constituents table.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


## Normalizing Unicode Strings and Getting the Ticker Symbols

In [38]:
import unicodedata

In [39]:
# Look at the Symbol column before normalizing it.
const.Symbol

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DIS
10     DOW
11      GS
12      HD
13     HON
14     IBM
15    INTC
16     JNJ
17     JPM
18     MCD
19     MRK
20    MSFT
21     NKE
22      PG
23     CRM
24     TRV
25     UNH
26      VZ
27       V
28     WBA
29     WMT
Name: Symbol, dtype: object

In [40]:
# Same column as a plain Python list, which shows each symbol's quoted repr.
const.Symbol.to_list()

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT']

In [41]:
# Preview the NFKD-normalized symbols without modifying the frame.
# The vectorized .str.normalize accessor replaces apply + unicodedata.normalize
# and propagates missing values instead of raising a TypeError on them.
const["Symbol"].str.normalize("NFKD")

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DIS
10     DOW
11      GS
12      HD
13     HON
14     IBM
15    INTC
16     JNJ
17     JPM
18     MCD
19     MRK
20    MSFT
21     NKE
22      PG
23     CRM
24     TRV
25     UNH
26      VZ
27       V
28     WBA
29     WMT
Name: Symbol, dtype: object

In [42]:
# Store the NFKD-normalized symbols back into the frame.
# The vectorized .str.normalize accessor replaces apply + unicodedata.normalize
# and propagates missing values instead of raising a TypeError on them.
const["Symbol"] = const["Symbol"].str.normalize("NFKD")

In [43]:
# Spot-check the symbol stored under index label 0.
const.Symbol[0]

'MMM'

In [46]:
# Derive the bare ticker: split each symbol on ": " and keep the last piece.
# A symbol without that separator is kept unchanged.
# .str[-1] replaces apply(lambda x: x[-1]) and propagates missing values
# instead of raising on them.
const["Ticker"] = const["Symbol"].str.split(": ").str[-1]

In [47]:
# Display the table including the new Ticker column.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_Added,Ticker
0,3M,NYSE,MMM,Conglomerate,1976-08-09,MMM
1,American Express,NYSE,AXP,Financial services,1982-08-30,AXP
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31,AMGN
3,Apple,NASDAQ,AAPL,Information technology,2015-03-19,AAPL
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,BA
5,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06,CAT
6,Chevron,NYSE,CVX,Petroleum industry,2008-02-19,CVX
7,Cisco,NASDAQ,CSCO,Information technology,2009-06-08,CSCO
8,Coca-Cola,NYSE,KO,Drink industry,1987-03-12,KO
9,Disney,NYSE,DIS,Broadcasting and entertainment,1991-05-06,DIS


In [48]:
# Collect the tickers into a plain Python list.
ticker_list = const["Ticker"].tolist()

In [49]:
# Display the ticker list that is later passed to yf.download.
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT']

In [50]:
# Persist the cleaned constituents table; the integer row index is not written.
const.to_csv(path_or_buf="const.csv", index=False)

## Loading and Saving Historical Stock Prices

In [52]:
import pandas as pd
import yfinance as yf
import datetime

In [79]:
# Download the price history of the Dow Jones index (^DJI) from 2007-01-01
# up to today's date, then display the resulting frame.
dji = yf.download(
    tickers="^DJI",
    start="2007-01-01",
    end=datetime.date.today(),
)
dji

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-01-03,12459.540039,12580.349609,12404.820312,12474.519531,12474.519531,327200000
2007-01-04,12473.160156,12510.410156,12403.860352,12480.690430,12480.690430,259060000
2007-01-05,12480.049805,12480.129883,12365.410156,12398.009766,12398.009766,235220000
2007-01-08,12392.009766,12445.919922,12337.370117,12423.490234,12423.490234,223500000
2007-01-09,12424.769531,12466.429688,12369.169922,12416.599609,12416.599609,225190000
...,...,...,...,...,...,...
2023-08-30,34847.800781,35025.570312,34811.738281,34890.238281,34890.238281,236070000
2023-08-31,34909.089844,35070.210938,34719.769531,34721.910156,34721.910156,341900000
2023-09-01,34876.238281,34979.179688,34720.699219,34837.710938,34837.710938,286370000
2023-09-05,34843.218750,34871.261719,34635.628906,34641.968750,34641.968750,283540000


In [80]:
# Summarize the index data: date range, columns, dtypes and non-null counts.
dji.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4198 entries, 2007-01-03 to 2023-09-06
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       4198 non-null   float64
 1   High       4198 non-null   float64
 2   Low        4198 non-null   float64
 3   Close      4198 non-null   float64
 4   Adj Close  4198 non-null   float64
 5   Volume     4198 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 229.6 KB


In [81]:
# Save the index history; the date index is written as the first column.
dji.to_csv(path_or_buf="dji.csv")

In [82]:
# Download the price history of every ticker in ticker_list over the same
# window as the index. The end date is passed as an ISO "YYYY-MM-DD" string.
prices = yf.download(
    tickers=ticker_list,
    start="2007-01-01",
    end=datetime.date.today().isoformat(),
)

[*********************100%%**********************]  30 of 30 completed


In [83]:
# Display the downloaded frame; its columns are (field, ticker) pairs.
prices

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2007-01-03,2.540327,49.945019,46.492004,64.405708,38.978706,9.017500,19.244230,37.490097,28.317102,,...,15740226,76935100,17299200,9717900,3432800,8360300,,21445850,6294500,35687300
2007-01-04,2.596711,52.084480,46.153103,64.665756,38.876736,9.470000,19.750839,37.125603,28.540651,,...,13115930,45774500,15085600,8711400,2068200,5152500,,19215860,3681800,17073000
2007-01-05,2.578220,52.208614,45.544594,64.391296,38.379620,9.880000,19.757784,37.268223,28.308819,,...,11168431,44607200,14996800,9907900,2104600,6215700,,19047041,3680900,13556900
2007-01-08,2.590952,51.792385,45.975941,64.239594,38.424240,9.982500,19.868816,37.743652,28.565489,,...,7384522,50220200,10109600,11068200,2440900,4344100,,20370917,4720800,16396400
2007-01-09,2.806182,52.040657,45.683235,63.560658,38.634548,9.990000,19.757784,37.310493,28.524092,,...,9037114,44636600,15167200,10823800,1319500,5483900,,16281352,3792500,14643200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-30,187.649994,257.880005,160.169998,228.850006,282.329987,215.039993,56.810001,160.179993,84.279999,54.750000,...,4785000,15222100,4822100,3712200,825100,2283400,4573300.0,15021400,5883700,3655900
2023-08-31,187.869995,256.339996,157.990005,224.029999,281.130005,221.460007,57.349998,161.100006,83.680000,54.560001,...,9212000,26411000,7046200,5451400,1436900,4927700,5532600.0,24333200,10794500,6527600
2023-09-01,189.460007,256.709991,159.619995,223.399994,286.250000,221.529999,57.840000,164.300003,81.639999,55.290001,...,4941300,14931200,6359400,4126300,724900,3165000,4111800.0,14744400,33164400,4183500
2023-09-05,189.699997,254.009995,160.000000,222.570007,281.630005,218.690002,57.259998,166.460007,81.190002,54.189999,...,6379900,18553900,6698700,5097000,863300,3443300,4459600.0,17486000,23839200,5069800


In [84]:
# Summarize the multi-ticker download: date range, column count, dtypes, memory.
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4198 entries, 2007-01-03 to 2023-09-06
Columns: 180 entries, ('Adj Close', 'AAPL') to ('Volume', 'WMT')
dtypes: float64(152), int64(28)
memory usage: 5.8 MB


In [85]:
# Keep only the "Close" field (not "Adj Close"), leaving one column per ticker.
# .copy() detaches the result from the full download.
prices = prices["Close"].copy()

In [86]:
# Check per-ticker non-null counts; tickers with missing values show lower counts.
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4198 entries, 2007-01-03 to 2023-09-06
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    4198 non-null   float64
 1   AMGN    4198 non-null   float64
 2   AXP     4198 non-null   float64
 3   BA      4198 non-null   float64
 4   CAT     4198 non-null   float64
 5   CRM     4198 non-null   float64
 6   CSCO    4198 non-null   float64
 7   CVX     4198 non-null   float64
 8   DIS     4198 non-null   float64
 9   DOW     1125 non-null   float64
 10  GS      4198 non-null   float64
 11  HD      4198 non-null   float64
 12  HON     4198 non-null   float64
 13  IBM     4198 non-null   float64
 14  INTC    4198 non-null   float64
 15  JNJ     4198 non-null   float64
 16  JPM     4198 non-null   float64
 17  KO      4198 non-null   float64
 18  MCD     4198 non-null   float64
 19  MMM     4198 non-null   float64
 20  MRK     4198 non-null   float64
 21  MSFT    4198 non-nu

In [87]:
# Display the closing-price table, one column per ticker.
prices

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-03,2.992857,68.400002,60.360001,89.169998,61.160000,9.017500,27.730000,70.970001,33.738300,,...,42.003819,29.860001,12.208750,64.540001,53.549999,52.570000,,35.306732,46.070000,47.549999
2007-01-04,3.059286,71.330002,59.919998,89.529999,61.000000,9.470000,28.459999,70.279999,34.004654,,...,43.043892,29.809999,12.333750,64.050003,53.099998,52.910000,,35.502777,46.160000,47.779999
2007-01-05,3.037500,71.500000,59.130001,89.150002,60.220001,9.880000,28.469999,70.550003,33.728436,,...,42.270992,29.639999,12.353750,63.500000,52.410000,52.549999,,34.895969,45.500000,47.389999
2007-01-08,3.052500,70.930000,59.689999,88.940002,60.290001,9.982500,28.629999,71.449997,34.034248,,...,42.261452,29.930000,12.316250,63.639999,52.020000,53.320000,,34.363850,45.689999,47.000000
2007-01-09,3.306071,71.269997,59.310001,88.000000,60.619999,9.990000,28.469999,70.629997,33.984924,,...,41.870228,29.959999,12.470000,63.480000,51.889999,52.680000,,34.503880,45.930000,47.389999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-30,187.649994,257.880005,160.169998,228.850006,282.329987,215.039993,56.810001,160.179993,84.279999,54.750000,...,110.209999,328.790009,102.099998,154.039993,161.320007,491.529999,246.229996,34.639999,25.600000,161.199997
2023-08-31,187.869995,256.339996,157.990005,224.029999,281.130005,221.460007,57.349998,161.100006,83.680000,54.560001,...,108.980003,327.760010,101.709999,154.339996,161.229996,476.579987,245.679993,34.980000,25.309999,162.610001
2023-09-01,189.460007,256.709991,159.619995,223.399994,286.250000,221.529999,57.840000,164.300003,81.639999,55.290001,...,109.839996,328.660004,102.360001,154.509995,162.300003,476.239990,248.110001,34.860001,23.430000,161.570007
2023-09-05,189.699997,254.009995,160.000000,222.570007,281.630005,218.690002,57.259998,166.460007,81.190002,54.189999,...,107.510002,333.549988,100.320000,152.440002,160.389999,480.809998,245.339996,34.299999,22.730000,160.270004


In [89]:
# Save the closing prices; the date index is written as the first column.
prices.to_csv(path_or_buf="const_prices.csv")