**Data Science - web scraper 4**

Aim of the file:

1.   Take results and turn them into a dataframe.
2.   Save this as a CSV.

In [None]:
# // 1.  Import packages that we need:
import numpy as np
import pandas as pd
# // Web scraping: 
import requests
import string
from bs4 import BeautifulSoup
# // OS. Sometimes need this for finding working directory:
import os
# ////////////////////////////////////////////////////////////////

Here is our code so far:

In [None]:
# // Set the base url:
url_base = "https://www.google.com/finance/quote/{}:LON"

# // Pick the tickers that we want to inject into this url:
tickers = ['LLOY', 'NWG', 'BARC', 'HSBA', 'STAN', 'VMUK']

# // Create an empty array that is going to house the results, with one slot per ticker.
# // We need to tell Python this array needs to be able to hold objects, hence dtype=object.
# // This is because we are not going to put just one number, or one piece of string, into each position.
# // Rather, each slot of this array is going to hold a list with the individual scraping results:
data = np.empty(len(tickers), dtype='object')

# // Begin a loop, dealing with the tickers one by one.
# // enumerate gives us the index (s) and the ticker (t) together, so there is no need to
# // search the list with tickers.index(t), which would find the wrong slot for a repeated ticker:
for s, t in enumerate(tickers):

    # // Build the URL for this iteration of the loop:
    URL = url_base.format(t)

    # // Request the html from the URL.
    # // The timeout stops the loop waiting forever if the server never answers,
    # // and raise_for_status stops us parsing an error page as if it were real data:
    html = requests.get(URL, timeout=10)
    html.raise_for_status()

    # // Get the soup of this page
    soup = BeautifulSoup(html.content, 'html.parser')

    # // Now get what we want from the page, keeping the text of the first match each time.
    # // [0] raises an IndexError if the page has no matching element.
    # // NOTE(review): the class names are presumably tied to the page's current markup - confirm they still match.
    # // NOTE(review): the recorded output shows an identical change for different tickers, so the first
    # // "JwB6zf" match may not be the stock's own change - confirm.
    name = soup.find_all("h1")[0].text
    price = soup.find_all("div", class_="YMlKec fxKbKc")[0].text
    change = soup.find_all("div", class_="JwB6zf")[0].text

    # // Group together (the ticker symbol itself comes from t):
    results = [t, name, price, change]

    # // Sense check: print out what we have at this point in the loop.
    # // (A bare variable name only displays when it is the last line of a cell, so use print inside a loop.)
    print(s, t, results)

    # // Fill these results in to the master array of results, at the index value of this ticker:
    data[s] = results

Now turn this into a pandas dataframe:


In [18]:
# // Wrap the object array in a DataFrame.
# // Each scraped list stays whole inside one cell, so the frame has a single column (named 0):
df = pd.DataFrame(data=data)
df

Unnamed: 0,0
0,"[LLOY, Lloyds Banking Group PLC, GBX 46.24, 0...."
1,"[NWG, Natwest Group PLC, GBX 213.00, 0.00%]"
2,"[BARC, Barclays PLC, GBX 181.40, 0.00%]"
3,"[HSBA, HSBC Holdings plc, GBX 403.95, 0.00%]"
4,"[STAN, Standard Chartered PLC, GBX 454.20, 0.00%]"
5,"[VMUK, Virgin Money UK PLC, GBX 200.70, 0.00%]"


An alternative method is to build the dataframe within the loop.
The idea is to fill up the dataframe step by step as you deal with each item in your loop.
The following code does this:

In [25]:
# // Set the base url and the tickers that we want to inject into it:
url_base = "https://www.google.com/finance/quote/{}:LON"

tickers = ['LLOY', 'NWG', 'BARC', 'HSBA', 'STAN', 'VMUK']

# // The date we got the data (today, with the time set to midnight).
# // It is the same for every ticker, so work it out once before the loop:
date = pd.to_datetime('today').normalize()

# // Set up a blank DataFrame to fill up:
df = pd.DataFrame()

# // enumerate gives us the index (s) and the ticker (t) together, so there is no need to
# // search the list with tickers.index(t), which would find the wrong column for a repeated ticker:
for s, t in enumerate(tickers):

    # // Build the URL for this iteration of the loop:
    URL = url_base.format(t)

    # // Request the html from the URL.
    # // The timeout stops the loop waiting forever if the server never answers,
    # // and raise_for_status stops us parsing an error page as if it were real data:
    html = requests.get(URL, timeout=10)
    html.raise_for_status()

    # // Get the soup of this page
    soup = BeautifulSoup(html.content, 'html.parser')

    # // Now get what we want from the page, keeping the text of the first match each time.
    # // [0] raises an IndexError if the page has no matching element.
    # // NOTE(review): the class names are presumably tied to the page's current markup - confirm they still match.
    # // NOTE(review): the recorded output shows an identical change for every ticker, so the first
    # // "JwB6zf" match may not be the stock's own change - confirm.
    name = soup.find_all("h1")[0].text
    price = soup.find_all("div", class_="YMlKec fxKbKc")[0].text
    change = soup.find_all("div", class_="JwB6zf")[0].text

    # // Group together:
    results = [date, t, name, price, change]

    # // Convert the list into a one-column dataframe, with the column named 'Data'.
    # // This will change with each iteration of the loop:
    # // [This is why I use subscript t, as a reminder]
    df_t = pd.DataFrame(results, columns=['Data'])

    # // Add this to the master dataframe as a new column.
    # // The column is labelled with the ticker's index number, i.e. 0, 1, 2, ...
    df[s] = df_t['Data']

In [26]:
# // Display the dataframe filled up by the loop: one column per ticker, one row per field.
df

Unnamed: 0,0,1,2,3,4,5
0,2021-08-06 00:00:00,2021-08-06 00:00:00,2021-08-06 00:00:00,2021-08-06 00:00:00,2021-08-06 00:00:00,2021-08-06 00:00:00
1,LLOY,NWG,BARC,HSBA,STAN,VMUK
2,Lloyds Banking Group PLC,Natwest Group PLC,Barclays PLC,HSBC Holdings plc,Standard Chartered PLC,Virgin Money UK PLC
3,GBX 46.43,GBX 213.40,GBX 181.94,GBX 405.85,GBX 455.60,GBX 201.90
4,-0.089%,-0.089%,-0.089%,-0.089%,-0.089%,-0.089%


Transpose this:

In [27]:
# // Swap rows and columns, so that each ticker becomes a row and each field becomes a column:
df = df.transpose()
df

Unnamed: 0,0,1,2,3,4
0,2021-08-06,LLOY,Lloyds Banking Group PLC,GBX 46.43,-0.089%
1,2021-08-06,NWG,Natwest Group PLC,GBX 213.40,-0.089%
2,2021-08-06,BARC,Barclays PLC,GBX 181.94,-0.089%
3,2021-08-06,HSBA,HSBC Holdings plc,GBX 405.85,-0.089%
4,2021-08-06,STAN,Standard Chartered PLC,GBX 455.60,-0.089%
5,2021-08-06,VMUK,Virgin Money UK PLC,GBX 201.90,-0.089%


Add some names to the columns:

In [28]:
# // Give each column a descriptive label, in the same order the values were grouped in the loop:
column_names = ['Date', 'Ticker', 'Company', 'Price', 'Change']
df.columns = column_names
df

Unnamed: 0,Date,Ticker,Company,Price,Change
0,2021-08-06,LLOY,Lloyds Banking Group PLC,GBX 46.43,-0.089%
1,2021-08-06,NWG,Natwest Group PLC,GBX 213.40,-0.089%
2,2021-08-06,BARC,Barclays PLC,GBX 181.94,-0.089%
3,2021-08-06,HSBA,HSBC Holdings plc,GBX 405.85,-0.089%
4,2021-08-06,STAN,Standard Chartered PLC,GBX 455.60,-0.089%
5,2021-08-06,VMUK,Virgin Money UK PLC,GBX 201.90,-0.089%


Now save this as today's stock prices:

In [31]:
# // Write the finished table to a CSV file.
# // The path is relative, so the file is created in the current working directory:
df.to_csv(path_or_buf="todaysStockPrices.csv")

In [None]:
# // TIP: the CSV was saved with a relative file name, so it has gone into the current working directory.
# // This returns the path of that directory, so you can find where the file has gone:
os.getcwd()