**Data Science - web scraper 3**

Aim of the file:

1.   Scrape multiple sites with a similar URL.
2.   Do this efficiently by running a loop over an array.
3.   Collect togeter the results.

In [2]:
# // 1.  Import packages that we need:
import numpy as np
import pandas as pd
# // Web scraping: 
import requests
import string
from bs4 import BeautifulSoup
# // OS. Sometimes need this for finding working directory:
import os
# ////////////////////////////////////////////////////////////////

Introduction: using a base URL and injecting a series of stock tickers into it.

In [4]:
# // Set the base URL: 
url_base = "https://www.google.com/finance/quote/{}:LON"

# // Add an array of tickers, for major UK banks: 
tickers = ['LLOY', 'NWG', 'BARC', 'HSBA', 'STAN', 'VMUK']

# // Create an empty array that we are going to fill, base it on the length of the tickers array
length = len(tickers)
urls = np.empty(length, dtype='S50')

# // Loop across this array:
for t in tickers:
   # // Put the particular ticker into the base URL 
   stockURL = url_base.format(t)
   # // Find the index value of this particular ticker.
   i = tickers.index(t)
   # // Fill the empty url, at the given index value, with the full url for this ticker
   urls[i] = stockURL

# // Print out the urls that we have   
urls

array([b'https://www.google.com/finance/quote/LLOY:LON',
       b'https://www.google.com/finance/quote/NWG:LON',
       b'https://www.google.com/finance/quote/BARC:LON',
       b'https://www.google.com/finance/quote/HSBA:LON',
       b'https://www.google.com/finance/quote/STAN:LON',
       b'https://www.google.com/finance/quote/VMUK:LON'], dtype='|S50')

Using this in a full example:

In [11]:
# // Set the base url:
url_base = "https://www.google.com/finance/quote/{}:LON"

# // Pick the letters that we want to inject into this url:
tickers = ['LLOY', 'NWG', 'BARC', 'HSBA', 'STAN', 'VMUK']

# // Create an empty array that is going to house the results
# // We need to tell Python this array needs to be able to hold objects, hence dtype=object.
# // This is becuase we are not going to put just one number, or one piece of string into position in the array
# // Rather, each part of this array is going to be an array with the individual scrpaing results:
data = np.empty(length, dtype='object')

# // Begin a loop, dealing with this tickers one by one:
for t in tickers:
   
   # // Return the index number of the thing we are working with:
   s = tickers.index(t)
   
   # // Build the URL for this iteration of the loop:
   URL = url_base.format(t)
   
   # // Request the html from the URL:
   html = requests.get(URL)
   
   # // Get the soup of this page
   soup = BeautifulSoup(html.content, 'html.parser')
   
   # // Now get what we want from the page: 
   name = soup.find_all("h1")
   price = soup.find_all("div", class_="YMlKec fxKbKc")
   ticker = soup.find_all("div", class_="COaKTb OTVmSe")
   change = soup.find_all("div", class_="JwB6zf")
   
   name = name[0].text
   price = price[0].text
   change = change[0].text
   
   # // Group together:
   results = [t, name, price, change]
   
   # // Sense check: print out what we have on this point in the loop:
   s
   t
   results

   # // Find the index value of this particular ticker.
   i = tickers.index(t)
   
   # // Fill these results in to a master array of results:
   # // Fill the empty url, at the given index value, with the full url for this ticker
   data[i] = results   

Now examine what we have, and how we can retrive various parts of it:

In [13]:
data

array([list(['LLOY', 'Lloyds Banking Group PLC', 'GBX\xa046.24', '0.00%']),
       list(['NWG', 'Natwest Group PLC', 'GBX\xa0213.00', '0.00%']),
       list(['BARC', 'Barclays PLC', 'GBX\xa0181.40', '0.00%']),
       list(['HSBA', 'HSBC Holdings plc', 'GBX\xa0403.95', '0.00%']),
       list(['STAN', 'Standard Chartered PLC', 'GBX\xa0454.20', '0.00%']),
       list(['VMUK', 'Virgin Money UK PLC', 'GBX\xa0200.70', '0.00%'])],
      dtype=object)

In [14]:
data[1]

['NWG', 'Natwest Group PLC', 'GBX\xa0213.00', '0.00%']

In [15]:
data[0][2]

'GBX\xa046.24'