In [2]:
# Project: Do fuel price changes affect political support?

# Edward Natusch - Data Science Project - University of Bristol 

# In this notebook, I will scrape UK Conservative Party polls from Savanta Comres, using Beautiful Soup.

# I will then clean this data using pandas and create monthly averages for conservative voting intention (as the polls are published irregularly).

# I will then download the 'CPIH INDEX: Liquid fuels, vehicle fuels & lubricants' dataset from the Office for National Statistics API.

# I will the merge the data sets, creating a CSV file with the conservative polling data and the CPIH Fuel Price Index for each available month.

# I will then visualise this data in vega-lite. The visualisation can be found on my project page at https://edward-natusch.github.io/project.html.

# The Westminster Voting Intention Polls from Savanta Comres and their methodology can be found here: https://comresglobal.com/poll-category/voting-intention/.

# The CPIH INDEX: Liquid fuels, vehicle fuels & lubricants dataset (produced monthly) and its methodology can be found here: https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l5jl/mm23.


In [3]:
# Importing the required packages 

import numpy as np

from urllib.request import Request, urlopen

import pandas as pd

import requests

from bs4 import BeautifulSoup

import os

In [4]:
# Using beautiful soup to read the html data from comresglobal

url = 'https://comresglobal.com/poll-category/voting-intention/'

request = Request(url , headers ={'User-Agent':'Mozilla/5.0'})

html = urlopen(request).read()

soup = BeautifulSoup(html, 'html.parser')

# Creating an array 'links' to store the links found in the html data 

links = []

for link in soup.find_all('a'):
   links.append(link.get('href'))

# Creating an array 'VotingIntentionLinks' to store all the links, which include Westminster Voting Intention data

VotingIntentionLinks= []

for i in range(0,len(links)):
  if '/westminster-voting-intention-' in links[i]:
    VotingIntentionLinks.append(links[i])
    print(links[i])

# Creating two arrays 'Polls' and 'Text' to hold 

Polls = []

Text = []

# Looping through all the Westminster Voting Intention links and storing the text data in the array 'Polls'

for i in range(0,len(VotingIntentionLinks)):
  url = (VotingIntentionLinks[i])
  request = Request(url , headers ={'User-Agent':'Mozilla/5.0'})
  html = urlopen(request).read()
  soup = BeautifulSoup(html, 'html.parser')
  soup_li = soup.find_all("div", {"class": "col-md-7"})
  Polls.append(soup_li[0].text)
  print(soup_li[0].text)

https://comresglobal.com/polls/westminster-voting-intention-daily-express-17-december/
https://comresglobal.com/polls/westminster-voting-intention-6-october/
https://comresglobal.com/polls/westminster-voting-intention-7-december/
https://comresglobal.com/polls/westminster-voting-intention-daily-mail-12-november/
https://comresglobal.com/polls/westminster-voting-intention-9th-november-2021/
https://comresglobal.com/polls/westminster-voting-intention-4th-november-2021/
https://comresglobal.com/polls/westminster-voting-intention-27th-october-2021/
https://comresglobal.com/polls/westminster-voting-intention-14th-october-2021/
https://comresglobal.com/polls/westminster-voting-intention-15th-september-2021/
https://comresglobal.com/polls/westminster-voting-intention-19th-august-2021/
https://comresglobal.com/polls/westminster-voting-intention-25th-august-2021/
https://comresglobal.com/polls/westminster-voting-intention-2nd-september-2021/
https://comresglobal.com/polls/westminster-voting-int

In [7]:
# Creating a variable a to store the length of the array 'Polls'

a = len(Polls)


# Extracting polling data for the conservative party 

count = 0
found = False

# Creating two arrays, the first stores the polling value and the second stores the date the poll was published 

Conservative_poll_values = []
Conservative_poll_dates = []

# For loop, loops through the text data, looking for a match for 'Con' and stores the value of the polling data in the array 'Conservative_poll_values'

# Additionally, looks for a match with 'Date Published' and stores the date in the array 'Conservative_poll_dates' 

for i in range(0,a):
  b=len(Polls[i])
  found = False
  for j in range(0,b):
    if((Polls[i][j] =='C' and Polls[i][j+1] == 'o'and Polls[i][j+2] == 'n' and Polls[i][j+3] == ' ' and Polls[i][j+4] != 'v') or (Polls[i][j] =='C' and Polls[i][j+1] == 'O'and Polls[i][j+2] == 'N' and Polls[i][j+3] != 'T')):
      print(Polls[i][j+4:j+8])
      Conservative_poll_values.append(Polls[i][j+4:j+8])
      count = count + 1
      found = True

    if((found == True) and (Polls[i][j:j+14] == 'Date Published')):
      Conservative_poll_dates.append(Polls[i][j+15:j+25])
    
  

34 (
40 (
38 (
34 (
 38%
40 (
 37%
40 (
39 (
41 (
41 (
40 (
41 (
40 (
40 (
40 (
41 (
42% 
44 (
44% 
42 (
43 (
42 (
42 (
42% 
42% 
42% 
42% 
43 (
40 (
- 41
39 (
39%



In [9]:
# Inputting the scraped polling data into a table and cleaning

# Creating a data frame to store Conservative poll data

df1 = pd.DataFrame([Conservative_poll_values,Conservative_poll_dates]).T
df1.columns = ['Conservative Poll','Date']

# Cleaning the data

df1['Conservative Poll'] = df1['Conservative Poll'].str.replace('(',' ')
df1['Conservative Poll'] = df1['Conservative Poll'].str.replace('%',' ')
df1['Conservative Poll'] = df1['Conservative Poll'].str.replace('\n',' ')
df1['Conservative Poll'] = df1['Conservative Poll'].str.replace('-',' ')
df1['Date'] = df1['Date'].str.replace('\n',' ')


df1

Unnamed: 0,Conservative Poll,Date
0,34,17/12/2021
1,40,6/10/2021
2,38,07/12/2021
3,34,12/11/2021
4,38,09/11/2021
5,40,04/11/2021
6,37,27/10/2021
7,40,14/10/2021
8,39,15/09/2021
9,41,19/08/2021


In [10]:
# Creating monthly averages for consevative polling data

df1["Date"] = pd.to_datetime(df1["Date"])

df1['Conservative Poll'] = pd.to_numeric(df1['Conservative Poll'], errors ='coerce')

DF_Conservative_Approval_Monthly = df1.resample('M', on='Date').mean().reset_index()

DF_Conservative_Approval_Monthly['Date'] = pd.DatetimeIndex(DF_Conservative_Approval_Monthly['Date']) + pd.DateOffset(1)

DF_Conservative_Approval_Monthly

Unnamed: 0,Date,Conservative Poll
0,2020-03-01,39.0
1,2020-04-01,42.0
2,2020-05-01,
3,2020-06-01,
4,2020-07-01,
5,2020-08-01,
6,2020-09-01,
7,2020-10-01,
8,2020-11-01,39.0
9,2020-12-01,


In [12]:
# Downloading the 'CPIH INDEX: Liquid fuels, vehicle fuels & lubricants' dataset from the Office for National Statistics API in JSON format 

url = 'https://api.ons.gov.uk/timeseries/L5JL/dataset/MM23/data'
html = requests.get(url)
ONS_Fuel_Index_Data = html.json()
Months = ONS_Fuel_Index_Data['months']

In [14]:
# Appending the observations from the date and value categories in the JSON file and storing them in a dataframe. 

ONS_ARRAY=[]
for month in Months:
  temp_variable={'date':month['date'],'value':month['value']}
  ONS_ARRAY.append(temp_variable)

DF_FUEL_INDEX=pd.DataFrame(ONS_ARRAY)

DF_FUEL_INDEX["date"] = pd.to_datetime(DF_FUEL_INDEX["date"])

DF_FUEL_INDEX.columns = ['Date','CPIH INDEX: Liquid fuels, vehicle fuels & lubricants (G) 2015=100']

DF_FUEL_INDEX['CPIH INDEX: Liquid fuels, vehicle fuels & lubricants (G) 2015=100'] = pd.to_numeric(DF_FUEL_INDEX['CPIH INDEX: Liquid fuels, vehicle fuels & lubricants (G) 2015=100'], errors ='coerce')

DF_FUEL_INDEX

Unnamed: 0,Date,"CPIH INDEX: Liquid fuels, vehicle fuels & lubricants (G) 2015=100"
0,1988-01-01,30.4
1,1988-02-01,30.0
2,1988-03-01,29.9
3,1988-04-01,30.5
4,1988-05-01,30.5
...,...,...
402,2021-07-01,119.6
403,2021-08-01,121.1
404,2021-09-01,121.6
405,2021-10-01,126.2


In [16]:
# Merging the data from the two data sources to create a single data set 

DF_Conservative_Approval_Monthly_With_Fuel_Index = DF_Conservative_Approval_Monthly.merge(DF_FUEL_INDEX, left_on='Date', right_on='Date')

DF_Conservative_Approval_Monthly_With_Fuel_Index.dropna(subset = ["Conservative Poll"], inplace=True)

# Filtering for 2021

DF_Conservative_Approval_Monthly_With_Fuel_Index = DF_Conservative_Approval_Monthly_With_Fuel_Index[(DF_Conservative_Approval_Monthly_With_Fuel_Index['Date'].dt.year > 2020)]

DF_Conservative_Approval_Monthly_With_Fuel_Index

Unnamed: 0,Date,Conservative Poll,"CPIH INDEX: Liquid fuels, vehicle fuels & lubricants (G) 2015=100"
11,2021-02-01,41.0,109.2
12,2021-03-01,41.0,112.3
13,2021-04-01,41.0,113.4
14,2021-05-01,41.333333,115.1
15,2021-06-01,43.0,117.4
16,2021-07-01,41.8,119.6
17,2021-08-01,39.333333,121.1
18,2021-09-01,41.0,121.6
19,2021-10-01,40.333333,126.2
20,2021-11-01,39.666667,132.6


In [17]:
# Creating a month field

DF_Conservative_Approval_Monthly_With_Fuel_Index['Month'] = pd.DatetimeIndex(DF_Conservative_Approval_Monthly_With_Fuel_Index['Date']).month

# Exporting the data to csv

DF_Conservative_Approval_Monthly_With_Fuel_Index.to_csv("Project_Conservative_Approval_Monthly_With_Fuel_Index.csv")