In [4]:
# Installing the Firefox browser driver and importing Python libraries

import json
import time

from selenium.webdriver import Firefox
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.keys import Keys

from webdriver_manager.firefox import GeckoDriverManager

def get_browser_firefox():
    return Firefox(service=FirefoxService(GeckoDriverManager().install()))

In [6]:
# Definition 1 - Creating definitions to extract the names of charity organizations and links to their subpages - the top 20 places in the ranking.

def get_organization_data(org):
    page = 1
    while page == 1:
        browser = get_browser_firefox()
        browser.implicitly_wait(10)
        browser.get(org)
        results = []
        try:
            next_btn = browser.find_element('css selector', 'div.banner-actions-container button#onetrust-accept-btn-handler')
            next_btn.click()
        except:
            continue
        companies = browser.find_elements('css selector', 'ul.rankings-entities-list li')
        for company in companies:
            url = company.find_element('css selector', 'a.rankings-entities-list-item.focus-state.ng-star-inserted')
            name = company.find_element('css selector', 'span:nth-child(3)')
            url_data = {
                    'url': url.get_attribute('href'),
                    'company_name': name.text
            }
            results.append(url_data)   
        page += 1
        browser.quit()
        return results

In [8]:
# Starting up - Definition 1 

link_org = 'https://yougov.co.uk/ratings/politics/popularity/charities-organisations/all'
charity_org = get_organization_data(link_org)

charity_org

[{'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Macmillan_Cancer_Support',
  'company_name': 'Macmillan Cancer Support'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/St_John_Ambulance',
  'company_name': 'St. John Ambulance'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Cancer_Research_UK',
  'company_name': 'Cancer Research UK'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/British_Heart_Foundation',
  'company_name': 'British Heart Foundation'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Great_Ormond_Street_Hospital',
  'company_name': 'Great Ormond Street Hospital'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Alzheimers_Research_UK',
  'company_name': "Alzheimer's Research UK"},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/National_Trust',
  'company_name': 'National Trust'},
 {'url': 'https://yougov.co.uk/topics/health/e

In [52]:
# Definition 2 - Creating and verifying the correctness of a definition that extracts statistical data for only one organization.

def get_organization_detailed_data(org):
    page = 1
    while page == 1:
        browser = get_browser_firefox()
        browser.implicitly_wait(10)
        browser.get(org)
        results = []
        try:
            next_btn = browser.find_element('css selector', 'div.banner-actions-container button#onetrust-accept-btn-handler')
            next_btn.click()
        except:
            continue
        companies = browser.find_elements('css selector', 'div.entity-header-lines')
        for company in companies:
            fame = company.find_element('css selector', 'div.entity-header-line:nth-child(1) > div:nth-child(1) > div:nth-child(2)')
            pop = company.find_element('css selector', 'div.entity-header-line:nth-child(2) > div:nth-child(1) > div:nth-child(2)')
            dislike = company.find_element('css selector', 'div.entity-header-line:nth-child(3) > div:nth-child(1) > div:nth-child(2)')
            neutral = company.find_element('css selector', 'div.entity-header-line:nth-child(4) > div:nth-child(1) > div:nth-child(2)')
            url_data = {
                'Fame': fame.text,
                'Popularity': pop.text,
                'Disliked by': dislike.text,
                'Neutral': neutral.text   
            }
            results.append(url_data)
        page += 1
        return results, browser.quit()

In [54]:
# Starting up - Definition 2 

link_org = 'https://yougov.co.uk/topics/health/explore/not-for-profit/Macmillan_Cancer_Support'
stat_org = get_organization_detailed_data(link_org)

stat_org

([{'Fame': '97%', 'Popularity': '90%', 'Disliked by': '0%', 'Neutral': '7%'}],
 None)

In [56]:
# Definition 3 - Creating a definition that extracts data for all organizations.

def get_data(org):
    n = 0
    for n in range(0,19):
        browser = get_browser_firefox()
        browser.implicitly_wait(15)
        browser.get(org)
        results = []
        try:
            next_btn = browser.find_element('css selector', 'div.banner-actions-container button#onetrust-accept-btn-handler')
            next_btn.click()
        except:
            continue
        companies = browser.find_elements('css selector', 'ul.rankings-entities-list li')
        for company in companies:
            url = company.find_element('css selector', 'a.rankings-entities-list-item.focus-state.ng-star-inserted')
            name = company.find_element('css selector', 'span:nth-child(3)')
            url_data = {
                    'url': url.get_attribute('href'),
                    'company_name': name.text
            }
            results.append(url_data)

            time.sleep(5)
            stat_link = url.get_attribute('href')                                  
            browser = get_browser_firefox()
            browser.implicitly_wait(15)
            browser.get(stat_link)
            try:
                next_btn = browser.find_element('css selector', 'div.banner-actions-container button#onetrust-accept-btn-handler')
                next_btn.click()
            except:
                continue
            comps = browser.find_elements('css selector','div.entity-header-lines')
            for comp in comps:
                fame = comp.find_element('css selector', 'div.entity-header-line:nth-child(1) > div:nth-child(1) > div:nth-child(2)')
                pop = comp.find_element('css selector', 'div.entity-header-line:nth-child(2) > div:nth-child(1) > div:nth-child(2)')
                dislike = comp.find_element('css selector', 'div.entity-header-line:nth-child(3) > div:nth-child(1) > div:nth-child(2)')
                neutral = comp.find_element('css selector', 'div.entity-header-line:nth-child(4) > div:nth-child(1) > div:nth-child(2)')
                url_data = {
                     'Fame': fame.text,
                     'Popularity': pop.text,
                     'Disliked by': dislike.text,
                     'Neutral': neutral.text   
                 }
                results.append(url_data)
                browser.quit()
        n+=1
        browser.quit()
        return results

In [58]:
# Starting up - Definition 3

link_org = 'https://yougov.co.uk/ratings/politics/popularity/charities-organisations/all'
all_org = get_data(link_org)

all_org

ValueError: response body:
{"message":"API rate limit exceeded for 45.129.254.142. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)","documentation_url":"https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}

request url:
https://api.github.com/repos/mozilla/geckodriver/releases/latest
response headers:
{'Date': 'Fri, 08 Aug 2025 14:21:03 GMT', 'Server': 'Varnish', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'", 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-RateLimit-Used, X-RateLimit-Resource, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, Deprecation, Sunset', 'Content-Type': 'application/json; charset=utf-8', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'X-GitHub-Media-Type': 'github.v3; format=json', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1754665446', 'X-RateLimit-Resource': 'core', 'X-RateLimit-Used': '60', 'Content-Length': '280', 'X-GitHub-Request-Id': 'E5D5:317B22:264FF79:23CCD9E:689607CF'}


In [44]:
# Saving data to a .json file

with open('charity_organizations.json', 'w') as file:
    file.write(json.dumps(all_org))

In [46]:
# Checking whether the data has been saved correctly

with open('charity_organizations.json') as file:
    org_data = json.loads(file.read())

org_data

[{'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Macmillan_Cancer_Support',
  'company_name': 'Macmillan Cancer Support'},
 {'Fame': '97%', 'Popularity': '90%', 'Disliked by': '0%', 'Neutral': '7%'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/St_John_Ambulance',
  'company_name': 'St. John Ambulance'},
 {'Fame': '96%', 'Popularity': '87%', 'Disliked by': '1%', 'Neutral': '8%'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Cancer_Research_UK',
  'company_name': 'Cancer Research UK'},
 {'Fame': '98%', 'Popularity': '87%', 'Disliked by': '3%', 'Neutral': '8%'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/British_Heart_Foundation',
  'company_name': 'British Heart Foundation'},
 {'Fame': '97%', 'Popularity': '85%', 'Disliked by': '2%', 'Neutral': '10%'},
 {'url': 'https://yougov.co.uk/topics/health/explore/not-for-profit/Great_Ormond_Street_Hospital',
  'company_name': 'Great Ormond Street Hospital'}