In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests, sqlite3, time, re

In [2]:
# Function to create SQLite table
def create_table():
    conn = sqlite3.connect('gp_data.db')
    c = conn.cursor()
    c.execute("DROP TABLE IF EXISTS gp_info")  # Delete the existing table
    c.execute('''CREATE TABLE gp_info
                 (Name TEXT, Address TEXT, Phone TEXT, 
                 Accepting_New_Patients BOOLEAN, 
                 Accepts_Out_of_Area_Registrations BOOLEAN, 
                 Online_Registration_Available BOOLEAN,
                 Average_Rating REAL)''')
    conn.commit()
    conn.close()

In [3]:
# Function to insert data into SQLite table
def insert_data(gp_data):
    conn = sqlite3.connect('gp_data.db')
    c = conn.cursor()
    for data in gp_data:
        c.execute("INSERT INTO gp_info VALUES (?, ?, ?, ?, ?, ?, ?)",
                  (data['Name'], data['Address'], data['Phone'],
                   data['Accepting New Patients'], 
                   data['Accepts Out of Area Registrations'], 
                   data['Online Registration Available'],
                   data.get('Average Rating', None)))
    conn.commit()
    conn.close()

In [4]:
# Function to scrape data
def scrape_data():
    # Hardcoded URL for a specific postcode
    url = "https://www.nhs.uk/service-search/find-a-gp/results/SW12%209LQ"

    # Send GET request
    response = requests.get(url)

    # Initialize BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate each GP info using 'results__details' class
    gp_blocks = soup.find_all('div', {'class': 'results__details'})

    gp_data = []
    for block in gp_blocks:
        name = block.find('h2', {'id': lambda x: x and x.startswith('orgname_')}).text.strip()
        address = block.find('p', {'id': lambda x: x and x.startswith('address_')}).text.strip()
        phone = block.find('p', {'id': lambda x: x and x.startswith('phone_')}).text.strip()
        
        tags = block.find_all('strong', {'id': lambda x: x and x.startswith('result_item_')})
        tags_text = [tag.text.strip() for tag in tags]

        gp_data.append({
            'Name': name,
            'Address': address,
            'Phone': phone,
            'Accepting New Patients': 'Accepting new patients' in tags_text,
            'Accepts Out of Area Registrations': 'Accepts out of area registrations' in tags_text,
            'Online Registration Available': 'Online registration available' in tags_text
        })
    
    return gp_data

# Create the table
create_table()

# Perform the scrape
result = scrape_data()

# Insert data into SQLite database
insert_data(result)

# Run the scrape_data function and print the result for manual inspection
result = scrape_data()
print(result)

[{'Name': 'Clapham Park Group Practice', 'Address': '72 Clarence Avenue, London, Greater London, SW4 8JP', 'Phone': '02086785420', 'Accepting New Patients': True, 'Accepts Out of Area Registrations': False, 'Online Registration Available': False}, {'Name': 'Thurleigh Road Practice', 'Address': '88A Thurleigh Road, Balham, London, Greater London, SW12 8TT', 'Phone': '02086753521', 'Accepting New Patients': True, 'Accepts Out of Area Registrations': False, 'Online Registration Available': False}, {'Name': 'Open Door Surgery', 'Address': '47 Boundaries Road, Balham, London, Greater London, SW12 8EU', 'Phone': '02086731476', 'Accepting New Patients': True, 'Accepts Out of Area Registrations': True, 'Online Registration Available': False}, {'Name': 'BEDFORD HILL FAMILY PRACTICE', 'Address': '120-124 Bedford Hill, Balham, London, Greater London, SW12 9HS', 'Phone': '02086731720', 'Accepting New Patients': True, 'Accepts Out of Area Registrations': False, 'Online Registration Available': Fals

In [5]:
# Function to scrape initial GP URLs
def scrape_gp_links():
    url = "https://www.nhs.uk/service-search/find-a-gp/results/SW12%209LQ"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    gp_links = []
    
    # Locate each GP details link
    for link in soup.find_all('a', {'class': 'nhsapp-open-in-webview'}):
        href = link.get('href')
        if not (href.startswith('javascript') or href.startswith('#')):
            gp_links.append(href)

    return gp_links

# Test the scrape_gp_links function
gp_links = scrape_gp_links()
print(gp_links[:5])  # Print first 5 links to verify

['https://www.nhs.uk/services/gp-surgery/clapham-park-group-practice/XG85109', 'https://www.nhs.uk/services/gp-surgery/thurleigh-road-practice/XH85114', 'https://www.nhs.uk/services/gp-surgery/open-door-surgery/XH85087', 'https://www.nhs.uk/services/gp-surgery/bedford-hill-family-practice/XH85009', 'https://www.nhs.uk/services/gp-surgery/balham-health-centre/XH85637']


In [6]:
# Scrape Reviews Function
def scrape_reviews(gp_links):
    for gp_url in gp_links:
        review_url = f"{gp_url}/ratings-and-reviews"
        print(f"Scraping reviews from: {review_url}")
        
        try:
            response = requests.get(review_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract GP name from webpage (assuming it's in an h1 tag, you may need to adjust)
            gp_name = soup.find('h1').text.strip()
            
            review_blocks = soup.find_all('div', {'class': 'org-review'})
            print(f"Found {len(review_blocks)} review blocks")
            
            ratings = []
            for block in review_blocks:
                rating_text = block.find('p', {'id': re.compile(r'star-rating-.*')}).text.strip()
                rating_value = float(rating_text.split(' ')[1])
                ratings.append(rating_value)
            
            if ratings:
                average_rating = sum(ratings) / len(ratings)
                update_db(gp_name, average_rating)  # Using the GP name here
            
        except requests.HTTPError as e:
            print(f"Could not fetch reviews for {gp_url}: {e}")

In [7]:
# Function to update database
def update_db(gp_name, avg_rating):
    try:
        conn = sqlite3.connect('gp_data.db')
        c = conn.cursor()
        c.execute("UPDATE gp_info SET 'Average_Rating' = ? WHERE Name = ?", (avg_rating, gp_name))
        conn.commit()
        print(f"Updated {gp_name} with average rating {avg_rating}")  # Debug line
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        if conn:
            conn.close()

In [8]:
# Create the table
create_table()

# Perform the scrape
result = scrape_data()

# Insert data into SQLite database
insert_data(result)

# Scrape the GP links
gp_links = scrape_gp_links()

# Scrape the reviews
scrape_reviews(gp_links)

Scraping reviews from: https://www.nhs.uk/services/gp-surgery/clapham-park-group-practice/XG85109/ratings-and-reviews
Found 4 review blocks
Updated Clapham Park Group Practice 
                 -

            Ratings and reviews with average rating 5.0
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/thurleigh-road-practice/XH85114/ratings-and-reviews
Found 0 review blocks
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/open-door-surgery/XH85087/ratings-and-reviews
Found 2 review blocks
Updated Open Door Surgery
                 -

            Ratings and reviews with average rating 2.0
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/bedford-hill-family-practice/XH85009/ratings-and-reviews
Found 9 review blocks
Updated BEDFORD HILL FAMILY PRACTICE
                 -

            Ratings and reviews with average rating 3.111111111111111
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/balham-health-centre/XH85637/ratings-and-reviews


Found 1 review blocks
Updated St Johns Hill Surgery
                 -

            Ratings and reviews with average rating 1.0
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/haider-practice/XH85075/ratings-and-reviews
Found 0 review blocks
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/beckett-house-practice/XG85100/ratings-and-reviews
Found 10 review blocks
Updated Beckett House Practice
                 -

            Ratings and reviews with average rating 4.2
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/the-grantham-centre-practice/XY00020/ratings-and-reviews
Found 2 review blocks
Updated The Grantham Centre Practice
                 -

            Ratings and reviews with average rating 1.5
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/clapham-junction-medical-practice/XH85088001/ratings-and-reviews
Found 0 review blocks
Scraping reviews from: https://www.nhs.uk/services/gp-surgery/clapham-junction-medical-practice/XH

In [9]:
import sqlite3
import pandas as pd

def preview_complete_table():
    conn = sqlite3.connect('gp_data.db')
    query = "SELECT * FROM gp_info"
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    num_rows = len(df)
    print(f"Number of rows: {num_rows}")
    print("Complete Table Preview:")
    print(df)

# Uncomment the next line to run the function
preview_complete_table()

Number of rows: 50
Complete Table Preview:
                                      Name  \
0              Clapham Park Group Practice   
1                  Thurleigh Road Practice   
2                        Open Door Surgery   
3             BEDFORD HILL FAMILY PRACTICE   
4                     Balham Health Centre   
5                  Clapham Family Practice   
6                   Grafton Square Surgery   
7                    Edith Cavell Practice   
8              Hetherington Group Practice   
9               Bolingbroke Medical Centre   
10                    Dr Curran & Partners   
11                       Sandmere Practice   
12                 Streatham High Practice   
13               Wandsworth Medical Centre   
14            TOOTING SOUTH MEDICAL CENTRE   
15                Akerman Medical Practice   
16                  Central Medical Centre   
17             Kings College Health Centre   
18                    Nightingale Practice   
19                 Streatham Place Su