In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests, re
import pandas as pd
from geopy.geocoders import Nominatim
import os

In [2]:
# Function to scrape GP data
def _text_or_empty(element):
    """Return the stripped text of a parsed element, or '' when the element is missing."""
    return element.text.strip() if element is not None else ''


def scrape_data(url, timeout=10):
    """Download a GP search-results page and extract one record per practice.

    Every ``<div class="results__details">`` on the page is treated as one
    practice. Within it, the name is read from the ``<h2>`` whose id starts
    with ``orgname_``, the address and phone from the ``<p>`` elements whose
    ids start with ``address_`` and ``phone_``, and the feature tags from the
    ``<strong>`` elements whose ids start with ``result_item_``.

    Parameters
    ----------
    url : str
        Address of the results page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of dict
        One dict per practice with the keys ``'Name'``, ``'Address'``,
        ``'Phone'`` (strings; ``''`` when the element is absent from the
        block) and the booleans ``'Accepting New Patients'``,
        ``'Accepts Out of Area Registrations'`` and
        ``'Online Registration Available'``.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (connection failure, timeout, or an
        HTTP error status).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    def id_prefix(prefix):
        # Attribute filter matching elements whose id starts with `prefix`;
        # elements without an id are passed to the lambda as None.
        return {'id': lambda value: bool(value) and value.startswith(prefix)}

    gp_data = []
    for block in soup.find_all('div', {'class': 'results__details'}):
        tags_text = [
            tag.text.strip()
            for tag in block.find_all('strong', id_prefix('result_item_'))
        ]
        gp_data.append({
            # A block lacking one of these elements yields '' rather than
            # aborting the whole scrape with an AttributeError.
            'Name': _text_or_empty(block.find('h2', id_prefix('orgname_'))),
            'Address': _text_or_empty(block.find('p', id_prefix('address_'))),
            'Phone': _text_or_empty(block.find('p', id_prefix('phone_'))),
            'Accepting New Patients': 'Accepting new patients' in tags_text,
            'Accepts Out of Area Registrations': 'Accepts out of area registrations' in tags_text,
            'Online Registration Available': 'Online registration available' in tags_text
        })
    return gp_data

# Function to scrape initial GP URLs
def scrape_gp_links(url, timeout=10):
    """Collect the practice page links found on a GP search-results page.

    Looks at every ``<a class="nhsapp-open-in-webview">`` element and keeps
    its ``href`` unless it is missing, empty, a ``javascript`` pseudo-link or
    an in-page ``#`` anchor. Relative links are resolved against ``url``;
    absolute links are returned unchanged.

    Parameters
    ----------
    url : str
        Address of the results page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str
        The retained links, in page order.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (connection failure, timeout, or an
        HTTP error status).
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    gp_links = []
    for link in soup.find_all('a', {'class': 'nhsapp-open-in-webview'}):
        href = link.get('href')
        # link.get returns None for anchors without an href attribute, so
        # check for a usable value before calling str methods on it.
        if href and not href.startswith(('javascript', '#')):
            gp_links.append(urljoin(url, href))
    return gp_links

# Scrape Reviews Function
def _parse_star_rating(text):
    """Return the second space-separated token of `text` as a float, or None if that fails."""
    tokens = text.strip().split(' ')
    try:
        return float(tokens[1])
    except (IndexError, ValueError):
        return None


def scrape_reviews(gp_links, timeout=10):
    """Fetch the ratings-and-reviews page of each practice and collect its reviews.

    For every link, ``<link>/ratings-and-reviews`` is downloaded. The practice
    name is the first line of the page's first ``<h1>``. Each
    ``<div class="org-review">`` contributes its ``<p class="comment-text">``
    text as a review (when present) and the number parsed from its
    ``<p id="star-rating-...">`` text as a rating (when present and parseable).

    A practice whose page cannot be fetched, or that has no ``<h1>``, is
    reported with ``print`` and skipped; the remaining practices are still
    processed.

    Parameters
    ----------
    gp_links : iterable of str
        Base URLs of the practice pages.
    timeout : float, optional
        Seconds to wait for each request before giving up (default 10).

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``reviews_data`` with one ``{'GPName', 'ReviewText'}`` dict per review,
        and ``ratings_data`` with one ``{'GPName', 'AverageRating'}`` dict per
        practice that had at least one rating.
    """
    reviews_data = []
    ratings_data = []
    rating_id = re.compile(r'star-rating-.*')  # compiled once, reused for every review block

    for gp_url in gp_links:
        # Strip a trailing slash so the joined URL never contains '//'.
        review_url = f"{gp_url.rstrip('/')}/ratings-and-reviews"
        try:
            response = requests.get(review_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # RequestException also covers connection errors and timeouts,
            # not just HTTP error statuses, so one bad practice cannot abort the run.
            print(f"Could not fetch reviews for {gp_url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        heading = soup.find('h1')
        if heading is None:
            print(f"Could not fetch reviews for {gp_url}: no practice name found on page")
            continue
        gp_name = heading.text.strip().split('\n')[0].strip()

        gp_ratings = []
        for block in soup.find_all('div', {'class': 'org-review'}):
            review_text_block = block.find('p', class_='comment-text')
            if review_text_block:
                review_text = review_text_block.get_text(strip=True)
                reviews_data.append({'GPName': gp_name, 'ReviewText': review_text})

            # A review may lack a star-rating element or carry unexpected
            # text; such reviews simply do not contribute to the average.
            rating_block = block.find('p', {'id': rating_id})
            if rating_block is not None:
                rating_value = _parse_star_rating(rating_block.text)
                if rating_value is not None:
                    gp_ratings.append(rating_value)

        # Only practices with at least one parsed rating get an average.
        if gp_ratings:
            average_rating = sum(gp_ratings) / len(gp_ratings)
            ratings_data.append({'GPName': gp_name, 'AverageRating': average_rating})

    return reviews_data, ratings_data

In [3]:
# Function to create latitude and longitude from addresses using postcode
def geocode_address(address):
    """Look up the latitude and longitude of the postcode contained in an address.

    The first substring of ``address`` matching the UK postcode pattern
    (upper-case, optional single space before the final three characters) is
    sent to the Nominatim geocoder.

    Parameters
    ----------
    address : str
        Free-text address expected to contain a postcode.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's result, or
        ``(None, None)`` when ``address`` is not a string, contains no
        postcode, the geocoder finds nothing, or the geocoder raises an error
        (which is reported with ``print``).
    """
    # Missing values (e.g. None) cannot contain a postcode.
    if not isinstance(address, str):
        return None, None

    # Define the regex pattern for UK postcodes
    postcode_pattern = r'[A-Z]{1,2}[0-9R][0-9A-Z]? ?[0-9][A-Z]{2}'
    postcode_search = re.search(postcode_pattern, address)
    # Bail out before building a geocoder client when there is nothing to look up.
    if not postcode_search:
        return None, None
    postcode = postcode_search.group()

    from geopy.exc import GeopyError

    geolocator = Nominatim(user_agent="GP_Finder_App")
    try:
        location = geolocator.geocode(postcode)
    except GeopyError as e:
        # A timeout or service error for one address should not discard the
        # data already collected for every other practice.
        print(f"Could not geocode {postcode}: {e}")
        return None, None

    if location:
        return location.latitude, location.longitude
    return None, None

In [4]:
# Function to Export Data to CSV
def export_to_csv(data, filename, folder_path):
    """Write ``data`` to ``folder_path/filename`` as a CSV file without the index column.

    Parameters
    ----------
    data : list of dict (or anything ``pandas.DataFrame`` accepts)
        Records to export.
    filename : str
        Name of the CSV file.
    folder_path : str
        Directory to write into; created (with parents) if it does not exist.
        An empty string writes relative to the current working directory.

    Prints a confirmation message containing the full path once written.
    """
    # Construct the full file path
    full_path = os.path.join(folder_path, filename)

    # Create the directory if it doesn't exist. The directory part is empty
    # when both folder_path and filename carry no directory component, and
    # os.makedirs('') raises FileNotFoundError, so skip it in that case.
    target_dir = os.path.dirname(full_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Convert data to DataFrame and save as CSV
    df = pd.DataFrame(data)
    df.to_csv(full_path, index=False)
    print(f"Data exported to {full_path} successfully.")

In [8]:
# Main Execution
if __name__ == "__main__":
    from urllib.parse import quote

    # Folder for the exported CSV files. Set the GP_FINDER_DATA_DIR environment
    # variable to write somewhere else; otherwise the original project
    # location is used.
    folder_path = os.environ.get(
        "GP_FINDER_DATA_DIR",
        "C:/Users/rober/DA/PersonalProjects/GP-Finder/data",
    )

    # Ask the user for a postcode; surrounding whitespace is not part of it.
    postcode = input("Enter a postcode: ").strip()
    if not postcode:
        raise ValueError("A postcode is required to search for GP practices.")

    # Build the search URL. The postcode is percent-encoded so spaces or
    # stray characters such as '/' or '?' cannot alter the URL structure.
    url = f"https://www.nhs.uk/service-search/find-a-gp/results/{quote(postcode, safe='')}"

    # Scrape practice details, practice page links, and the reviews behind those links
    gp_data = scrape_data(url)
    gp_links = scrape_gp_links(url)
    reviews_data, ratings_data = scrape_reviews(gp_links)

    # Geocode the addresses in the GP data (None values when no location is found)
    for gp in gp_data:
        latitude, longitude = geocode_address(gp['Address'])
        gp['Latitude'] = latitude
        gp['Longitude'] = longitude

    # Export GP data to CSV
    export_to_csv(gp_data, 'gp_info.csv', folder_path)

    # Export Reviews data to CSV
    export_to_csv(reviews_data, 'gp_reviews.csv', folder_path)

    # Export Ratings data to CSV
    export_to_csv(ratings_data, 'gp_ratings.csv', folder_path)

    print("Data scraping and export completed.")

Enter a postcode: sw129lq
Data exported to C:/Users/rober/DA/PersonalProjects/GP-Finder/data\gp_info.csv successfully.
Data exported to C:/Users/rober/DA/PersonalProjects/GP-Finder/data\gp_reviews.csv successfully.
Data exported to C:/Users/rober/DA/PersonalProjects/GP-Finder/data\gp_ratings.csv successfully.
Data scraping and export completed.
