In [1]:
# Installing libraries
!pip install beautifulsoup4
!pip install requests



In [2]:
# Importing relevant libraries
from bs4 import BeautifulSoup, NavigableString
import requests
import re
import pandas as pd
import sqlite3
import string
from nltk.corpus import wordnet as wn
import itertools
import spacy
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# **WEB SCRAPING**

In [3]:
def scrape_page(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with HTTP 200,
        otherwise ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        Not caught here: network errors and timeouts raised by
        ``requests.get`` propagate to the caller.
    """
    # Send a GET request to the URL
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is treated as "page not available"
    if response.status_code != 200:
        return None

    # Parse the HTML content of the page with the built-in parser
    return BeautifulSoup(response.content, 'html.parser')


In [4]:
# Getting the HTML of the landing page using BeautifulSoup
MAIN_URL = "https://www.gov.wales/land-transaction-tax"
doc_main = scrape_page(MAIN_URL)

# scrape_page returns None for any non-200 response. Stop here with a clear
# message instead of failing later with an AttributeError on None.
if doc_main is None:
    raise RuntimeError(f"Could not download the landing page: {MAIN_URL}")

In [5]:
# Collect the target of every link in the first <ul> that follows the
# 'list-group__title' heading of the landing page.
# link_urls is always defined (possibly empty), so code that iterates over it
# cannot hit a NameError when the heading or the list is missing.
link_urls = []

h2_heading = doc_main.find('h2', class_='list-group__title')
link_list = h2_heading.find_next('ul') if h2_heading is not None else None

if link_list is not None:
    # href=True skips anchors that carry no target at all
    link_urls = [link['href'] for link in link_list.find_all('a', href=True)]

if not link_urls:
    print("No guidance links were found on the landing page")

In [6]:
# Parallel lists: position k of each list describes row k of the final table
urls = []
heading = []
que_lst = []
para_lst = []

# Class strings used to locate the relevant <div>s of a page
section_class = "paragraph paragraph--type--content-section paragraph--view-mode--default"
plain_content_class = "paragraph paragraph--type--content paragraph--view-mode--default"

for url in link_urls:
    # Calling the scrape_page function to scrape the url
    doc = scrape_page(url)
    if doc is None:
        # scrape_page returns None for non-200 responses; there is nothing to parse
        print(f"Skipping {url}: page could not be downloaded")
        continue

    # Getting the page heading (first text node of the <h1>, empty if absent)
    h1 = doc.find("h1", class_='page-header__title page-header__title--has-type')
    page_heading = (h1.find(string=True) if h1 is not None else None) or ""

    # Getting the headed sections
    doc_heading = doc.find_all("div", section_class)

    # Getting the documents if they exist
    doc_div = doc.find_all("div", class_="document--accessible document")
    doc_title = doc.find_all("h3", class_="document__title")

    # Getting the button links
    btn_div = doc.find_all("div", class_="btn--launcher")

    # Questions and answers of this page, built side by side so that each
    # answer has exactly one question and the output columns stay aligned.
    page_questions = []
    para_elements = []

    if doc_heading:
        for section in doc_heading:
            # One question per section: the section's <h2> text. Several <h2>s
            # are joined; a section without any gets the " " placeholder.
            titles = [h2.get_text(strip=True) for h2 in section.find_all("h2")]
            page_questions.append(" / ".join(titles) if titles else " ")

            # Concatenate the text of the following classed <div>s until the
            # next section, the 'col-md-4' column, or the end of the document.
            merged_content = ''
            content = section.find_next('div', class_=True)
            while content is not None:
                content_class = content['class']
                if (not content_class
                        or 'paragraph--type--content-section' in content_class
                        or 'col-md-4' in content_class):
                    break
                merged_content += content.get_text(separator=" ") + " "
                content = content.find_next('div', class_=True)

            para_elements.append(merged_content)

    # If there are no headed sections, take the plain content blocks and use
    # the " " placeholder as their question
    else:
        for content in doc.find_all("div", plain_content_class):
            para_elements.append(content.get_text(separator=" ") + " ")
            page_questions.append(" ")

    # Link text -> target for every link of the page (not only the links inside
    # a given section). Skipped: links without visible text (replacing "" would
    # inject the URL between every character), links without href, and repeated
    # link texts (the first target wins).
    link_targets = {}
    for link in doc.find_all("a"):
        link_text = link.get_text()
        link_url = link.get("href")
        if link_text.strip() and link_url and link_text not in link_targets:
            link_targets[link_text] = link_url

    # A single regex pass annotates each occurrence once and never rescans the
    # URLs it has just inserted. Longest texts come first so that a link text
    # containing another link text is matched as a whole.
    link_pattern = None
    if link_targets:
        link_pattern = re.compile("|".join(
            re.escape(text) for text in sorted(link_targets, key=len, reverse=True)
        ))

    # Appending the questions and the link-annotated paragraphs as rows
    for question, paragraph in zip(page_questions, para_elements):
        if link_pattern is not None:
            paragraph = link_pattern.sub(
                lambda match: f"{match.group(0)}({link_targets[match.group(0)]})",
                paragraph,
            )
        # Keep only what follows the first newline, when there is one
        para_lst.append(paragraph.split('\n', 1)[1] if '\n' in paragraph else paragraph)
        que_lst.append(question)
        heading.append(page_heading)
        urls.append(url)

    # If documents are present on the page, add one row per document with its
    # link. The title is taken from the <h3 class="document__title"> at the
    # same position.
    for index, div in enumerate(doc_div):
        span = doc_title[index].find('span')
        para_lst.append(span.get_text(strip=True) + " (" + div.find('a')['href'] + ")")
        que_lst.append("DOCUMENT")
        heading.append(page_heading)
        urls.append(url)

    # If button links are present, add one row per button with its link
    for div in btn_div:
        para_lst.append(div.get_text(strip=True) + " (" + div.find('a')['href'] + ")")
        que_lst.append("BUTTON LINK")
        heading.append(page_heading)
        urls.append(url)

In [7]:
# There are pages like "Land Transaction Tax for professionals" and "Tax collection and management: technical guidance" which have sub-links
# Taking the URLs which have sub-links inside them
nested_url = ['https://www.gov.wales/land-transaction-tax-professionals', 'https://www.gov.wales/tax-collection-and-management-technical-guidance']

# Parallel lists: entry k of each list describes the same sub-link
sub_heading_url = []
sub_heading_name = []
main_headings = []

collection_class = "paragraph paragraph--type--collection-section paragraph--view-mode--default collection"

# Scraping the URLs to get the sub-link URLs
for url in nested_url:
    doc = scrape_page(url)
    if doc is None:
        # scrape_page returns None for non-200 responses; there is nothing to parse
        print(f"Skipping {url}: page could not be downloaded")
        continue

    # Page heading (first text node of the <h1>, empty if absent)
    h1 = doc.find("h1", class_='page-header__title page-header__title--has-type')
    page_heading = (h1.find(string=True) if h1 is not None else None) or ""

    for h2_div in doc.find_all("div", class_=collection_class):
        h2_tag = h2_div.find("h2")
        group_name = h2_tag.get_text(strip=True) if h2_tag is not None else ""

        # Walk the following classed <div>s until the next collection section,
        # the 'col-md-4' column, or the end of the document.
        content = h2_div.find_next("div", class_=True)
        while content is not None:
            content_class = content['class']
            if (not content_class
                    or 'paragraph--type--collection-section' in content_class
                    or 'col-md-4' in content_class):
                break

            if 'index-list__title' in content_class:
                anchor = content.find('a', href=True)
                label = content.find('span')
                # Record the entry only when both parts exist, so the three
                # lists always grow together.
                if anchor is not None and label is not None:
                    sub_heading_url.append(anchor['href'])
                    main_headings.append(page_heading + " / " + group_name)
                    sub_heading_name.append(label.get_text(strip=True))

            content = content.find_next('div', class_=True)


In [8]:
# NOTE: this cell appends to heading / que_lst / para_lst / urls, which are
# created in an earlier cell. Re-running it on its own adds the same rows again.

# Class strings used to locate the relevant <div>s of a page
section_class = "paragraph paragraph--type--content-section paragraph--view-mode--default"
plain_content_class = "paragraph paragraph--type--content paragraph--view-mode--default"

# sub_heading_url, main_headings and sub_heading_name are parallel lists, so
# they are iterated together instead of through a hand-maintained counter.
for url, main_heading, sub_name in zip(sub_heading_url, main_headings, sub_heading_name):
    full_heading = main_heading + " / " + sub_name

    # Calling the scrape_page function to scrape the page
    doc = scrape_page(url)
    if doc is None:
        # scrape_page returns None for non-200 responses; there is nothing to parse
        print(f"Skipping {url}: page could not be downloaded")
        continue

    # Getting the headed sections
    doc_heading = doc.find_all("div", section_class)

    # Getting documents
    doc_div = doc.find_all("div", class_="document--accessible document")
    doc_title = doc.find_all("h3", class_="document__title")

    # Questions and answers of this page, built side by side so that each
    # answer has exactly one question and the output columns stay aligned.
    page_questions = []
    para_elements = []

    if doc_heading:
        for section in doc_heading:
            # One question per section: the section's <h2> text. Several <h2>s
            # are joined; a section without any gets the " " placeholder.
            titles = [h2.get_text(strip=True) for h2 in section.find_all("h2")]
            page_questions.append(" / ".join(titles) if titles else " ")

            # Concatenate the text of the following classed <div>s until the
            # next section, the 'col-md-4' column, or the end of the document.
            merged_content = ''
            content = section.find_next('div', class_=True)
            while content is not None:
                content_class = content['class']
                if (not content_class
                        or 'paragraph--type--content-section' in content_class
                        or 'col-md-4' in content_class):
                    break
                merged_content += content.get_text(separator=" ") + " "
                content = content.find_next('div', class_=True)

            para_elements.append(merged_content)

    # If there are no headed sections, take the plain content blocks and use
    # the " " placeholder as their question
    else:
        for content in doc.find_all("div", plain_content_class):
            para_elements.append(content.get_text(separator=" ") + " ")
            page_questions.append(" ")

    # Link text -> target for every link of the page (not only the links inside
    # a given section). Skipped: links without visible text (replacing "" would
    # inject the URL between every character), links without href, and repeated
    # link texts (the first target wins).
    link_targets = {}
    for link in doc.find_all("a"):
        link_text = link.get_text()
        link_url = link.get("href")
        if link_text.strip() and link_url and link_text not in link_targets:
            link_targets[link_text] = link_url

    # A single regex pass annotates each occurrence once and never rescans the
    # URLs it has just inserted. Longest texts come first so that a link text
    # containing another link text is matched as a whole.
    link_pattern = None
    if link_targets:
        link_pattern = re.compile("|".join(
            re.escape(text) for text in sorted(link_targets, key=len, reverse=True)
        ))

    # Appending the questions and the link-annotated paragraphs as rows
    for question, paragraph in zip(page_questions, para_elements):
        if link_pattern is not None:
            paragraph = link_pattern.sub(
                lambda match: f"{match.group(0)}({link_targets[match.group(0)]})",
                paragraph,
            )
        # Keep only what follows the first newline, when there is one
        para_lst.append(paragraph.split('\n', 1)[1] if '\n' in paragraph else paragraph)
        que_lst.append(question)
        heading.append(full_heading)
        urls.append(url)

    # If documents are present on the page, add one row per document with its
    # link. The title is taken from the <h3 class="document__title"> at the
    # same position.
    for index, div in enumerate(doc_div):
        span = doc_title[index].find('span')
        para_lst.append(span.get_text(strip=True) + " (" + div.find('a')['href'] + ") ")
        que_lst.append("DOCUMENT")
        heading.append(full_heading)
        urls.append(url)


In [9]:
# Do not truncate long cell contents when a frame is displayed
pd.set_option('display.max_colwidth', None)

# Assemble the scraped lists into a question-answer table, then expose the
# positional index as an explicit 'Index' column.
df = (
    pd.DataFrame({'Heading': heading, 'Questions': que_lst, 'Answers': para_lst, 'URLs': urls})
    .reset_index()
    .rename(columns={'index': 'Index'})
)

# Persist the table as a CSV file
df.to_csv("General_Guidance.csv")
