# Description:
#### This project demonstrates an advanced approach to web scraping using Python, focusing on efficiently navigating complex web structures with nested links. By implementing recursion and depth control, the script systematically explores the website, extracts structured data from relevant pages, and avoids redundant visits to previously scraped URLs. 

## Inspected Script

In [3]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
# Download the NI index page.
# NOTE: despite its name, `base_url` ends up holding the page's HTML text
# (not the URL); the following cell feeds it to BeautifulSoup.
response = requests.get("https://www.fallingrain.com/world/NI/", timeout=30)
response.raise_for_status()  # stop here rather than parse an HTTP error page
base_url = response.text

In [None]:
# Parse the downloaded HTML (stored in `base_url`) with the lxml parser.
soup = BeautifulSoup(base_url, features='lxml')

In [None]:
# Dump the indented parse tree so the page structure can be inspected by eye.
pretty_page = soup.prettify()
print(pretty_page)

In [None]:
# Find the first <ul> on the page, which is where the links are expected.
state_list = soup.find('ul')

# find() returns None when the page has no <ul>; report it instead of
# failing with an AttributeError on .prettify().
if state_list is None:
    print("No <ul> element found on the page.")
else:
    print(state_list.prettify())

In [None]:
# URL of the page `soup` was parsed from. Relative hrefs must be resolved
# against this URL; the `base_url` variable holds the page's HTML text and is
# therefore not usable as a base for urljoin().
index_url = "https://www.fallingrain.com/world/NI/"

# Find the list containing the links
state_list = soup.find('ul')

# Find all the links within the list (none if the page has no <ul>)
links = state_list.find_all('a', href=True) if state_list is not None else []

# Extract absolute URLs and link texts
extracted_links = [
    {'href': urljoin(index_url, link['href']), 'text': link.text.strip()}
    for link in links
]

# Print the extracted links
for link in extracted_links:
    print(f"Link: {link['href']} - Text: {link['text']}")


#### The second URL

In [None]:
# Second-level page to inspect
second_url = "https://www.fallingrain.com/world/NI/57/"

# Fetch the page; the timeout keeps the cell from hanging on a stalled
# connection and raise_for_status() avoids parsing an HTTP error page.
response = requests.get(second_url, timeout=30)
response.raise_for_status()
source = response.text
soup2 = BeautifulSoup(source, 'lxml')

In [None]:
# find() with no filter returns the first tag in the parse tree (normally the
# root <html> element), so this prints the whole page rather than one list.
state = soup2.find()

second_page_dump = state.prettify()
print(second_page_dump)

In [None]:
# Collect every anchor on the second-level page that carries an href
alphabetical_links = soup2.find_all('a', href=True)

# Resolve each href against the page URL and keep the visible link text
extracted_links = []
for anchor in alphabetical_links:
    extracted_links.append({
        'href': urljoin(second_url, anchor['href']),
        'text': anchor.text.strip(),
    })

# Show what was collected
for link in extracted_links:
    print(f"Link: {link['href']} - Text: {link['text']}")

In [None]:
#### 3rd nested URL

In [None]:
# Third-level page to inspect
third_url = "https://www.fallingrain.com/world/NI/57/a/Z"

# Fetch the page; the timeout keeps the cell from hanging on a stalled
# connection and raise_for_status() avoids parsing an HTTP error page.
response = requests.get(third_url, timeout=30)
response.raise_for_status()
source = response.text
soup3 = BeautifulSoup(source, 'lxml')

In [None]:
# find() with no filter returns the first tag in the parse tree (normally the
# root <html> element), so this prints the whole page rather than one table.
table = soup3.find()

third_page_dump = table.prettify()
print(third_page_dump)

In [None]:
#### 4th URL

In [None]:
# Fourth-level page to inspect
fouth_url = "https://www.fallingrain.com/world/NI/32/a/A/"

# Fetch the page; the timeout keeps the cell from hanging on a stalled
# connection and raise_for_status() avoids parsing an HTTP error page.
response = requests.get(fouth_url, timeout=30)
response.raise_for_status()
source = response.text
soup4 = BeautifulSoup(source, 'lxml')

In [None]:
# find() with no filter returns the first tag in the parse tree (normally the
# root <html> element), so this prints the whole page rather than one list.
url4 = soup4.find()

fourth_page_dump = url4.prettify()
print(fourth_page_dump)

In [None]:


# Page to inspect
fouth_url = "https://www.fallingrain.com/world/NI/32/a/A/"

# Fetch the page; the timeout keeps the cell from hanging on a stalled
# connection and raise_for_status() avoids parsing an HTTP error page.
response = requests.get(fouth_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Find all 'a' tags that carry an href
links = soup.find_all('a', href=True)

# Keep only root-relative hrefs (those starting with '/'), resolve them
# against the page URL and record the link text alongside
extracted_links = [
    {'href': urljoin(fouth_url, link['href']), 'text': link.text.strip()}
    for link in links
    if link['href'].startswith('/')
]

# Print extracted links
for link in extracted_links:
    print(f"Link: {link['href']} - Text: {link['text']}")

# The extracted_links list now contains the kept links with their text


## Script 1: This function scrapes the data and saves each table as an individual CSV file in a folder

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import os
from datetime import datetime

def scrape_fallingrain(base_url, max_depth=2, output_dir="scraped_data", timeout=30):
    """
    Crawl the pages linked from an index page and save every table found to
    its own CSV file.

    The first ``<ul>`` on ``base_url`` supplies the starting links. Each one is
    crawled recursively: once the crawl is within one level of ``max_depth``
    the first ``<table>`` on a page is extracted and that branch stops; pages
    without a table have their links followed until ``max_depth`` is reached.
    Every URL is fetched at most once.

    :param base_url: URL of the index page to start from.
    :param max_depth: Maximum recursion depth below each starting link.
    :param output_dir: Directory the CSV files are written to (created if missing).
    :param timeout: Seconds to wait for each HTTP request before giving up.
    :return: None. Progress is printed and the results are written to disk.
    """
    import urllib3

    # NOTE(review): requests below are made with verify=False, i.e. TLS
    # certificates are not validated. Kept as in the original; the warning
    # that urllib3 would emit for every request is silenced here.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Navigation links that lead back up the site and must not be crawled.
    skipped_links = {
        'https://www.fallingrain.com/world/index.html',
        'https://www.fallingrain.com/world/NI/index.html',
    }
    all_table_data = []  # One {"url", "headers", "rows"} dict per table
    visited_links = set()  # URLs already fetched

    def fetch_soup(url):
        """Download ``url`` and return it parsed; raises RequestException on failure."""
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()  # treat HTTP error pages as failures
        return BeautifulSoup(response.text, 'lxml')

    def extract_table(table):
        """Return ``(headers, rows)`` with every row exactly as wide as the headers."""
        headers = [header.text.strip() for header in table.find_all('th')]
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            # Skip empty rows and the header row itself, which would otherwise
            # be written to the CSV a second time as data.
            if not cells or (headers and cells == headers):
                continue
            rows.append(cells)

        if not rows:
            return headers, rows

        if not headers:
            # No <th> cells: invent column names so the DataFrame can be built.
            widest = max(len(cells) for cells in rows)
            headers = [f"column_{position + 1}" for position in range(widest)]

        # Pad short rows with empty strings and trim long ones. New lists are
        # built because rebinding the loop variable would leave rows unchanged.
        width = len(headers)
        rows = [cells[:width] + [''] * (width - len(cells)) for cells in rows]
        return headers, rows

    def scrape_page(url, depth=0):
        """
        Scrape a URL for a table, or follow its links if no table is found.
        :param url: The URL to scrape.
        :param depth: Current recursion depth.
        """
        if url in visited_links:
            print(f"Skipping already visited URL: {url}")
            return

        visited_links.add(url)
        print(f"Scraping URL at depth {depth}: {url}")

        try:
            soup = fetch_soup(url)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while scraping {url}: {e}")
            return

        # Tables are only looked for in the last two levels of the crawl.
        if depth >= max_depth - 1:
            table = soup.find('table')
            if table is not None:
                print(f"Table found at depth {depth}:")
                headers, rows = extract_table(table)
                print(f"Extracted headers: {headers}")
                print(f"Number of rows: {len(rows)}")
                if rows:
                    all_table_data.append({"url": url, "headers": headers, "rows": rows})
                return  # A table ends this branch of the crawl

        print("No table found. Extracting links to continue scraping.")

        extracted_links = [urljoin(url, link['href']) for link in soup.find_all('a', href=True)]
        print(f"Found {len(extracted_links)} links at depth {depth}.")
        for link in extracted_links:
            print(f"Link: {link}")

        if depth < max_depth:
            for link in extracted_links:
                if link in skipped_links:
                    continue
                scrape_page(link, depth=depth + 1)

    try:
        # Step 1: Fetch and parse the index page
        soup = fetch_soup(base_url)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
    else:
        # Step 2: The first <ul> holds the links to start from
        state_list = soup.find('ul')
        if state_list is None:
            print("State list not found.")
            return

        extracted_links = [urljoin(base_url, link['href']) for link in state_list.find_all('a', href=True)]

        print("Extracted links from base URL:")
        for link in extracted_links:
            print(f"Link: {link}")

        for link in extracted_links:
            print(f"Starting scraping for link: {link}")
            scrape_page(link, depth=0)

    if not all_table_data:
        print("No tables were found. No CSV files were created.")
        return

    # Save each table to a separate CSV file
    os.makedirs(output_dir, exist_ok=True)
    for index, table_data in enumerate(all_table_data, start=1):
        df = pd.DataFrame(table_data["rows"], columns=table_data["headers"])

        # The running index keeps names unique even within the same second
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{output_dir}/table_{index}_{timestamp}.csv"
        df.to_csv(filename, index=False)

        print(f"Saved table from {table_data['url']} to '{filename}'")

# Usage: crawl from the NI index page. Tables that are found are written as
# individual CSV files under the "scraped_data" directory.
base_url = "https://www.fallingrain.com/world/NI/"
scrape_fallingrain(base_url)


Extracted links from base URL:
Link: https://www.fallingrain.com/world/NI/00/
Link: https://www.fallingrain.com/world/NI/05/
Link: https://www.fallingrain.com/world/NI/11/
Link: https://www.fallingrain.com/world/NI/16/
Link: https://www.fallingrain.com/world/NI/21/
Link: https://www.fallingrain.com/world/NI/22/
Link: https://www.fallingrain.com/world/NI/23/
Link: https://www.fallingrain.com/world/NI/24/
Link: https://www.fallingrain.com/world/NI/25/
Link: https://www.fallingrain.com/world/NI/26/
Link: https://www.fallingrain.com/world/NI/27/
Link: https://www.fallingrain.com/world/NI/28/
Link: https://www.fallingrain.com/world/NI/29/
Link: https://www.fallingrain.com/world/NI/30/
Link: https://www.fallingrain.com/world/NI/31/
Link: https://www.fallingrain.com/world/NI/32/
Link: https://www.fallingrain.com/world/NI/35/
Link: https://www.fallingrain.com/world/NI/36/
Link: https://www.fallingrain.com/world/NI/37/
Link: https://www.fallingrain.com/world/NI/39/
Link: https://www.fallingrain

KeyboardInterrupt: 

## Script 2: This function saves the scraped data in a single consolidated CSV file

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import os
from datetime import datetime

def scrape_fallingrain(base_url, max_depth=2, timeout=30):
    """
    Crawl the pages linked from an index page and merge every table found
    into one CSV file.

    The first ``<ul>`` on ``base_url`` supplies the starting links. Each one is
    crawled recursively: once the crawl is within one level of ``max_depth``
    the first ``<table>`` on a page is extracted and that branch stops; pages
    without a table have their links followed until ``max_depth`` is reached.
    Every URL is fetched at most once. The merged file has one column per
    distinct header name (sorted alphabetically); cells a table does not
    provide are left empty.

    :param base_url: URL of the index page to start from.
    :param max_depth: Maximum recursion depth below each starting link.
    :param timeout: Seconds to wait for each HTTP request before giving up.
    :return: None. Progress is printed and ``merged_data_<timestamp>.csv`` is
        written to the current working directory when tables were found.
    """
    import urllib3

    # NOTE(review): requests below are made with verify=False, i.e. TLS
    # certificates are not validated. Kept as in the original; the warning
    # that urllib3 would emit for every request is silenced here.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Navigation links that lead back up the site. Without this filter the
    # crawl reaches the world index and can merge tables from unrelated pages.
    skipped_links = {
        'https://www.fallingrain.com/world/index.html',
        'https://www.fallingrain.com/world/NI/index.html',
    }
    all_table_data = []  # One {"url", "headers", "rows"} dict per table
    visited_links = set()  # URLs already fetched

    def fetch_soup(url):
        """Download ``url`` and return it parsed; raises RequestException on failure."""
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()  # treat HTTP error pages as failures
        return BeautifulSoup(response.text, 'lxml')

    def extract_table(table):
        """Return ``(headers, rows)`` for ``table``, without the header row in ``rows``."""
        headers = [header.text.strip() for header in table.find_all('th')]
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            # Skip empty rows and the header row itself, which would otherwise
            # be repeated as a data row for every merged table.
            if not cells or (headers and cells == headers):
                continue
            rows.append(cells)

        if rows and not headers:
            # No <th> cells: invent column names, otherwise the merge below
            # would map every cell of this table to nothing.
            widest = max(len(cells) for cells in rows)
            headers = [f"column_{position + 1}" for position in range(widest)]
        return headers, rows

    def scrape_page(url, depth=0):
        """Extract the table on ``url`` or, failing that, follow its links."""
        if url in visited_links:
            print(f"[INFO] Skipping already visited URL: {url}")
            return

        visited_links.add(url)
        print(f"[INFO] Scraping URL at depth {depth}: {url}")

        try:
            soup = fetch_soup(url)
        except requests.exceptions.RequestException as e:
            print(f"[ERROR] Error scraping {url}: {e}")
            return

        # Tables are only looked for in the last two levels of the crawl.
        if depth >= max_depth - 1:
            table = soup.find('table')
            if table is not None:
                headers, rows = extract_table(table)
                if rows:
                    all_table_data.append({"url": url, "headers": headers, "rows": rows})
                    print(f"[INFO] Table found and data extracted from: {url}")
                return  # A table ends this branch of the crawl

        links = [urljoin(url, link['href']) for link in soup.find_all('a', href=True)]
        print(f"[INFO] Found {len(links)} links on page: {url}")

        if depth < max_depth:
            for link in links:
                if link in skipped_links:
                    continue
                scrape_page(link, depth=depth + 1)

    # Step 1: Scrape from the base URL
    print("[INFO] Starting scraping from base URL...")
    try:
        soup = fetch_soup(base_url)
    except requests.exceptions.RequestException as e:
        print(f"[ERROR] Error accessing base URL {base_url}: {e}")
    else:
        state_list = soup.find('ul')
        if state_list is None:
            print("[WARNING] State list not found on base URL.")
            return

        links = [urljoin(base_url, link['href']) for link in state_list.find_all('a', href=True)]
        print(f"[INFO] Extracted {len(links)} links from base URL.")

        for link in links:
            scrape_page(link, depth=0)

    # Step 2: Merge all tables into a single DataFrame
    if not all_table_data:
        print("[INFO] No tables were found. No merged data was created.")
        return

    print("[INFO] Merging all extracted tables into a single DataFrame...")
    # Union of every header seen, sorted for a consistent column order
    global_headers = sorted({
        header
        for table_data in all_table_data
        for header in table_data["headers"]
    })

    merged_rows = []
    table_count = len(all_table_data)
    for table_number, table_data in enumerate(all_table_data, start=1):
        # Position of each header inside this particular table
        column_of = {header: position for position, header in enumerate(table_data["headers"])}

        for row in table_data["rows"]:
            merged_rows.append([
                row[column_of[header]]
                if header in column_of and column_of[header] < len(row)
                else ""
                for header in global_headers
            ])
        print(f"[INFO] Processed table {table_number}/{table_count} from URL: {table_data['url']}")

    merged_df = pd.DataFrame(merged_rows, columns=global_headers)

    # Save to a single CSV file
    output_file = f"merged_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    merged_df.to_csv(output_file, index=False)
    print(f"[INFO] Saved merged data to {output_file}")

# Usage: crawl from the NI index page. When tables are found they are merged
# into a single "merged_data_<timestamp>.csv" file in the working directory.
base_url = "https://www.fallingrain.com/world/NI/"
scrape_fallingrain(base_url)
