# Web Scraper Project
## Use it to scrape data from a website (text, URLs, and table data)

This small Web Scraper Project is a versatile web scraping tool leveraging Selenium for browser automation and Pandas for data manipulation. Users input the target website's URL, optional XPath for a cookie acceptance button, and the desired number of pages to scrape. The script initializes a Chrome WebDriver, handles cookie acceptance if applicable, and manages CSV file creation and encoding detection. Users can choose between scraping individual page elements or entire tables, with the script dynamically adapting to user-defined XPaths and providing summary reports of the scraping process. This interactive tool empowers users to efficiently collect structured data from websites, making it adaptable for a range of web scraping scenarios.


In [None]:
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
from datetime import datetime
import os
import chardet

def detect_encoding(file_path):
    """Guess the character encoding of a file from its raw bytes.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode
            and read in full.

    Returns:
        The ``'encoding'`` entry of the result returned by ``chardet.detect``.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

def read_file(file_path, encoding):
    """Return the whole content of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Encoding name passed straight to ``open``.

    Returns:
        The decoded file content as a single string.
    """
    with open(file_path, mode='r', encoding=encoding) as text_file:
        return text_file.read()

def create_csv_file(file_path):
    """Create an empty UTF-8 file at ``file_path``.

    The file is opened in write mode, so any existing content is truncated.

    Args:
        file_path: Path of the file to create.
    """
    # Opening for writing and closing immediately leaves a zero-byte file.
    empty_csv = open(file_path, 'w', newline='', encoding='utf-8')
    empty_csv.close()

def get_element_data():
    """Interactively collect the elements the user wants to scrape.

    For every element the user is asked for a name, a selector type
    (1 = XPath, 2 = CSS selector), the selector itself and a data type
    (1 = text, 2 = URL). Entering 'q' as the element name ends the dialogue.

    Returns:
        A list of ``(element_name, selector_type, selector, data_type)``
        tuples in the order they were entered; empty if the user quits
        immediately.
    """
    element_data = []
    # A single loop is enough: one 'q' finishes the dialogue, as the prompt says.
    while True:
        element_name = input("Enter the name of the element (e.g., 'titles', 'location', 'salary', 'contract_type', 'job_details') or type 'q' to finish: ")
        if element_name == 'q':
            break
        selector_type = _prompt_choice(
            f"Enter the selector type for the '{element_name}' element:\n1 for XPath\n2 for CSS selector: ",
            (1, 2),
        )
        selector = input(f"Enter the selector for the '{element_name}' element: ")
        data_type = _prompt_choice(
            f"Enter the data type for the '{element_name}' element:\n1 for text\n2 for URL: ",
            (1, 2),
        )
        element_data.append((element_name, selector_type, selector, data_type))

    return element_data


def _prompt_choice(prompt, valid_choices):
    """Ask with ``prompt`` until the user enters one of the integers in ``valid_choices``."""
    while True:
        raw_answer = input(prompt)
        try:
            choice = int(raw_answer)
        except ValueError:
            choice = None
        if choice in valid_choices:
            return choice
        # Re-prompt rather than crash or store a code the scraper cannot handle.
        allowed = ", ".join(str(value) for value in valid_choices)
        print(f"Invalid choice '{raw_answer}'. Please enter one of: {allowed}.")

def scrape_table(table_selector, csv_filename):
    """Scrape one HTML table from the current page into a CSV file.

    Uses the module-level ``driver``. The first ``<tr>`` of the table is
    treated as the header row and every following ``<tr>`` as a data row.
    ``csv_filename`` is opened in write mode, so existing content is replaced.
    If the table (or any row in it) cannot be found, a message is printed and
    the file is left untouched.

    Args:
        table_selector: XPath locating the table element.
        csv_filename: Path of the CSV file to write.
    """
    try:
        table = driver.find_element("xpath", table_selector)
    except NoSuchElementException:
        print("Table not found on the page.")
        return

    # Collect all rows in document order and slice in Python. The XPath
    # predicate ``tr[position()>1]`` is evaluated per parent, so it would drop
    # the first <tbody> row whenever the header lives in a separate <thead>.
    all_rows = table.find_elements("xpath", ".//tr")
    if not all_rows:
        print("Table not found on the page.")
        return

    header_row, body_rows = all_rows[0], all_rows[1:]
    # Some tables mark up their header row with <td> instead of <th>.
    header_cells = (
        header_row.find_elements("xpath", ".//th")
        or header_row.find_elements("xpath", ".//td")
    )
    header_data = [cell.text for cell in header_cells]

    # Write as UTF-8 so the file matches the encoding used to read it back
    # and non-ASCII cell text does not depend on the platform default.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)

        # Write the table headers to the CSV file
        csv_writer.writerow(header_data)

        for row in body_rows:
            cells = row.find_elements("xpath", ".//td")
            csv_writer.writerow([cell.text for cell in cells])
    print(f"Table data and headers have been scraped and saved to '{csv_filename}'")
        
        
def write_element_data_to_csv(element_data, csv_filename):
    """Scrape the configured elements page by page and append them to a CSV file.

    Relies on the module-level ``driver``, ``page_count``,
    ``next_button_selector_type`` and ``next_button_selector``. A header row
    with the element names is appended first. For each page, one column of
    values is collected per element and the columns are written row-wise
    (``zip`` stops at the shortest column). When ``page_count`` is not 1 the
    'Next' button is clicked after each page, followed by a 5 second pause.
    Scraping stops early if that button cannot be found.

    Args:
        element_data: Iterable of ``(element_name, selector_type, selector,
            data_type)`` tuples; ``selector_type`` is 1 (XPath) or 2 (CSS
            selector), ``data_type`` is 1 (text) or 2 (URL).
        csv_filename: Path of the CSV file to append to.

    Raises:
        ValueError: If a selector type or data type is not one of the
            supported codes. Raised before the file is opened.
    """
    locator_strategies = {1: "xpath", 2: "css selector"}

    # Validate before touching the file. An unknown code would otherwise
    # reuse the previous element's matches or shift columns under the wrong
    # header.
    for element_name, selector_type, _selector, data_type in element_data:
        if selector_type not in locator_strategies:
            raise ValueError(
                f"Unsupported selector type {selector_type!r} for element "
                f"'{element_name}' (expected 1 for XPath or 2 for CSS selector)."
            )
        if data_type not in (1, 2):
            raise ValueError(
                f"Unsupported data type {data_type!r} for element "
                f"'{element_name}' (expected 1 for text or 2 for URL)."
            )
    if page_count != 1 and next_button_selector_type not in locator_strategies:
        raise ValueError(
            f"Unsupported selector type {next_button_selector_type!r} for the "
            "'Next' button (expected 1 for XPath or 2 for CSS selector)."
        )

    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow([element[0] for element in element_data])

        for _ in range(page_count):
            columns = []
            for element_name, selector_type, selector, data_type in element_data:
                elements = driver.find_elements(locator_strategies[selector_type], selector)
                if data_type == 1:
                    columns.append([element.text for element in elements])
                else:
                    # Only anchors carry a link; other tags yield an empty cell.
                    columns.append([
                        element.get_attribute('href') if element.tag_name == 'a' else ''
                        for element in elements
                    ])

            csv_writer.writerows(zip(*columns))

            if page_count != 1:
                try:
                    next_button = driver.find_element(
                        locator_strategies[next_button_selector_type],
                        next_button_selector,
                    )
                    next_button.click()
                except NoSuchElementException:
                    # No further page available: stop paginating.
                    break
                # Give the next page time to load before scraping it.
                time.sleep(5)

    print("Scraping elements completed successfully.")

# Create a WebDriver instance
driver = webdriver.Chrome()

# Everything below runs with a live browser. If setup fails (bad URL,
# interrupted prompt, unreadable file) the browser is closed before the error
# propagates, so no orphaned Chrome process is left behind.
try:
    url = input("Enter the URL of the website you want to scrape: ")
    driver.get(url)
    cookie_xpath = input("Enter the XPath of the cookie acceptance button or press 'q' to skip: ")

    if cookie_xpath.lower() != 'q':
        try:
            cookie_button = driver.find_element("xpath", cookie_xpath)
            cookie_button.click()
            # Give the page time to settle after the banner is dismissed.
            time.sleep(5)
        except NoSuchElementException:
            print("Cookie acceptance button not found. Skipping...")

    # Re-prompt on non-numeric input instead of crashing the whole session.
    while True:
        try:
            page_count = int(input("Enter the number of pages to scrape: "))
            break
        except ValueError:
            print("Please enter a whole number.")
    if page_count == 1:
        print("Only 1 page to scrape. Skipping the 'Next' button identifiers.")

    # Specify the name of the CSV file
    csv_filename = input("Enter the name of the CSV file (e.g., 'output.csv'): ")

    # Create the CSV file if it doesn't exist
    if not os.path.exists(csv_filename):
        create_csv_file(csv_filename)

    # Detect encoding of the file
    file_encoding = detect_encoding(csv_filename)

    # Read the file using the detected encoding
    file_content = read_file(csv_filename, file_encoding)

    summary_report_filename = "summary_report.csv"
    main_csv_name = csv_filename

    # Start a new summary report with headers if none exists yet; otherwise append.
    summary_report_exists = os.path.isfile(summary_report_filename)
    write_mode = 'w' if not summary_report_exists else 'a'
except BaseException:
    driver.quit()
    raise

# Interactive menu: scrape elements or a table, then record one summary row
# per scrape. The browser is closed when the user quits or an error escapes.
try:
    while True:
        print("Choose an option:")
        print("1 for scraping elements")
        print("2 for scraping a table")
        print("3 to Quit")
        option = input("Enter your choice: ")

        if option == '3':
            break

        if option == '1':
            element_data = get_element_data()

            if page_count == 1:
                # Single page: no pagination, so no 'Next' button is needed.
                next_button_selector_type = 0
                next_button_selector = ""
            else:
                next_button_selector_type = int(input("Enter the selector type for the 'Next' button:\n1 for XPath\n2 for CSS selector: "))
                next_button_selector = input("Enter the selector of the 'Next' button to navigate to the next page: ")

            write_element_data_to_csv(element_data, csv_filename)

        elif option == '2':
            # Ask the user for the table selector
            table_selector = input("Enter the XPath of the table you want to scrape: ")
            scrape_table(table_selector, csv_filename)
            print("Scraping table completed successfully.")

        else:
            print("Invalid choice. Please enter 1, 2 or 3.")
            continue

        # Summary reporting shared by both scraping options
        try:
            df = pd.read_csv(csv_filename, encoding='utf-8')  # Explicitly specify encoding
        except pd.errors.EmptyDataError:
            # Nothing was written (e.g. the table was not found), so there is
            # nothing to summarise.
            print(f"'{csv_filename}' contains no data. Skipping the summary report.")
            continue

        page_count = page_count if page_count != 0 else 1
        column_count = len(df.columns)
        row_count = len(df) + 1  # +1 counts the header line
        total_data_count = len(df.values.ravel())
        now = datetime.now()
        current_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
        current_summary = pd.DataFrame({
            'Date and Time': [current_datetime],
            'Website': [url],
            'Main CSV Name': [main_csv_name],
            'Number of Pages': [page_count],
            'Column Count': [column_count],
            'Row Count': [row_count],
            'Total Data Count': [total_data_count]
        })

        # Re-check on every pass: once the first row has created the report,
        # later rows in the same session must be appended, not overwrite it.
        summary_report_exists = os.path.isfile(summary_report_filename)
        write_mode = 'a' if summary_report_exists else 'w'

        current_summary.to_csv(summary_report_filename, mode=write_mode, header=not summary_report_exists, index=False)

        if not summary_report_exists:
            print(f"'{summary_report_filename}' file has been created with headers.")
        else:
            print("Summary report has been successfully updated.")
finally:
    driver.quit()
