In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os

def _extract_company_facts(soup):
    """Collect the labelled company facts from the page's <dt>/<dd> pairs.

    On these pages the label lives in a <dd> that directly precedes the <dt>
    holding the value. Each fact defaults to the string "None" when its label
    is not found.

    Returns a dict keyed by the page labels 'Company size', 'Total raised',
    'Company type' and 'Markets'.
    """
    facts = {
        'Company size': "None",
        'Total raised': "None",
        'Company type': "None",
        'Markets': "None",
    }

    for dt in soup.find_all('dt'):
        # find_previous_sibling() skips whitespace/text nodes, so the pairing
        # still works when the HTML is pretty-printed.
        label_tag = dt.find_previous_sibling()
        if label_tag is None or label_tag.name != 'dd':
            continue

        label = label_tag.get_text(strip=True)
        if label not in facts:
            continue

        if label == 'Markets':
            # Markets are rendered as one <a> per market inside the <dt>.
            facts[label] = ', '.join(
                anchor.get_text(strip=True) for anchor in dt.find_all('a')
            )
        else:
            facts[label] = dt.get_text(strip=True)

    return facts


def process_html_file(html_file_path):
    """Extract company details from one saved HTML page.

    Args:
        html_file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A dict with the keys "wellfound_url", "company_url", "company_name",
        "description", "employee_count", "Total Raised", "Company Type" and
        "Markets". Elements missing from the page yield None (or the string
        "None" for the four <dt>/<dd> facts) instead of discarding the whole
        file. Returns None when the file cannot be read or parsed; the error
        is printed.
    """
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # NOTE(review): this takes the first <link> in the document, which is
        # presumably the canonical page URL - confirm against real pages.
        link_tag = soup.find('link')
        href_value = link_tag.get('href') if link_tag is not None else None

        # The website URL is the visible text of the "website" button.
        website_button = soup.find('button', class_='styles_websiteLink___Rnfc')
        website_link = (
            website_button.text.strip() if website_button is not None else None
        )

        facts = _extract_company_facts(soup)

        # The heading reads "<name>careers" once get_text(strip=True) has
        # joined its pieces; dropping the word leaves the company name.
        h1_tag = soup.find('h1', class_='text-xl font-medium text-dark-aaaa antialiased mb-4')
        h1_text = h1_tag.get_text(strip=True) if h1_tag is not None else None
        company_name = (
            h1_text.replace('careers', '').strip() if h1_text is not None else None
        )

        # The description is the content div's text, prefixed by the heading.
        cleaned_text = None
        content_div = soup.find('div', class_='styles_content__XhI8z')
        if content_div is not None:
            extracted_text = content_div.get_text(separator=' ', strip=True)
            if h1_text is not None:
                extracted_text = h1_text + ' ' + extracted_text
            if extracted_text:
                # Re-insert the space lost before 'careers', then collapse
                # any run of whitespace into a single space.
                spaced_text = extracted_text.replace('careers', ' careers')
                cleaned_text = ' '.join(spaced_text.split())

        data = {
            "wellfound_url": href_value,
            "company_url": website_link,
            "company_name": company_name,
            "description": cleaned_text,
            "employee_count": facts['Company size'],
            "Total Raised": facts['Total raised'],
            "Company Type": facts['Company type'],
            "Markets": facts['Markets'],
        }

    except Exception as e:
        # Best-effort batch processing: report the failing file and let the
        # caller skip it rather than aborting the whole run.
        print(f"Error processing file {html_file_path}: {e}")
        data = None

    return data

def process_html_files_in_directory(html_directory, extensions=(".htm", ".html")):
    """Parse every HTML file in a directory into one DataFrame.

    Args:
        html_directory: Directory to scan (not recursive).
        extensions: File-name suffix, or iterable of suffixes, that marks a
            file as HTML. Matching is case-insensitive.

    Returns:
        A pandas DataFrame with one row per file for which
        process_html_file() returned data. Files are visited in sorted name
        order so the row order is reproducible. The DataFrame is empty when
        no file qualifies.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extension.lower() for extension in extensions)

    all_data = []
    # os.listdir() returns names in arbitrary order; sort for stable output.
    for html_file in sorted(os.listdir(html_directory)):
        if not html_file.lower().endswith(suffixes):
            continue

        html_file_path = os.path.join(html_directory, html_file)
        # Skip directories (or other non-files) that happen to match a suffix.
        if not os.path.isfile(html_file_path):
            continue

        file_data = process_html_file(html_file_path)
        if file_data is not None:
            all_data.append(file_data)

    return pd.DataFrame(all_data)

# Folder that holds the saved HTML pages to scrape
html_directory = "/content/Untitled Folder"

# Never truncate long cell values when the table is printed
pd.set_option('display.max_colwidth', None)

# Build one row per successfully parsed page and show the result
result_df = process_html_files_in_directory(html_directory)
print(result_df)


In [None]:
# Destination of the exported table
csv_file_path = "output.csv"

# Write the extracted rows to disk, leaving out the DataFrame index column
result_df.to_csv(path_or_buf=csv_file_path, index=False)