In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Folder holding the saved HTML pages; the processing loop below reads every
# file in it whose name ends in '.html' or '.htm'.
folder_path = '/content/Untitled Folder'

# Function to process each HTML file and extract desired information
def process_html_file(file_path):
    """Extract company profile fields from one saved HTML page.

    Args:
        file_path: Path of the HTML file to read (decoded as UTF-8).

    Returns:
        A dict with the keys 'crunchbase_url', 'Company URL',
        'Company Name', 'Description', 'Year Founded', 'employee_count'
        and 'Industry'. A field is None when the element (or the attribute
        read from it) is not present in the page. Returns None, after
        printing a message, if any exception is raised while reading or
        parsing the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Description text
        description_element = soup.find('span', class_='description')
        about_text = description_element.get_text() if description_element else None

        # Text of the first <a> whose class attribute is exactly this string
        # (a multi-word class_ value is matched against the full attribute).
        employee_element = soup.find(
            'a',
            class_='component--field-formatter field-type-enum accent highlight-color-contrast-light ng-star-inserted',
        )
        extracted_text = employee_element.get_text(strip=True) if employee_element else None

        # All formatter links, collected once: the first supplies
        # 'crunchbase_url', the one at position_to_extract supplies
        # 'Company URL'.
        formatter_links = soup.find_all('a', class_='component--field-formatter')
        position_to_extract = 3

        # Tag.get() yields None for a missing attribute. Subscripting would
        # raise KeyError and the handler below would then discard the whole
        # record because of one absent attribute.
        href_value = formatter_links[0].get('href') if formatter_links else None
        extracted_url = (
            formatter_links[position_to_extract].get('href')
            if len(formatter_links) > position_to_extract
            else None
        )

        # Industry values from <chips-container>, joined into one string
        chips_container = soup.find('chips-container')
        if chips_container:
            chip_texts = chips_container.find_all('div', class_='chip-text')
            industry_values_str = ', '.join(
                chip_text.get_text(strip=True) for chip_text in chip_texts
            )
        else:
            industry_values_str = None

        # Profile name from <h1 class="profile-name">
        profile_name_element = soup.find('h1', class_='profile-name')
        profile_name = profile_name_element.get_text() if profile_name_element else None

        # Founded date is read from the 'title' attribute of the date <span>
        founded_date_element = soup.find(
            'span',
            class_='component--field-formatter field-type-date_precision ng-star-inserted',
        )
        founded_date = founded_date_element.get('title') if founded_date_element else None

        return {
            'crunchbase_url': href_value,
            'Company URL': extracted_url,
            'Company Name': profile_name,
            'Description': about_text,
            'Year Founded': founded_date,
            'employee_count': extracted_text,
            'Industry': industry_values_str,
        }

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # skip this file instead of aborting the whole run.
        print(f"An error occurred while processing {file_path}: {e}")
        return None

# Records extracted from the HTML files, one dict per successfully parsed file
data_list = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the resulting DataFrame (and of anything exported from it) reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(('.html', '.htm')):
        continue
    file_path = os.path.join(folder_path, file_name)
    extracted_data = process_html_file(file_path)
    # process_html_file returns None for a file it could not process
    if extracted_data:
        data_list.append(extracted_data)

# Create a DataFrame from the processed data
df = pd.DataFrame(data_list)

# Display the DataFrame
print(df)


In [None]:
# Destination of the CSV export
csv_file_path = "/content/crunchbase.csv"

# Write the DataFrame to disk, leaving out the index column
df.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print(f"CSV file saved successfully at: {csv_file_path}")

In [None]:
import pandas as pd

# Load the extracted data from the CSV file
input_file = '/content/crunchbase.csv'
df = pd.read_csv(input_file)


def _collapse_whitespace(value):
    """Return *value* with every whitespace run reduced to a single space.

    Leading and trailing whitespace is dropped as well. Non-string values
    are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return ' '.join(value.split())


# Normalise the description text (extra spaces and line breaks removed)
df['Description'] = df['Description'].apply(_collapse_whitespace)

# Save the cleaned DataFrame to an Excel workbook
output_file_cleaned = 'crunchbase_cleaned.xlsx'
df.to_excel(output_file_cleaned, index=False)

# Display the path to the cleaned output file
print(f"Cleaned data has been saved to '{output_file_cleaned}'.")

# Display the first few rows of the cleaned DataFrame
print(df.head())
