In [None]:
from bs4 import BeautifulSoup
import pandas as pd

def extract_card_data(card):
    """Collect the fields of one startup card into a flat dictionary.

    Args:
        card: A BeautifulSoup tag for a single card (any object exposing
            compatible ``find`` / ``find_all`` methods works).

    Returns:
        dict: Always contains ``'industry'`` (comma-separated tag texts,
        ``''`` when there are none). The keys ``'description'``,
        ``'company_size'``, ``'founded_year'``, ``'funding_amount'``,
        ``'top_startup_url'`` and ``'company_name'`` are present only when
        the matching markup is found in the card.
    """
    data = {}

    # Description: the text node that directly follows the <br> after the
    # "What they do: " header. ``string=`` is the current name of the filter
    # formerly spelled ``text=``.
    header = card.find('b', id='card-header', string="What they do: ")
    if header is not None:
        line_break = header.find_next_sibling('br')
        description_node = line_break.next_sibling if line_break is not None else None
        # Only a text node can be stripped; a missing <br>, a missing sibling,
        # or a tag in that position leaves the description out instead of
        # raising.
        if isinstance(description_node, str):
            data['description'] = description_node.strip()

    # Industry tags, flattened into one comma-separated string.
    industry_spans = card.find_all(
        'span', class_='badge rounded-pill bg-success', id='industry-tags'
    )
    data['industry'] = ', '.join(span.text.strip() for span in industry_spans)

    # Company size badge.
    size_span = card.find(
        'span', class_='badge rounded-pill bg-success', id='company-size-tags'
    )
    if size_span is not None:
        data['company_size'] = size_span.text.strip()

    # Founding year: the badge whose text starts with "Founded:".
    founded_span = card.find(
        'span',
        class_='badge rounded-pill bg-success',
        string=lambda text: text and text.startswith('Founded:'),
    )
    if founded_span is not None:
        data['founded_year'] = founded_span.text.replace('Founded: ', '').strip()

    # Funding: the LAST funding badge in the card is the one that is kept.
    funding_spans = card.find_all(
        'span', class_='badge rounded-pill bg-primary', id='funding-tags'
    )
    if funding_spans:
        data['funding_amount'] = funding_spans[-1].text.strip()

    # URL and company name come from the first link inside the title column.
    title_column = card.find('div', class_='col-8 col-md-9')
    if title_column is not None:
        link = title_column.find('a', href=True)
        if link is not None:
            data['top_startup_url'] = link['href']
            data['company_name'] = link.text.strip()

    return data

# Path to the locally saved copy of the Top Startups page
html_file_path = "/content/Top Startups 2024 - Sequoia, Y Combinator, A16Z, Accel (2)(6).html"

# Read the saved HTML as UTF-8 text
with open(html_file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Each startup card is a div with class="col-12 col-md-6 col-xl-4 infinite-item"
items = soup.find_all('div', class_='col-12 col-md-6 col-xl-4 infinite-item')

# One dictionary of extracted fields per card
data_list = [extract_card_data(item) for item in items]

# Create DataFrame from the list of data dictionaries
df = pd.DataFrame(data_list)

# Put the columns in the desired order. reindex() is used instead of
# df[desired_columns] because extract_card_data omits keys it cannot find:
# a column missing from every card (or an empty result) would otherwise
# raise KeyError. Missing columns are created and filled with NaN.
desired_columns = ['top_startup_url', 'company_name', 'description', 'founded_year', 'company_size', 'funding_amount', 'industry']
df = df.reindex(columns=desired_columns)

# Display the DataFrame
print(df)


In [None]:
# Destination of the exported CSV
csv_file_path = "/content/startup_data.csv"

# Write the DataFrame without its row index
df.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print("CSV file saved successfully at: " + csv_file_path)