# Web Scrapping for an entire website and storing in csv file and uploading it in kaggle

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  # progress bar

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

print("🚀 Starting scrape...")

all_data = []

for page in tqdm(range(1, 501), desc="Scraping pages"):
    url = f"https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav&page={page}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')

    # Extract all cards at once
    cards = soup.find_all("div", class_="companyCardWrapper__companyCard")

    for card in cards:
        try:
            name = card.find('h2', class_="companyCardWrapper__companyName")
            name = name.text.strip() if name else None

            rating_block = card.find('div', class_="rating_text rating_text--md")
            rating = None
            if rating_block:
                inner = rating_block.find('div', style="height:auto;padding-bottom:1px;")
                rating = inner.text.strip() if inner else None

            reviews = card.find('span', class_="companyCardWrapper__companyRatingCount")
            reviews = reviews.text.strip() if reviews else None

            desc = card.find_all('span', class_="companyCardWrapper__interLinking")
            if len(desc) > 0:
                desc_text = " | ".join([d.text.strip() for d in desc])
            else:
                desc_text = None

            industry, hq, branches = None, None, None
            if desc_text:
                parts = desc_text.split('|')
                industry = parts[0].strip() if len(parts) > 0 else None
                if len(parts) > 1:
                    sub = parts[1].split('+')
                    hq = sub[0].strip() if len(sub) > 0 else None
                    branches = sub[1].strip() if len(sub) > 1 else None

            tertiary = card.find('div', class_="companyCardWrapper__tertiaryInformation")
            salary = None
            if tertiary:
                actions = tertiary.find_all('a', class_="companyCardWrapper__ActionWrapper")
                if len(actions) > 1:
                    span = actions[1].find('span', class_="companyCardWrapper__ActionCount")
                    salary = span.text.strip() if span else None

            all_data.append({
                'COMPANY NAME': name,
                'RATING': rating,
                'REVIEWS': reviews,
                'INDUSTRY': industry,
                'INDIA HQ': hq,
                'BRANCHES': branches,
                'PACKAGE': salary
            })

        except Exception as e:
            print(f"⚠️ Error parsing a card on page {page}: {e}")

# Build DataFrame safely
data = pd.DataFrame(all_data)

# Quick check for missing values or inconsistent lengths
print("\n📊 Data Summary:")
print(data.info())
print(f"✅ Total records scraped: {len(data)}")

# Save CSV
data.to_csv("ambitionbox_companies_data.csv", index=False, encoding='utf-8-sig')
print("💾 Data saved to 'ambitionbox_companies_data.csv'")


🚀 Starting scrape...


Scraping pages: 100%|██████████| 500/500 [06:17<00:00,  1.32it/s]


📊 Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None
✅ Total records scraped: 0
💾 Data saved to 'ambitionbox_companies_data.csv'





In [7]:
data.head(10)