In [7]:
import pandas as pd
import json
import time
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from bs4 import BeautifulSoup


# Output columns filled from the card's action links, in the order the links
# are read from the card (index 0 -> 'reviews', index 1 -> 'salaries', ...).
_ACTION_FIELDS = ('reviews', 'salaries', 'interviews', 'jobs', 'benefits', 'photos')


def _meta_content(card, itemprop):
    """Return the ``content`` of the card's ``<meta itemprop=...>`` tag, or '' if the tag is absent."""
    tag = card.find('meta', itemprop=itemprop)
    return tag['content'] if tag is not None else ''


def _rating_values(card, header_class):
    """Return the text of the rating-values span that follows the given header span, or '' if no such header."""
    header = card.find('span', class_=header_class)
    if header is None:
        return ''
    return header.find_next('span', class_='companyCardWrapper__ratingValues').get_text(strip=True)


def _parse_company_card(card):
    """Turn one ``div.companyCardWrapper`` element into a flat dict of strings.

    Name, rating, rating count and industry/location are treated as required:
    if any of their elements is missing the lookup raises and the caller
    decides what to do with the card. Position, URL, the action counts and the
    two "rated for" fields fall back to '' when absent.
    """
    record = {
        'position': _meta_content(card, 'position'),
        'name': card.find('h2', class_='companyCardWrapper__companyName').get_text(strip=True),
        'url': _meta_content(card, 'url'),
        'rating': card.find('div', class_='companyCardWrapper__companyRating').find('div').get_text(strip=True),
        'rating_count': card.find('span', class_='companyCardWrapper__companyRatingCount').get_text(strip=True),
        'industry_location': card.find('span', class_='companyCardWrapper__interLinking').get_text(strip=True),
    }

    # Action links are positional; a card with fewer links leaves the
    # trailing fields empty instead of failing.
    actions = card.find_all('a', class_='companyCardWrapper__ActionWrapper')
    for idx, field in enumerate(_ACTION_FIELDS):
        if idx < len(actions):
            record[field] = actions[idx].find(
                'span', class_='companyCardWrapper__ActionCount'
            ).get_text(strip=True)
        else:
            record[field] = ''

    record['highly_rated_for'] = _rating_values(card, 'companyCardWrapper__ratingHeader--high')
    record['critically_rated_for'] = _rating_values(card, 'companyCardWrapper__ratingHeader--critical')
    return record


def scrape_ambitionbox_page(url, page_num=1, render_wait=5):
    """Load one company-listing page in headless Edge and parse its company cards.

    Args:
        url: Address of the listing page to load.
        page_num: Page number, used only to label the progress/error messages.
        render_wait: Seconds to sleep after navigation before reading the page
            source, giving the page time to render.

    Returns:
        A list with one dict per successfully parsed card, with the keys
        'position', 'name', 'url', 'rating', 'rating_count',
        'industry_location', 'reviews', 'salaries', 'interviews', 'jobs',
        'benefits', 'photos', 'highly_rated_for' and 'critically_rated_for'.
        All values are strings as they appear on the page.

    Raises:
        Whatever the browser launch or navigation raises. The browser is shut
        down before the exception propagates.
    """
    options = Options()
    options.use_chromium = True
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = webdriver.Edge(options=options)
    try:
        driver.get(url)
        time.sleep(render_wait)
        page_html = driver.page_source
    finally:
        # Always release the browser: callers catch failures and move on to the
        # next page, so a leak here would accumulate one Edge process per error.
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")

    companies = []
    for card in soup.select('div.companyCardWrapper'):
        try:
            data = _parse_company_card(card)
        except Exception as e:
            # Best effort: one malformed card must not discard the rest of the page.
            print(f"Error parsing company on page {page_num}: {e}")
            continue

        print(f"[{page_num}] {data['name']} | Rating: {data['rating']} | Reviews: {data['reviews']}")
        companies.append(data)

    return companies


# Master list to hold all companies
all_companies = []
# Pages whose scrape raised, recorded so gaps in the dataset are visible and can be retried
failed_pages = []

# Loop over all 500 pages
for page in range(1, 501):
    print(f"\n--- Scraping Page {page} ---")
    url = f"https://www.ambitionbox.com/list-of-companies?campaign=homepage_companies_widget&page={page}"
    try:
        companies = scrape_ambitionbox_page(url, page_num=page)
        all_companies.extend(companies)  # Add to master list
        time.sleep(1)  # Be polite to server
    except Exception as e:
        # Best effort: one bad page should not abort the whole run.
        print(f"Failed to scrape page {page}: {e}")
        failed_pages.append(page)

if failed_pages:
    print(f"\n{len(failed_pages)} page(s) failed and are missing from the output: {failed_pages}")

# Convert to DataFrame
final_df = pd.DataFrame(all_companies)

if final_df.empty:
    # With no rows the frame has no columns, so the column selection in the
    # preview would raise KeyError; skipping the export also avoids replacing
    # files from an earlier successful run with empty ones.
    print("\nNo companies were scraped; skipping export and preview.")
else:
    # Save to single CSV and JSON
    final_df.to_csv("ambitionbox_all_500_pages.csv", index=False, encoding='utf-8')
    final_df.to_json("ambitionbox_all_500_pages.json", orient='records', indent=2, force_ascii=False)

    # Preview
    print("\nFinal DataFrame Preview:")
    print(final_df[['name', 'rating', 'reviews']].head())



--- Scraping Page 1 ---
[1] TCS | Rating: 3.6 | Reviews: 96.8k
[1] Accenture | Rating: 3.8 | Reviews: 62.3k
[1] Wipro | Rating: 3.7 | Reviews: 56.7k
[1] Cognizant | Rating: 3.7 | Reviews: 53.8k
[1] Capgemini | Rating: 3.7 | Reviews: 45.5k
[1] HDFC Bank | Rating: 3.9 | Reviews: 42.9k
[1] Infosys | Rating: 3.6 | Reviews: 42.4k
[1] ICICI Bank | Rating: 4.0 | Reviews: 41.4k
[1] HCLTech | Rating: 3.5 | Reviews: 39.1k
[1] Tech Mahindra | Rating: 3.5 | Reviews: 37.7k
[1] Genpact | Rating: 3.8 | Reviews: 35.2k
[1] Teleperformance | Rating: 3.9 | Reviews: 32k
[1] Concentrix Corporation | Rating: 3.7 | Reviews: 28.2k
[1] Axis Bank | Rating: 3.7 | Reviews: 27.8k
[1] Amazon | Rating: 4.0 | Reviews: 26.9k
[1] Jio | Rating: 4.0 | Reviews: 25.2k
[1] iEnergizer | Rating: 4.7 | Reviews: 24.3k
[1] Reliance Retail | Rating: 3.9 | Reviews: 23.9k
[1] IBM | Rating: 4.0 | Reviews: 23.3k
[1] LTIMindtree | Rating: 3.7 | Reviews: 22.5k

--- Scraping Page 2 ---
[2] HDB Financial Services | Rating: 3.9 | Reviews

In [8]:
import pandas as pd
# Reload the scraped dataset from the CSV file so it can be inspected
# without re-running the scrape; the bare `df` renders the frame as the cell output.
df = pd.read_csv('ambitionbox_all_500_pages.csv')
df

Unnamed: 0,position,name,url,rating,rating_count,industry_location,reviews,salaries,interviews,jobs,benefits,photos,highly_rated_for,critically_rated_for
0,1,TCS,https://www.ambitionbox.com/overview/tcs-overview,3.6,(96.8k),IT Services & Consulting | Bangalore / Bengalu...,96.8k,9.1L,11k,152,12k,86,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits, Wor..."
1,2,Accenture,https://www.ambitionbox.com/overview/accenture...,3.8,(62.3k),IT Services & Consulting | Bangalore / Bengalu...,62.3k,6.1L,8.5k,10.1k,7.4k,48,,"Promotions / Appraisal, Salary & Benefits"
2,3,Wipro,https://www.ambitionbox.com/overview/wipro-ove...,3.7,(56.7k),IT Services & Consulting | Bangalore / Bengalu...,56.7k,4.6L,6k,4.8k,5.3k,101,Job Security,"Promotions / Appraisal, Salary & Benefits, Wor..."
3,4,Cognizant,https://www.ambitionbox.com/overview/cognizant...,3.7,(53.8k),IT Services & Consulting | Hyderabad / Secunde...,53.8k,5.8L,5.8k,574,6.1k,82,,"Promotions / Appraisal, Salary & Benefits, Wor..."
4,5,Capgemini,https://www.ambitionbox.com/overview/capgemini...,3.7,(45.5k),IT Services & Consulting | Bangalore / Bengalu...,45.5k,4.5L,5k,1.3k,4.2k,41,"Work Life Balance, Job Security","Promotions / Appraisal, Salary & Benefits, Wor..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9835,16,Bhagyalaxmi Rolling mill,https://www.ambitionbox.com/overview/bhagyalax...,3.6,(91),Iron & Steel | Jalna +5 other locations,91,374,3,3,6,--,,"Promotions / Appraisal, Salary & Benefits, Com..."
9836,17,Adhunik Power,https://www.ambitionbox.com/overview/adhunik-p...,4.1,(92),Power | Jamshedpur +5 other locations,92,412,7,--,4,--,"Work Life Balance, Job Security, Work Satisfac...",Promotions / Appraisal
9837,18,Avon Cycles,https://www.ambitionbox.com/overview/avon-cycl...,3.9,(130),Automobile | Ludhiana +2 other locations,130,463,18,9,13,--,,"Company Culture, Job Security, Work Satisfaction"
9838,19,Degania Medical Devices,https://www.ambitionbox.com/overview/degania-m...,4.0,(92),Medical Equipment | Gurgaon / Gurugram +5 othe...,92,430,10,--,11,--,"Skill Development / Learning, Company Culture,...",Promotions / Appraisal


In [10]:
# Summarise the loaded frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9840 entries, 0 to 9839
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   position              9840 non-null   int64  
 1   name                  9840 non-null   object 
 2   url                   9840 non-null   object 
 3   rating                9840 non-null   float64
 4   rating_count          9840 non-null   object 
 5   industry_location     9840 non-null   object 
 6   reviews               9840 non-null   object 
 7   salaries              9840 non-null   object 
 8   interviews            9840 non-null   object 
 9   jobs                  9840 non-null   object 
 10  benefits              9840 non-null   object 
 11  photos                9840 non-null   object 
 12  highly_rated_for      5825 non-null   object 
 13  critically_rated_for  7466 non-null   object 
dtypes: float64(1), int64(1), object(12)
memory usage: 1.1+ MB
