# Web scraping an entire website, storing the data in a CSV file, and uploading it to Kaggle

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Browser-like headers sent with every listing-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# Scrape configuration.
FIRST_PAGE = 1
LAST_PAGE = 500
REQUEST_TIMEOUT_SECONDS = 15   # without a timeout a stalled connection blocks forever
DELAY_SECONDS = 1              # polite delay between consecutive requests

# Output columns, in the order they appear in the CSV.
COLUMNS = [
    'COMPANY NAME',
    'RATING',
    'REVIEWS',
    'INDUSTRY',
    'INDIA HQ',
    'BRANCHES',
    'PACKAGE',
]


def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or None if *tag* is None."""
    return tag.text.strip() if tag is not None else None


def parse_company_card(card):
    """Extract one company's fields from a ``companyCardWrapper`` element.

    Args:
        card: A BeautifulSoup tag for a single company card.

    Returns:
        A dict keyed by ``COLUMNS``. Any field whose markup is not found in
        the card is left as None.
    """
    company = dict.fromkeys(COLUMNS)

    company['COMPANY NAME'] = _stripped_text(
        card.find('h2', class_="companyCardWrapper__companyName")
    )

    # The rating value sits in an inner div that is identified by its inline style.
    rating_box = card.find('div', class_="rating_text rating_text--md")
    if rating_box is not None:
        company['RATING'] = _stripped_text(
            rating_box.find('div', style="height:auto;padding-bottom:1px;")
        )

    company['REVIEWS'] = _stripped_text(
        card.find('span', class_="companyCardWrapper__companyRatingCount")
    )

    # Description text has the form "<industry> | <hq> +<branches>".
    description = card.find('span', class_="companyCardWrapper__interLinking")
    if description is not None:
        # str.split always returns at least one element, so index 0 is safe.
        parts = description.text.strip().split('|')
        company['INDUSTRY'] = parts[0].strip()
        if len(parts) > 1:
            location = parts[1].split('+')
            company['INDIA HQ'] = location[0].strip()
            if len(location) > 1:
                company['BRANCHES'] = location[1].strip()

    # The package figure is read from the second action link of the card.
    salary_div = card.find('div', class_="companyCardWrapper__tertiaryInformation")
    if salary_div is not None:
        action_links = salary_div.find_all('a', class_="companyCardWrapper__ActionWrapper")
        if len(action_links) > 1:
            company['PACKAGE'] = _stripped_text(
                action_links[1].find('span', class_="companyCardWrapper__ActionCount")
            )

    return company


# List to store all company data
all_companies = []
# Pages whose request failed, reported in the final summary.
failed_pages = []

print("🚀 Starting scrape...")

# One session reuses the underlying connection across all page requests.
with requests.Session() as session:
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = f"https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav&page={page}"

        try:
            response = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            # Treat 4xx/5xx responses as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as e:
            # Only network/HTTP failures are skipped; programming errors still surface.
            print(f"❌ Error on page {page}: {e}")
            failed_pages.append(page)
        else:
            soup = BeautifulSoup(response.content, 'lxml')

            # Find all company cards
            company_cards = soup.find_all('div', class_='companyCardWrapper')
            all_companies.extend(parse_company_card(card) for card in company_cards)

            print(f"✅ Page {page} scraped successfully (found {len(company_cards)} companies)")

        # Pause after failures too, so a struggling server is not hit back to back.
        time.sleep(DELAY_SECONDS)

# Create DataFrame; explicit columns keep the CSV header even if nothing was scraped.
data = pd.DataFrame(all_companies, columns=COLUMNS)

if failed_pages:
    print(f"\n⚠️ Scrape finished with {len(failed_pages)} failed page(s): {failed_pages}")
else:
    print("\n✅ All pages scraped successfully!")
print(f"Total companies scraped: {len(data)}")
print("\nFirst 5 rows:")
print(data.head())

# Save to CSV
data.to_csv('ambitionbox_companies.csv', index=False)
print("\n💾 Data saved to 'ambitionbox_companies.csv'")

🚀 Starting scrape...
✅ Page 1 scraped successfully (found 20 companies)
✅ Page 2 scraped successfully (found 20 companies)
✅ Page 3 scraped successfully (found 20 companies)
✅ Page 4 scraped successfully (found 20 companies)
✅ Page 5 scraped successfully (found 20 companies)
✅ Page 6 scraped successfully (found 20 companies)
✅ Page 7 scraped successfully (found 20 companies)
✅ Page 8 scraped successfully (found 20 companies)
✅ Page 9 scraped successfully (found 20 companies)
✅ Page 10 scraped successfully (found 20 companies)
✅ Page 11 scraped successfully (found 20 companies)
✅ Page 12 scraped successfully (found 20 companies)
✅ Page 13 scraped successfully (found 20 companies)
✅ Page 14 scraped successfully (found 20 companies)
✅ Page 15 scraped successfully (found 20 companies)
✅ Page 16 scraped successfully (found 20 companies)
✅ Page 17 scraped successfully (found 20 companies)
✅ Page 18 scraped successfully (found 20 companies)
✅ Page 19 scraped successfully (found 20 companies)


In [15]:
# Display up to the first 30 rows of the `data` DataFrame.
data.head(30)

Unnamed: 0,COMPANY NAME,RATING,REVIEWS,INDUSTRY,INDIA HQ,BRANCHES,PACKAGE
0,TCS,3.4,(1.1L),IT Services & Consulting,Bangalore / Bengaluru,430 other locations,9.4L
1,Accenture,3.7,(67.9k),IT Services & Consulting,Bangalore / Bengaluru,245 other locations,6.3L
2,Wipro,3.7,(60.9k),IT Services & Consulting,Bangalore / Bengaluru,367 other locations,4.6L
3,Cognizant,3.7,(57.8k),IT Services & Consulting,Hyderabad / Secunderabad,224 other locations,5.8L
4,Capgemini,3.7,(49.5k),IT Services & Consulting,Bangalore / Bengaluru,180 other locations,4.6L
5,HDFC Bank,3.8,(47.9k),Banking,Mumbai,1778 other locations,1.5L
6,Infosys,3.5,(45.5k),IT Services & Consulting,Bangalore / Bengaluru,239 other locations,5.1L
7,ICICI Bank,4.0,(43.6k),Banking,Mumbai,1416 other locations,1.5L
8,HCLTech,3.4,(42.6k),IT Services & Consulting,Chennai,223 other locations,3.6L
9,Tech Mahindra,3.4,(40.7k),IT Services & Consulting,Hyderabad / Secunderabad,328 other locations,2.8L
