# Gather data for all pages

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [7]:
# Module-level accumulators: one list per output column. Each starts empty
# and is meant to receive one value per scraped company.
company_name = list()
rating = list()
reviews = list()
company_type = list()
office = list()
high_rated_reasons = list()
critical_rated_reason = list()

# Web-scrape the data

In [8]:
def web_scrap_page(page):
    """Fetch one AmbitionBox company-listing page and return its company cards.

    A new Chrome WebDriver session is started for the call, the listing URL
    for ``page`` is loaded, the rendered HTML is captured, and the browser is
    closed again before parsing.

    Parameters
    ----------
    page : int
        Page number substituted into the ``page`` query parameter of the URL.

    Returns
    -------
    bs4.element.ResultSet
        Every ``<div class="companyCardWrapper">`` element found in the page
        source (empty when nothing matches).
    """
    # Step 1: Fetch the HTML data
    url = f"https://www.ambitionbox.com/list-of-companies?page={page}"
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        webpage = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page raises;
        # otherwise every failed page would leave a Chrome process running.
        driver.quit()

    # Step 2: Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(webpage, 'lxml')
    return soup.find_all("div", class_="companyCardWrapper")

In [9]:
def rated_reasons(company_names):
    """Extract the highly-rated and critically-rated reasons of one company card.

    Parameters
    ----------
    company_names : bs4 element
        A single company card (despite the plural name); it must support
        ``.prettify()``.

    Returns
    -------
    tuple
        ``(high_rated_reasons, low_rated_reasons)``. Each item is the reasons
        joined with ``", "`` (e.g. ``"Job Security, Work Life Balance"``), or
        ``None`` when the section is missing or has no text. A real ``None``
        is returned rather than the string ``"None"`` so the value is stored
        as missing data.
    """
    # The lookups run on a re-parsed copy of this card's own markup.
    html = company_names.prettify()
    soup = BeautifulSoup(html, 'html.parser')

    def _reasons_under(header_class):
        """Return the reasons text that follows the given header span, or None."""
        header = soup.find("span", class_=header_class)
        if header is None:
            return None
        values = header.find_parent().find_next_sibling(
            "span", class_="companyCardWrapper__ratingValues"
        )
        if values is None:
            return None
        # Join the individual reasons directly. Going through str(list) and
        # stripping brackets/quotes would delete apostrophes and brackets that
        # belong to a reason and could leave stray double quotes behind.
        parts = [part.strip() for part in values.get_text(strip=True).split(",")]
        return ", ".join(part for part in parts if part) or None

    high_rated = _reasons_under("companyCardWrapper__ratingHeader--high")
    low_rated = _reasons_under("companyCardWrapper__ratingHeader--critical")
    return high_rated, low_rated

# Fetch all columns

In [10]:
def _split_type_and_office(text):
    """Split a card's "type | office" text into a (company_type, office) pair."""
    text = text.strip()
    if not text:
        return "N/A", "N/A"
    if '|' in text:
        type_part, office_part = text.split('|', 1)
        # An empty half is reported with the same "N/A" marker as other gaps.
        return type_part.strip() or "N/A", office_part.strip() or "N/A"
    # Without a separator the whole text is kept as the office value.
    return "N/A", text


def Fetch_all_columns(company_names):
    """Parse each company card and append one row to the module-level lists.

    For every card the name, rating, review count, company type, office and
    the two rated-reasons values are extracted. Fields whose element is not
    found are recorded as ``"N/A"``.

    Parameters
    ----------
    company_names : iterable of bs4 elements
        The company cards to parse, e.g. the result of ``web_scrap_page``.

    Returns
    -------
    None
        Results are appended to ``company_name``, ``rating``, ``reviews``,
        ``company_type``, ``office``, ``high_rated_reasons`` and
        ``critical_rated_reason``.
    """
    for card in company_names:
        # Company name
        name = card.find("h2")
        name_text = name.get_text(strip=True) if name else "N/A"

        # Rating
        rating_div = card.find("div", style="height:auto;padding-bottom:1px;")
        rating_text = rating_div.get_text(strip=True) if rating_div else "N/A"

        # Review count
        review_span = card.find("span", class_="companyCardWrapper__ActionCount")
        reviews_text = review_span.get_text(strip=True) if review_span else "N/A"

        # Company type and head office share one span; guard against the span
        # being absent instead of failing with AttributeError.
        ctype_hq = card.find("span", class_="companyCardWrapper__interLinking")
        ctype_hq_text = ctype_hq.get_text(strip=True) if ctype_hq else ""
        company_type_text, office_text = _split_type_and_office(ctype_hq_text)

        # High rated / critically rated reasons
        high, low = rated_reasons(card)

        # Append only after every field has been extracted, so an exception
        # part-way through a card cannot leave the lists with unequal lengths.
        company_name.append(name_text)
        rating.append(rating_text)
        reviews.append(reviews_text)
        company_type.append(company_type_text)
        office.append(office_text)
        high_rated_reasons.append(high)
        critical_rated_reason.append(low)


In [13]:
def main(page):
    """Scrape listing pages 1 through ``page`` and accumulate their rows.

    Each page's company cards are fetched with ``web_scrap_page`` and passed
    to ``Fetch_all_columns``; a progress line is printed before and after
    each step.
    """
    def _process(page_no):
        # Fetch one page, then add its parsed cards to the column lists.
        print(page_no, " page ")
        cards = web_scrap_page(page_no)
        print(page_no, " page data fetch.")
        Fetch_all_columns(cards)
        print(page_no, " page's content add into the list.")

    print("Start")
    for page_no in range(1, page + 1):
        _process(page_no)

In [None]:
# Number of listing pages to scrape; main() walks pages 1..TOTAL_PAGES.
TOTAL_PAGES = 352
main(TOTAL_PAGES)

In [15]:
# Assemble the accumulated column lists into one DataFrame; the pairs below
# fix both the column labels and their left-to-right order.
df = pd.DataFrame(dict([
    ("Company name", company_name),
    ("Rating", rating),
    ("Reviews", reviews),
    ("Company type", company_type),
    ("Office", office),
    ("Highly Rated For", high_rated_reasons),
    ("Critically Rated For", critical_rated_reason),
]))

In [16]:
# Display the assembled DataFrame as this cell's output.
df

Unnamed: 0,Company name,Rating,Reviews,Company type,Office,Highly Rated For,Critically Rated For
0,TCS,3.7,85.2k,IT Services & Consulting,Bangalore / Bengaluru +375 other locations,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits, Wor..."
1,Accenture,3.9,53.3k,IT Services & Consulting,Bangalore / Bengaluru +193 other locations,"Company Culture, Job Security, Skill Developme...",Promotions / Appraisal
2,Wipro,3.7,50.6k,IT Services & Consulting,Bangalore / Bengaluru +338 other locations,Job Security,"Promotions / Appraisal, Salary & Benefits"
3,Cognizant,3.8,47.7k,IT Services & Consulting,Chennai +191 other locations,,"Promotions / Appraisal, Salary & Benefits"
4,Capgemini,3.8,39.4k,IT Services & Consulting,Bangalore / Bengaluru +155 other locations,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits"
...,...,...,...,...,...,...,...
7015,Liquiloans,3.1,118,FinTech,Mumbai +9 other locations,,"Promotions / Appraisal, Work Satisfaction, Job..."
7016,Bharat Petroleum Mumbai Refinery,4.2,118,,Mumbai +10 other locations,"Salary & Benefits, Company Culture, Work Satis...",
7017,Trinity Mobility,2.9,118,IT Services & Consulting,Bangalore / Bengaluru +6 other locations,,"Work Life Balance, Company Culture, Salary & B..."
7018,The Nuance Group AG,4.2,118,IT Services & Consulting,Bangalore / Bengaluru +6 other locations,"Salary & Benefits, Work Life Balance, Company ...",


In [24]:
# Save the scraped table to CSV; index=False keeps the row index out of the file.
df.to_csv("Ambitionbox_Company_list.csv",index=False)

In [25]:
# Read the saved CSV back into a separate DataFrame.
# NOTE(review): read_csv's default NA handling presumably turns "N/A" cells into NaN — confirm.
df_new = pd.read_csv('Ambitionbox_Company_list.csv')

In [26]:
# Show 10 randomly chosen rows; no random_state is passed, so the selection varies per run.
df_new.sample(10)

Unnamed: 0,Company name,Rating,Reviews,Company type,Office,Highly Rated For,Critically Rated For
6853,Deccan iServices,3.6,121,,Chennai +7 other locations,,"Promotions / Appraisal, Salary & Benefits"
1866,Novac Technology Solutions,3.9,436,Software Product,Chennai +20 other locations,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits"
5745,Jeena & Company,4.0,144,Logistics,Mumbai +23 other locations,"Job Security, Work Satisfaction",
1480,Rockwell Automation,3.8,542,Industrial Automation,Pune +20 other locations,"Work Life Balance, Company Culture",Promotions / Appraisal
3941,Groww,3.8,212,Financial Services,Bangalore / Bengaluru +8 other locations,"Company Culture, Job Security, Work Life Balance",Promotions / Appraisal
6344,Expeditors International,3.8,131,Logistics,Chennai +19 other locations,Job Security,"Promotions / Appraisal, Work Satisfaction"
701,AtkinsRealis,4.0,1.1k,Engineering & Construction,Bangalore / Bengaluru +54 other locations,"Company Culture, Job Security, Work Life Balance",Promotions / Appraisal
3225,DSV - Global Transport and Logistics,4.0,258,Logistics,Mumbai +31 other locations,,
198,Bharat Electronics,4.2,2.9k,Electronics Manufacturing,Bangalore / Bengaluru +61 other locations,"Work Life Balance, Company Culture, Skill Deve...",Promotions / Appraisal
755,IIFL SAMASTA,4.0,1k,NBFC,Bangalore / Bengaluru +298 other locations,"Skill Development / Learning, Job Security, Sa...",
