# Web Scrapping using Beautiful Soup 

## Using Ambition Box website to scrap their first 20 weppages companies data and converting into a Pandas Dataframe

In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup

In [2]:
requests.get("https://www.ambitionbox.com/list-of-companies?campaign=homepage_companies_widget").text

'<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don\'t have permission to access "http&#58;&#47;&#47;www&#46;ambitionbox&#46;com&#47;list&#45;of&#45;companies&#63;" on this server.<P>\nReference&#32;&#35;18&#46;665d3a17&#46;1691439392&#46;1874c17\n</BODY>\n</HTML>\n'

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}

response = requests.get("https://www.ambitionbox.com/list-of-companies?campaign=homepage_companies_widget", headers=headers)

if response.status_code == 200:
    webpage = response.text
else:
    print("Failed to retrieve the webpage")


In [4]:
soup = BeautifulSoup(webpage, "lxml")

In [5]:
for i in soup.find_all("h2"):
    print(i.text.strip())

TCS
Accenture
Cognizant
Wipro
ICICI Bank
HDFC Bank
Infosys
Capgemini
Tech Mahindra
Genpact
HCLTech
Axis Bank
Concentrix Corporation
IBM
Amazon
Reliance Jio
Larsen & Toubro Limited
HDB Financial Services
Reliance Retail
Teleperformance
Companies by  Industry
Companies by  Locations
Companies by  Type
Companies by  Badges


In [6]:
company_elements = soup.find_all("div", class_="companyCardWrapper")
len(company_elements)

20

In [7]:
name = []
rating = []
reviews = []
ctype = []

for i in company_elements:
    name.append(i.find("h2").text.strip())
    
    rating_elem = i.find("span", class_="companyCardWrapper__companyRatingValue")
    if rating_elem:
        rating.append(rating_elem.text.strip())
    else:
        rating.append("N/A")
    
    reviews_elem = i.find("a", class_="companyCardWrapper__ActionWrapper")
    if reviews_elem:
        reviews.append(reviews_elem.text.strip())
    else:
        reviews.append("N/A")
        
    ctype_elem = i.find("span", class_="companyCardWrapper__interLinking")
    if ctype_elem:
        ctype.append(ctype_elem.text.strip())
    else:
        ctype.append("N/A")


In [8]:
ctype

['IT Services & Consulting | 1 Lakh+ Employees | Public | 55 years old | Mumbai +280 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 34 years old | Dublin +140 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Forbes Global 2000 | 29 years old | Teaneck. New Jersey. +111 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 78 years old | Bangalore/Bengaluru +242 more',
 'Banking | 1 Lakh+ Employees | Public | 29 years old | Mumbai +1176 more',
 'Banking | 1 Lakh+ Employees | Public | 29 years old | Mumbai +1399 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 42 years old | Bengaluru/Bangalore +135 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 56 years old | Paris +85 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 37 years old | Pune +230 more',
 'IT Services & Consulting | 50k-1 Lakh Employees | Public | 26 years old | New York +78 more',
 'IT Services & Consulting | 1 Lakh+ Employees | Public | 32

In [9]:
df = {"name":name ,"rating":rating, "reviews":reviews,"ctype":ctype}
df = pd.DataFrame(df)
df.head()

Unnamed: 0,name,rating,reviews,ctype
0,TCS,3.8,61.4k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
1,Accenture,4.1,39.2k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
2,Cognizant,3.9,35.8k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
3,Wipro,3.8,29.3k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
4,ICICI Bank,4.0,29.1k Reviews,Banking | 1 Lakh+ Employees | Public | 29 year...


In [10]:
df["reviews"].values

array(['61.4k Reviews', '39.2k Reviews', '35.8k Reviews', '29.3k Reviews',
       '29.1k Reviews', '28.5k Reviews', '26.8k Reviews', '24.8k Reviews',
       '23.4k Reviews', '22.6k Reviews', '22.5k Reviews', '19.2k Reviews',
       '18k Reviews', '17.4k Reviews', '17.3k Reviews', '17.3k Reviews',
       '16.4k Reviews', '15.4k Reviews', '14.9k Reviews', '14.4k Reviews'],
      dtype=object)

In [11]:
df["ctype"].values

array(['IT Services & Consulting | 1 Lakh+ Employees | Public | 55 years old | Mumbai +280 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Public | 34 years old | Dublin +140 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Forbes Global 2000 | 29 years old | Teaneck. New Jersey. +111 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Public | 78 years old | Bangalore/Bengaluru +242 more',
       'Banking | 1 Lakh+ Employees | Public | 29 years old | Mumbai +1176 more',
       'Banking | 1 Lakh+ Employees | Public | 29 years old | Mumbai +1399 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Public | 42 years old | Bengaluru/Bangalore +135 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Public | 56 years old | Paris +85 more',
       'IT Services & Consulting | 1 Lakh+ Employees | Public | 37 years old | Pune +230 more',
       'IT Services & Consulting | 50k-1 Lakh Employees | Public | 26 years old | New York +78 more',


In [12]:
import re

numeric_part_col1 = [int(re.search(r'(\d+)', val).group(1)) for val in reviews]

In [13]:
col2_data = [val.split(' | ') for val in ctype]
company_type = [entry[0] if len(entry) > 0 else "N/A" for entry in ctype]
employees = [entry[1] if len(entry) > 1 else "N/A" for entry in ctype]
hq = [entry[-1].split(' +')[0] if len(entry) > 4 else "N/A" for entry in ctype]
type_ = [entry[2] if len(entry) > 2 else "N/A" for entry in ctype]

In [14]:
data = {
    'Reviews': numeric_part_col1,
    'Company Type': company_type,
    'Employees': employees,
    'HQ': hq,
    'Type': type_
}

df1 = pd.DataFrame(data)
df1.head()

Unnamed: 0,Reviews,Company Type,Employees,HQ,Type
0,61,I,T,e,
1,39,I,T,e,
2,35,I,T,e,
3,29,I,T,e,
4,29,B,a,e,n


In [15]:
df = pd.concat([df, df1], axis=1)
df = df.drop(["reviews", "ctype"], axis=1)
df.head()

Unnamed: 0,name,rating,Reviews,Company Type,Employees,HQ,Type
0,TCS,3.8,61,I,T,e,
1,Accenture,4.1,39,I,T,e,
2,Cognizant,3.9,35,I,T,e,
3,Wipro,3.8,29,I,T,e,
4,ICICI Bank,4.0,29,B,a,e,n


In [16]:
df["Type"].values

array([' ', ' ', ' ', ' ', 'n', 'n', ' ', ' ', ' ', ' ', ' ', 'n', 'O',
       'f', 't', 'l', 'g', 'F', 't', 'O'], dtype=object)

In [17]:
final = pd.DataFrame()

for j in range(1, 21):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
    }
    response = requests.get(f"https://www.ambitionbox.com/list-of-companies?campaign=homepage_companies_widget&page={j}", headers=headers)

    if response.status_code == 200:
        webpage = response.text
        soup = BeautifulSoup(webpage, "lxml")
        
        company_elements = soup.find_all("div", class_="companyCardWrapper")
        
        name = []
        rating = []
        reviews = []
        ctype = []

        for i in company_elements:
            name.append(i.find("h2").text.strip())

            rating_elem = i.find("span", class_="companyCardWrapper__companyRatingValue")
            if rating_elem:
                rating.append(rating_elem.text.strip())
            else:
                rating.append("N/A")

            reviews_elem = i.find("a", class_="companyCardWrapper__ActionWrapper")
            if reviews_elem:
                reviews.append(reviews_elem.text.strip())
            else:
                reviews.append("N/A")

            ctype_elem = i.find("span", class_="companyCardWrapper__interLinking")  # Check if this element is correct
            if ctype_elem:
                ctype_text = ctype_elem.text.strip()
                ctype.append(ctype_text)
            else:
                ctype.append("N/A")


        page_data = {
            'Name': name,
            'Rating': rating,
            'Reviews': reviews,
            'Company Type': ctype
        }
        df_page = pd.DataFrame(page_data)
        
        final = final.append(df_page, ignore_index=True)

    else:
        print(f"Failed to retrieve the webpage for page {j}")


final


  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page, ignore_index=True)
  final = final.append(df_page,

Unnamed: 0,Name,Rating,Reviews,Company Type
0,TCS,3.8,61.4k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
1,Accenture,4.1,39.2k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
2,Cognizant,3.9,35.8k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
3,Wipro,3.8,29.3k Reviews,IT Services & Consulting | 1 Lakh+ Employees |...
4,ICICI Bank,4.0,29.1k Reviews,Banking | 1 Lakh+ Employees | Public | 29 year...
...,...,...,...,...
395,Fujitsu,4.0,1.3k Reviews,IT Services & Consulting | 5k-10k Employees | ...
396,Aavas Financiers,4.0,1.3k Reviews,Financial Services | 5k-10k Employees | Public...
397,Movate,3.6,1.3k Reviews,IT Services & Consulting | 10k-50k Employees (...
398,Gabriel India,4.2,1.3k Reviews,Auto Components | 1k-5k Employees | Public | 6...


In [18]:
import regex 

final['Reviews'] = final['Reviews'].str.replace('[a-zA-Z]', '', regex=True).astype(float)
final['Reviews'] *= 1000

In [19]:
final.head(2)

Unnamed: 0,Name,Rating,Reviews,Company Type
0,TCS,3.8,61400.0,IT Services & Consulting | 1 Lakh+ Employees |...
1,Accenture,4.1,39200.0,IT Services & Consulting | 1 Lakh+ Employees |...


In [20]:
final["Company Type"][0]

'IT Services & Consulting | 1 Lakh+ Employees | Public | 55 years old | Mumbai +280 more'

In [21]:
final['ctype'] = ""
final['employee'] = ""
final['sector'] = ""
final['old'] = ""
final['headquarter'] = ""

for idx, data in enumerate(final["Company Type"]):
    parts = [part.strip() for part in data.split('|')]
    
    final.loc[idx, 'ctype'] = parts[0]
    final.loc[idx, 'employee'] = parts[1] if len(parts) > 1 else "N/A"
    final.loc[idx, 'sector'] = parts[2] if len(parts) > 2 else "N/A"
    final.loc[idx, 'old'] = parts[3].split()[0] if len(parts) > 3 else "N/A"
    final.loc[idx, 'headquarter'] = parts[4].split()[0] if len(parts) > 4 else "N/A"


In [22]:
final.drop("Company Type", axis=1, inplace=True)
final.head()

Unnamed: 0,Name,Rating,Reviews,ctype,employee,sector,old,headquarter
0,TCS,3.8,61400.0,IT Services & Consulting,1 Lakh+ Employees,Public,55,Mumbai
1,Accenture,4.1,39200.0,IT Services & Consulting,1 Lakh+ Employees,Public,34,Dublin
2,Cognizant,3.9,35800.0,IT Services & Consulting,1 Lakh+ Employees,Forbes Global 2000,29,Teaneck.
3,Wipro,3.8,29300.0,IT Services & Consulting,1 Lakh+ Employees,Public,78,Bangalore/Bengaluru
4,ICICI Bank,4.0,29100.0,Banking,1 Lakh+ Employees,Public,29,Mumbai


In [23]:
final["employee"].nunique

<bound method IndexOpsMixin.nunique of 0               1 Lakh+ Employees
1               1 Lakh+ Employees
2               1 Lakh+ Employees
3               1 Lakh+ Employees
4               1 Lakh+ Employees
                  ...            
395              5k-10k Employees
396              5k-10k Employees
397    10k-50k Employees (Global)
398               1k-5k Employees
399               1k-5k Employees
Name: employee, Length: 400, dtype: object>

In [24]:
import re

def employee_details(no_of_employee):
    no_of_employee = re.sub(r'[^0-9a-zA-Z]', '', no_of_employee).lower()
    
    if "lakh" in no_of_employee:
        return int(re.search(r'\d+', no_of_employee).group()) * 100000 
    elif 'k' in no_of_employee:
        employee_values = list(map(int, re.findall(r'\d+', no_of_employee)))
        return sum(employee_values) / len(employee_values) * 1000
    else:
        return None

final['employee'] = final['employee'].apply(employee_details)


In [25]:
final.head()

Unnamed: 0,Name,Rating,Reviews,ctype,employee,sector,old,headquarter
0,TCS,3.8,61400.0,IT Services & Consulting,100000.0,Public,55,Mumbai
1,Accenture,4.1,39200.0,IT Services & Consulting,100000.0,Public,34,Dublin
2,Cognizant,3.9,35800.0,IT Services & Consulting,100000.0,Forbes Global 2000,29,Teaneck.
3,Wipro,3.8,29300.0,IT Services & Consulting,100000.0,Public,78,Bangalore/Bengaluru
4,ICICI Bank,4.0,29100.0,Banking,100000.0,Public,29,Mumbai
