# Web Scraping

https://www.ambitionbox.com/list-of-companies?page=1

- We'll be getting companies information listed on ambitionbox.com



In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [6]:
import subprocess

# this function runs the 'command' on the shell and returns the output
def run_command(command):
    try:
        # Run the command
        result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Print the output

        print("Error:", result.stderr)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")

In [7]:
import re
# this function matches the pattern for ctype, old, no_emp and return the same
def company_desc(info_string):
    type_pattern = r'^[^|]+'
    yr_pattern = r'\|\s(\d+)\syears\sold'
    emp_pattern = r'\|\s([\d\s\w+-]+)\sEmployees'
    
    type_match = re.search(type_pattern, info_string)
    yr_match = re.search(yr_pattern, info_string)
    emp_match = re.search(emp_pattern, info_string)
    
    ctype = type_match.group(0).strip() if type_match else None
    old = yr_match.group(1).strip() if yr_match else None
    no_emp = emp_match.group(1).strip() if emp_match else None

    return ctype, old, no_emp
    # print(no_emp)

# company_desc(desc[0])
    

In [8]:
final_df = pd.DataFrame()

# Scraping pages 1,2,3 from the website
for j in range(1,4):
    num=j
    # this command gets the source code of the url 
    command = f" curl -X GET 'https://www.ambitionbox.com/list-of-companies?page={num}'"
    data = run_command(command)
    # creating object of BeautifulSoup-> This is a class from the bs4 (Beautiful Soup) library, which is a popular library in Python for parsing HTML and XML documents.
    soup = BeautifulSoup(data,'lxml')
    
    # print(soup.prettify())

    # storing html code of information cards about comapnies from the website
    companies_info = soup.find_all('div',class_="companyCardWrapper")
    # len(companies_info)
        
    name = [] # storing names of companies
    rating = [] # storing rating of companies
    desc = [] # storing description of companies
    
    for i in companies_info:
        
        #.text gets the content inside the tags mentioned. .strip removes extrap spaces
        temp_name = i.find('h2').text.strip()
        temp_rating = i.find('span',class_="companyCardWrapper__companyRatingValue").text.strip()
        temp_desc = i.find('span',class_ = "companyCardWrapper__interLinking").text.strip()
    
        name.append(temp_name)
        rating.append(temp_rating)
        # salary.append(temp_salary)
        desc.append(temp_desc)
       
    #using len helps to find if we got the info of all companies
    # print(len(name))
    # print(len(rating))
    # print(len(desc))

    # extracting info (comapny type, how old is the coompany, no of employees)form the description
    ctype = [] 
    old = []
    no_emp = []
    for i in desc:
        temp_ctype, temp_old, temp_no_emp = company_desc(i)
        ctype.append(temp_ctype)
        old.append(temp_old)
        no_emp.append(temp_no_emp)
    
    # print(len(ctype))
    # print(len(old))
    # print(len(no_emp))

    # defining what will be the columns of our dataset
    col = {'Name': name, 'Company Type': ctype, 'Rating': rating, 'No of Employees': no_emp, 'Old(yrs)': old}
    # dataset of each page
    df = pd.DataFrame(col)
    #concat df with final df
    final_df = pd.concat([final_df, df], ignore_index=True)
    

Error:   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0
100 16384    0 16384    0     0   2537      0 --:--:--  0:00:06 --:--:--  3874
100   98k    0   98k    0     0  13212      0 --:--:--  0:00:07 --:--:-- 21667
100  297k    0  297k    0     0  39400      0 --:--:--  0:00:07 --:--:-- 88006

Error:   % Total    % Received % Xferd  Ave

In [9]:
final_df

Unnamed: 0,Name,Company Type,Rating,No of Employees,Old(yrs)
0,TCS,IT Services & Consulting,3.8,1 Lakh+,56
1,Accenture,IT Services & Consulting,4.0,1 Lakh+,35
2,Cognizant,IT Services & Consulting,3.9,1 Lakh+,30
3,Wipro,IT Services & Consulting,3.8,1 Lakh+,79
4,Capgemini,IT Services & Consulting,3.8,1 Lakh+,57
5,HDFC Bank,Banking,3.9,1 Lakh+,30
6,ICICI Bank,Banking,4.0,1 Lakh+,30
7,Infosys,IT Services & Consulting,3.8,1 Lakh+,43
8,HCLTech,IT Services & Consulting,3.6,1 Lakh+,33
9,Tech Mahindra,IT Services & Consulting,3.7,1 Lakh+,38


In [10]:
#changing dtypes
final_df['Rating'] = final_df['Rating'].astype(float)
final_df['Old(yrs)'] = final_df['Old(yrs)'].astype(int)

In [11]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             60 non-null     object 
 1   Company Type     60 non-null     object 
 2   Rating           60 non-null     float64
 3   No of Employees  60 non-null     object 
 4   Old(yrs)         60 non-null     int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 2.5+ KB
