# WEB SCRAPING

In [1]:
!pip3 install beautifulsoup4
!pip3 install requests



In [2]:
import re
import sys
import unicodedata
from io import StringIO
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup


In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

**Using web scraping to find the top 5 ranked institutions**

In [4]:
# NIRF 2021 Engineering ranking page; this URL is requested and parsed in the following cells.
nirf_url = "https://www.nirfindia.org/2021/EngineeringRanking.html"

**Obtain a response object by making a request for the HTML page from the above URL**

In [5]:
# Fetch the rankings page. The timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() surfaces HTTP errors (4xx/5xx) here
# instead of letting an error page be parsed as if it were the rankings.
response = requests.get(nirf_url, timeout=30)
response.raise_for_status()

In [6]:
# Parse the raw response body into a BeautifulSoup tree using the "html.parser" parser.
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
# Display the page's <title> element as a quick check that the expected page was fetched.
soup.title

<title> MoE, National Institute Ranking Framework (NIRF) </title>

**Importing relevant data**

In [8]:
# Preview only the beginning of the prettified markup: printing the whole page
# floods the notebook output and bloats the saved file.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html>
 <head>
  <script async="" src="//www.google-analytics.com/analytics.js">
  </script>
  <script>
   (function (i, s, o, g, r, a, m) {i['GoogleAnalyticsObject'] = r; i[r] = i[r] || function () {(i[r].q = i[r].q || []).push(arguments)}, i[r].l = 1 * new Date(); a = s.createElement(o),m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)})(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');ga('create', 'UA-75867016-1', 'auto');ga('send', 'pageview');
  </script>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   MoE, National Institute Ranking Framework (NIRF)
  </title>
  <link href="https://www.nirfindia.org/css/style.css" rel="stylesheet" type="text/css"/>
  <script src="https://www.nirfindia.org/js/modernizr.js" type="text/javascript">
  </script>
  <script src="https://www.nirfindia.org/js/jquery-min.js" type="text/javascript">
  </script>
  <script src="https://www.n

In [9]:
def extract_column_from_header(row):
    """
    Return the cleaned text of a table header cell, or None if it is purely numeric.

    The first <br>, <a> and <sup> child found in the cell (if any) is removed,
    so `row` is modified in place. The remaining text nodes are joined with
    single spaces and surrounding whitespace is stripped.

    Input: a table header/data cell element (any object exposing `.br`, `.a`,
           `.sup` and `.contents`)
    Returns: the stripped text (possibly an empty string, which callers are
             expected to filter out), or None when the text is only digits
    """
    # Remove line breaks, links and superscript markers one after another, so each
    # lookup sees the cell as it is after the previous removal.
    for tag_name in ('br', 'a', 'sup'):
        unwanted = getattr(row, tag_name)
        if unwanted:
            unwanted.extract()

    # Join text nodes only: a nested tag left in the cell is not a string and
    # would make str.join raise TypeError.
    column_name = ' '.join(
        piece for piece in row.contents if isinstance(piece, str)
    ).strip()

    # Purely numeric cells are not column names
    if column_name.isdigit():
        return None
    return column_name

In [10]:
# Collect the usable column names from every <th> cell found on the page
column_names = []
element = soup.find_all('th')
for header_cell in element:
    try:
        name = extract_column_from_header(header_cell)
    except TypeError:
        # Best-effort: skip a cell whose contents cannot be joined as text,
        # without silencing unrelated errors (or KeyboardInterrupt) as a bare
        # `except` would.
        continue
    # Drop None (numeric headers) and empty strings
    if name:
        column_names.append(name)

In [11]:
# Sanity check of the scraped headers. find_all('th') was run on the whole page, so header
# cells from every table are included (presumably why the metric names repeat in the output).
print(column_names) #this confirms we have accessed the right table

['Institute ID', 'Name', 'City', 'State', 'Score', 'Rank', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (100)', 'OI (100)', 'PERCEPTION (100)', 'TLR (100)', 'RPC (100)', 'GO (

**Initial dataframe**

In [12]:
# Select the ranking table(s) by CSS class and parse the first one into a DataFrame.
table = soup.find_all('table', "table table-condensed")
if not table:
    raise ValueError("No table with class 'table table-condensed' was found on the page")
# read_html expects a path, URL or file-like object; passing a literal HTML string is
# deprecated in recent pandas, so the markup is wrapped in StringIO.
df = pd.read_html(StringIO(str(table[0])))[0]

In [13]:
# Keep the first five rows of the ranking table (the top-5 institutes shown below)
df_top = df.head(n=5)
df_top

Unnamed: 0,Institute ID,Name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology MadrasMore Deta...,Chennai,Tamil Nadu,90.19,1
1,IR-E-I-1074,Indian Institute of Technology DelhiMore Detai...,New Delhi,Delhi,88.96,2
2,IR-E-U-0306,Indian Institute of Technology BombayMore Deta...,Mumbai,Maharashtra,85.16,3
3,IR-E-I-1075,Indian Institute of Technology KanpurMore Deta...,Kanpur,Uttar Pradesh,83.22,4
4,IR-E-U-0573,Indian Institute of Technology KharagpurMore D...,Kharagpur,West Bengal,82.03,5


We can either download each institute's data (a PDF file) to the local machine using the `wget` module, as shown below:

In [14]:
# Collect the Institute ID of each top-5 institute into a plain Python list
ins_id = df_top['Institute ID'].tolist()
ins_id

['IR-E-U-0456', 'IR-E-I-1074', 'IR-E-U-0306', 'IR-E-I-1075', 'IR-E-U-0573']

In [15]:
!pip install wget



In [16]:
# Now let's download each Institute record

from pathlib import Path

import wget

# PDFs are saved to "<home>/Downloads/NIRF TOP5" (e.g. C:\Users\anwar\Downloads\NIRF TOP5).
# Change download_dir to store them elsewhere.
download_dir = Path.home() / 'Downloads' / 'NIRF TOP5'
# Create the folder if it is missing, so it no longer has to be created by hand
download_dir.mkdir(parents=True, exist_ok=True)

for i in ins_id:
    print(f'\nBeginning file download for Institute ID: {i}...')
    url = f'https://www.nirfindia.org/nirfpdfcdn/2021/pdf/Engineering/{i}.pdf'
    wget.download(url, str(download_dir / f'{i}.pdf'))


Beginning file download for Institute ID: IR-E-U-0456...

Beginning file download for Institute ID: IR-E-I-1074...

Beginning file download for Institute ID: IR-E-U-0306...

Beginning file download for Institute ID: IR-E-I-1075...

Beginning file download for Institute ID: IR-E-U-0573...


Alternatively, we can read the PDFs remotely, without downloading them to the local machine, using the tabula-py library, which returns the tables found in a PDF as a list of DataFrames.

In [17]:
!pip install tabula-py



In [21]:
import tabula

In [22]:
# Read every page of the top-ranked institute's PDF straight from its URL. The result is
# indexed like a list of tables, so it gets its own name instead of being overwritten.
nirf_1_tables = tabula.read_pdf(f'https://www.nirfindia.org/nirfpdfcdn/2021/pdf/Engineering/{ins_id[0]}.pdf', pages='all')
# Index 2 selects the table displayed below (student intake / placement figures).
# NOTE(review): assumes that table is always third in the PDF; confirm before reusing for other institutes.
nirf_1 = nirf_1_tables[2]
nirf_1

Unnamed: 0,Academic Year,No. of first year\rstudents intake in the\ryear,No. of first year\rstudents admitted in\rthe year,Academic Year.1,No. of students\radmitted through\rLateral entry,Academic Year.2,No. of students\rgraduating in\rminimum stipulated\rtime,No. of students\rplaced,Median salary of\rplaced\rgraduates(Amount in\rRs.),No. of students\rselected for Higher\rStudies
0,2014-15,466,441,2015-16,0,2017-18,334,283,1306000(Thirteen\rlakhs six thousand),51
1,2015-16,466,458,2016-17,0,2018-19,362,268,1360000(Thirteen\rLakhs Sixty thousand),65
2,2016-17,466,465,2017-18,0,2019-20,374,286,1500000(Fifteen lakhs),43
