# Here this crawler will extract names of all the universities from a webpage and save the them into a csv file.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## **This is a crawler function which reads date from the website given below. The website has the names of most of the universities in the world and we are trying to read that and write all that into a csv file.**

link: https://www.4icu.org/reviews/index2.htm

In [2]:
def crawler_function():
    maxpage=27
    names=[]
    
    #using this for loop it iterates over all the pages 
    #present on the website.
    for page in range(2,maxpage+1,1):
        
        #here a new link is generated for all the pages on website
        url="https://www.4icu.org/reviews/index"+str(page)+".htm"
        
        
        #request to get the web page
        source_code=requests.get(url)
        
        #convert source code of web page into text
        plain_text=source_code.text
        
        
        soup=BeautifulSoup(plain_text)
        x=0
        
        
        #loop below basically extracts the 
        #lines having the name of universities
        for link in soup.findAll('a',):#{ 'href'  }):
            l=str(link)
            x=re.search("s/[1-9]+.htm",l)
            if(x):
                names.append(l)        
                
                
    return names

## This names variable has a list of names of all the Universities.

In [3]:
names=crawler_function()

In [4]:
names[1:20]

['<a href="/reviews/17715.htm"> Abbottabad University of Science and Technology</a>',
 '<a href="/reviews/14616.htm"> Abdul Wali Khan University Mardan</a>',
 '<a href="/reviews/15143.htm"> Abdullah Gül Üniversitesi</a>',
 '<a href="/reviews/17274.htm"> Abdulrahman Al-Sumait Memorial University</a>',
 '<a href="/reviews/4763.htm"> Abertay University</a>',
 '<a href="/reviews/4823.htm"> Aberystwyth University</a>',
 '<a href="/reviews/17817.htm"> Abhilashi University</a>',
 '<a href="/reviews/6637.htm"> Abilene Christian University</a>',
 '<a href="/reviews/17921.htm"> Abourihan Higher Education Institute</a>',
 '<a href="/reviews/11138.htm"> Abra State Institute of Science and Technology</a>',
 '<a href="/reviews/16788.htm"> Abraham Baldwin Agricultural College</a>',
 '<a href="/reviews/14773.htm"> Abu Ali ibn Sino nomidagi Buxoro Davlat Tibbiyot Instituti</a>',
 '<a href="/reviews/17118.htm"> Abu Dhabi Polytechnic</a>',
 '<a href="/reviews/4663.htm"> Abu Dhabi University</a>',
 '<a hr

# Now we will do pre-processing

In [5]:
#remove the uses less parts using re library
def pre_processing(names):
    for i in range(len(names)):
        l=names[i]
        l=re.sub("</a>","",l,1)
        pos=re.search(">",l).start()+1
        l=l[pos:-1].strip()
        names[i]=l
    return names

In [6]:
names=pre_processing(names)

In [7]:
names[1:20]

['Abbottabad University of Science and Technolog',
 'Abdul Wali Khan University Marda',
 'Abdullah Gül Üniversites',
 'Abdulrahman Al-Sumait Memorial Universit',
 'Abertay Universit',
 'Aberystwyth Universit',
 'Abhilashi Universit',
 'Abilene Christian Universit',
 'Abourihan Higher Education Institut',
 'Abra State Institute of Science and Technolog',
 'Abraham Baldwin Agricultural Colleg',
 'Abu Ali ibn Sino nomidagi Buxoro Davlat Tibbiyot Institut',
 'Abu Dhabi Polytechni',
 'Abu Dhabi Universit',
 'Abubakar Tafawa Balewa Universit',
 'Academia de Muzica Gheorghe Dim',
 'Academia de Muzica, Teatru si Arte Plastic',
 'Academia de Studii Economice din Moldov',
 'Academia Nacional Superior de Orquestr']

In [8]:
names

['Aarhus Universite',
 'Abbottabad University of Science and Technolog',
 'Abdul Wali Khan University Marda',
 'Abdullah Gül Üniversites',
 'Abdulrahman Al-Sumait Memorial Universit',
 'Abertay Universit',
 'Aberystwyth Universit',
 'Abhilashi Universit',
 'Abilene Christian Universit',
 'Abourihan Higher Education Institut',
 'Abra State Institute of Science and Technolog',
 'Abraham Baldwin Agricultural Colleg',
 'Abu Ali ibn Sino nomidagi Buxoro Davlat Tibbiyot Institut',
 'Abu Dhabi Polytechni',
 'Abu Dhabi Universit',
 'Abubakar Tafawa Balewa Universit',
 'Academia de Muzica Gheorghe Dim',
 'Academia de Muzica, Teatru si Arte Plastic',
 'Academia de Studii Economice din Moldov',
 'Academia Nacional Superior de Orquestr',
 'Academia Rerum Civiliu',
 'Academic City University Colleg',
 'Académie des Beaux-Arts de Tourna',
 'Académie Franco-Américaine de Managemen',
 'Academie voor Hoger Kunst- en Cultuuronderwij',
 'Academy of Art Universit',
 'Academy of Education of Kyrgyzsta',
 '

In [9]:
len(names) #total number of all the universities

9568

# Here we write all the names present in this "names[ ]" into a csv file. 
# Open the csv file generate in excel. it is saved in same location as this file. 

In [10]:
#convert list into pandas dataframe and save into a csv file.
df=pd.DataFrame(names)
df.to_csv("Names_of_all_the_universities.csv")