In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from tqdm import tqdm
import re

# Lecture 6 - IES Web scraper

by Vítek Macháček

March 23, 2020

* Putting it all together
* OOP + Pandas + Requests + Scraping


## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

![Simple Object Structure](https://github.com/vitekzkytek/PythonDataIES/blob/master/06_IES_Web_Scraper/img/class_example.png?raw=1)

In [None]:
class Person:
    '''
    A Person class is *Abstract* - intended not to be used directly, but rather to be inherited.

    It stores a validated ``name`` and ``email`` and provides course-list
    validation that subclasses can reuse in their own constructors.
    '''

    # Raw string: "\." has to reach the regex engine as an escaped dot. In a plain
    # string literal it is an invalid escape sequence that newer Pythons warn about.
    _EMAIL_PATTERN = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    # A course ident is accepted when it has this length and one of these prefixes.
    _COURSE_IDENT_LENGTH = 6
    _COURSE_PREFIXES = ('JEM', 'JEB')

    def __init__(self, name, email):
        '''
        Validate name and e-mail and store them as instance attributes.

        Args:
            name: non-empty string.
            email: string matching the e-mail pattern.

        Raises:
            PersonException: if the name or the e-mail is not valid.
        '''
        if not self._verify_name(name):
            raise PersonException('Name is invalid')
        self.name = name

        if not self._verify_email(email):
            raise PersonException('Email is invalid')
        self.email = email

    def _verify_name(self, name):
        '''
        Return True when name is a non-empty string, False otherwise.
        '''
        # The type check comes first so that values without a length (None, numbers)
        # are reported as invalid instead of raising TypeError from len().
        return isinstance(name, str) and len(name) > 0

    def _verify_email(self, email):
        '''
        Return True when email is a string that matches the e-mail pattern.
        '''
        # Non-strings are rejected up front; re.search would raise TypeError on them.
        if not isinstance(email, str):
            return False
        return re.search(self._EMAIL_PATTERN, email) is not None

    def _verify_course(self, course):
        '''
        Return True when course is a string of the expected length and prefix.
        '''
        return (
            isinstance(course, str)
            and len(course) == self._COURSE_IDENT_LENGTH
            and course.startswith(self._COURSE_PREFIXES)
        )

    def _verify_courses(self, courses):
        '''
        Is list of courses a list containing valid course idents?

        Returns False when ``courses`` is not a list or when any item fails
        _verify_course; an empty list is accepted.
        '''
        if not isinstance(courses, list):
            return False
        return all(self._verify_course(course) for course in courses)

    def describe(self):
        '''
        Placeholder to be overridden by subclasses.

        Raises:
            PersonException: always.
        '''
        raise PersonException("Cannot describe abstract Person class. Override this in Teacher or Student class")

    def get_name(self):
        '''
        Return the stored name.

        Raises:
            PersonException: if the name is missing or empty.
        '''
        name = getattr(self, 'name', None)
        if not name:
            # The exception has to be raised; returning it would hand the caller
            # an exception object in place of a name.
            raise PersonException('No name defined')
        return name

    def get_email(self):
        '''
        Return the stored e-mail.

        Raises:
            PersonException: if the e-mail is missing or empty.
        '''
        email = getattr(self, 'email', None)
        if not email:
            raise PersonException('No email defined')
        return email


class PersonException(Exception):
    '''
    Object handling Person related errors
    '''
    pass

class TeacherException(Exception):
    '''Error type for Teacher related failures.'''

class StudentException(Exception):
    '''Error type for Student related failures.'''

# The "abstract" class can still be instantiated directly; only describe() refuses to run on it.
someone = Person('Name','email@somewhere.com')

In [None]:
someone.email

'email@somewhere.com'

In [None]:
someone.describe()

PersonException: Cannot describe abstract Person class. Override this in Teacher or Student class

In [None]:
 class Teacher(Person):
    '''
    A Person who teaches: keeps name, email and the list of courses taught
    '''
    def __init__(self,name,email,teaching_courses):
        '''
        Runs the Person's constructor first (name and e-mail validation),
        then validates and stores the list of taught courses.
        Raises TeacherException when the course list does not validate.
        '''
        # parent part: name + email
        super().__init__(name,email)

        # teacher part: courses
        courses_ok = self._verify_courses(teaching_courses)
        if not courses_ok:
            raise TeacherException('Cannot validate courses.')
        self.teaching_courses = teaching_courses

    def describe(self):
        """ Overrides Person.describe: prints name, e-mail and taught courses """
        template = 'I am {}, my email is {} and I teach following courses: {}'
        print(template.format(self.name,self.email,self.teaching_courses))
    
        
class Student(Person):
    '''
    A Person who studies: keeps name, email and the list of courses studied
    '''
    def __init__(self,name,email,studying_courses):
        '''
        Runs the Person's constructor first (name and e-mail validation),
        then validates and stores the list of studied courses.
        Raises StudentException when the course list does not validate.
        '''
        super().__init__(name,email)

        courses_ok = self._verify_courses(studying_courses)
        if not courses_ok:
            raise StudentException('Cannot validate courses.')
        self.studying_courses = studying_courses

    def describe(self):
        """ Overrides Person.describe: prints the student's name and studied courses """
        template = "I am Vítek's student. My name is {name}. I study following courses: {courses}"
        print(template.format(name=self.name,courses=self.studying_courses))
        
# A Teacher takes a list of course idents in addition to name and e-mail.
vitek = Teacher('Vítek','vitezkytek@gmail.com',['JEM207'])

# Teacher overrides describe(), so this prints instead of raising PersonException.
vitek.describe()

I am Vítek, my email is vitezkytek@gmail.com and I teach following courses: ['JEM207']


In [None]:
# A Student is built the same way, with the list of courses being studied.
my_student = Student('Honza','honza@fsv.cuni.cz',['JEM207','JEB111'])
my_student.describe()

I am Vítek's student. My name is Honza. I study following courses: ['JEM207', 'JEB111']


In [None]:
my_student.name

'Honza'

### Exceptions

In [None]:
# Divide 5 by every integer from 5 down to -4; the only failing step is i == 0.
for i in range(5, -5, -1):
    try:
        print(5 / i)
    except ZeroDivisionError:
        # This is the exception a division by zero actually raises, so the
        # message belongs here rather than under an unrelated exception type.
        print('you should not divide by zero')
    except (StudentException, TeacherException, ValueError):
        # A division never raises these; the clause only shows that several
        # exception types can share one handler. There is deliberately no bare
        # "except:", which would silently hide every other kind of error.
        pass

1.0
1.25
1.6666666666666667
2.5
5.0

-5.0
-2.5
-1.6666666666666667
-1.25


In [None]:
?Person

[0;31mInit signature:[0m [0mPerson[0m[0;34m([0m[0mname[0m[0;34m,[0m [0memail[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      A Person class is *Abstract* - Intended not to be used directly, but rather to be inherited.
[0;31mInit docstring:[0m Person's constructor accepts name and e-mail and set it as class attributes, but first it checks whether they are in valid format
[0;31mType:[0m           type
[0;31mSubclasses:[0m     Teacher, Student


In [None]:
?Teacher

[0;31mInit signature:[0m [0mTeacher[0m[0;34m([0m[0mname[0m[0;34m,[0m [0memail[0m[0;34m,[0m [0mteaching_courses[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      Teacher class inherits from Person and contains name, email and list of courses taught
[0;31mInit docstring:[0m Teacher's constructor calls the Person's constructor first and then do his stuff
[0;31mType:[0m           type
[0;31mSubclasses:[0m     


### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


https://www.promptcloud.com/blog/how-to-read-and-respect-robots-file/

In [None]:
requests.get('http://ies.fsv.cuni.cz/robots.txt')

<Response [404]>

In [None]:
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /

User-agent: Googlebot
Allow: /
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=
Disallow: *pois_in_place=
Disallow: *pois_in_place_distance=

User-agent: SeznamBot
Allow: /
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region

## Task:
* A parser of IES websites with the following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019 won't be covered
    * Also all courses! But no list of courses is available ...

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

1. understand structure of the website

In [None]:
def getSoup(link, delay=0.1, timeout=30):
    '''
    Download a page and parse it into a BeautifulSoup tree.

    Args:
        link: URL of the page to download.
        delay: seconds to sleep before the request, to be kind to the website.
        timeout: seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        BeautifulSoup tree built by the lxml parser from the response text,
        decoded as UTF-8.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    '''
    sleep(delay)
    response = requests.get(link, timeout=timeout)
    # The encoding is forced before .text is read, so the body is decoded as UTF-8.
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'lxml')

In [None]:
soup = getSoup('https://ies.fsv.cuni.cz/en/node/48')

In [None]:
tds = soup.findAll('td', {'class':'peopleTableCellName'})
tds

[<td class="peopleTableCellName">
 <a href="/en/staff/bajgar"><b> Matěj Bajgar M.Sc., DPhil.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/barunik"><b>doc. PhDr. Jozef Baruník Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/bauerm"><b>doc. PhDr. Michal Bauer Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/baxajaromir"><b>PhDr. Jaromír Baxa Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/bertoli"><b>doc. Paola Bertoli M.A., MSc., Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/cahlik"><b>doc. Ing. Tomáš Cahlík CSc.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/fcech"><b>PhDr. František Čech Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/cervinka"><b>RNDr. Michal Červinka Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/chytilova"><b>doc. PhDr. Julie Chytilová Ph.D.</b></a>
 

In [None]:
def getAllLinks(link):
    '''
    Collect absolute URLs of the personal pages listed on a people overview page.

    Args:
        link: URL of the overview page.

    Returns:
        List of URLs, one per name cell (``td.peopleTableCellName``) that
        contains a link, in page order. The hrefs found in the page are
        site-relative, so the site root is prepended.
    '''
    soup = getSoup(link)
    # find_all is the current name of the method; findAll is a deprecated alias.
    cells = soup.find_all('td', {'class': 'peopleTableCellName'})
    anchors = (cell.find('a') for cell in cells)
    # Cells without a usable link are skipped instead of crashing the whole run.
    return [
        'https://ies.fsv.cuni.cz' + anchor['href']
        for anchor in anchors
        if anchor is not None and anchor.has_attr('href')
    ]

# Personal-page URLs of everybody listed on the node/48 overview page.
links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
links

['https://ies.fsv.cuni.cz/en/staff/bajgar',
 'https://ies.fsv.cuni.cz/en/staff/barunik',
 'https://ies.fsv.cuni.cz/en/staff/bauerm',
 'https://ies.fsv.cuni.cz/en/staff/baxajaromir',
 'https://ies.fsv.cuni.cz/en/staff/bertoli',
 'https://ies.fsv.cuni.cz/en/staff/cahlik',
 'https://ies.fsv.cuni.cz/en/staff/fcech',
 'https://ies.fsv.cuni.cz/en/staff/cervinka',
 'https://ies.fsv.cuni.cz/en/staff/chytilova',
 'https://ies.fsv.cuni.cz/en/staff/dedek',
 'https://ies.fsv.cuni.cz/en/staff/dolezalova',
 'https://ies.fsv.cuni.cz/en/staff/gersl',
 'https://ies.fsv.cuni.cz/en/staff/gregor',
 'https://ies.fsv.cuni.cz/en/staff/havranek',
 'https://ies.fsv.cuni.cz/en/staff/irsova',
 'https://ies.fsv.cuni.cz/en/staff/hlavacekm',
 'https://ies.fsv.cuni.cz/en/staff/holub',
 'https://ies.fsv.cuni.cz/en/staff/horvath',
 'https://ies.fsv.cuni.cz/en/staff/jakubik',
 'https://ies.fsv.cuni.cz/en/staff/janda',
 'https://ies.fsv.cuni.cz/en/staff/jansky',
 'https://ies.fsv.cuni.cz/en/staff/kemenyova',
 'https://i

## Person's characteristics?

[A TEACHER!](https://ies.fsv.cuni.cz/en/staff/barunik)

In [None]:
def getName(link):
    '''
    Download the page at ``link`` and return the text of its first <h2>
    element with surrounding whitespace removed.
    '''
    heading = getSoup(link).find('h2')
    return heading.text.strip()

# getName downloads the page itself, so this issues one request per link.
names = [getName(link) for link in links]
names

['Matěj Bajgar M.Sc., DPhil.',
 'doc. PhDr. Jozef Baruník Ph.D.',
 'doc. PhDr. Michal Bauer Ph.D.',
 'PhDr. Jaromír Baxa Ph.D.',
 'doc. Paola Bertoli M.A., MSc., Ph.D.',
 'doc. Ing. Tomáš Cahlík CSc.',
 'PhDr. František Čech Ph.D.',
 'RNDr. Michal Červinka Ph.D.',
 'doc. PhDr. Julie Chytilová Ph.D.',
 'prof. Ing. Oldřich Dědek CSc.',
 'doc. PhDr. Ing. Antonie Doležalová Ph.D.',
 'doc. PhDr. Adam Geršl Ph.D.',
 'doc. PhDr. Martin Gregor Ph.D.',
 'prof. PhDr. Tomáš Havránek Ph.D.',
 'doc. PhDr. Zuzana Havránková Ph.D.',
 'PhDr. Michal Hlaváček Ph.D.',
 'doc. Mgr. Tomáš Holub Ph.D.',
 'prof. Roman Horváth Ph.D.',
 'doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.',
 'prof. Ing. Karel Janda M.A., Dr., Ph.D.',
 'doc. Petr Janský Ph.D.',
 'Ing. Irena Kemény',
 'prof. Ing. Evžen Kočenda M.A., Ph.D., DSc.',
 'prof. Ing. et Ing. Luboš Komárek Ph.D., MSc., MBA',
 'prof. PhDr. Ladislav Krištoufek Ph.D.',
 'PhDr. Jiří Kukačka Ph.D.',
 'Jiří Novák M.Sc., Ph.D., Deloitte Corporate Chair',
 'PhDr. Miros

In [None]:
def getPhone(link):
    '''
    Download a personal page and return the text that follows its "Phone:" label.

    Args:
        link: URL of the personal page.

    Returns:
        The stripped text right after ``<strong>Phone:</strong>`` (possibly an
        empty string), or None when the page has no such label or the label is
        not followed by plain text.
    '''
    soup = getSoup(link)
    # "string=" is the current name of the argument formerly called "text=".
    label = soup.find('strong', string='Phone:')
    if label is None:
        return None
    value = label.next_sibling
    # Only text nodes (str subclasses) can be stripped; a tag or a missing
    # sibling is reported as "no value".
    return value.strip() if isinstance(value, str) else None

# getPhone fetches each page again, in addition to the downloads made for the names.
[getPhone(link) for link in links]

['',
 '+420(776)259273',
 '222 112 329',
 '222 112 309',
 '',
 '222 112 318',
 '+420 776 535 106',
 '+420 26605 2345',
 '222 112 318',
 '222 112 325',
 'n.a.',
 '222 112 313',
 '+420 222 112 306',
 'Please contact me by e-mail',
 '222 112 309',
 '736 524 520',
 '+420/22441-2010',
 '222 112 317',
 '+49 69 9511 19393',
 '+420 222 112 316',
 '',
 '222 112 323',
 '222 112 321',
 '736524516',
 'line 312 (IES), line 2243 (UTIA)',
 '+420 602 767 305',
 '+420 222 112 314',
 '+420776661654',
 '',
 '222 112 309',
 '',
 '(+420) 220 199  477',
 '+1 301 530 5032',
 '',
 '',
 '222 112 309',
 '+420 222 112 326',
 '733-644-300',
 '602 161 710',
 '732 344 585',
 '222 112 314',
 '+420 777 576 698']

In [None]:
def getNextSiblingOfStrong(soup,characteristic):
    '''
    Return the text that directly follows a ``<strong>`` label in a parsed page.

    Args:
        soup: parsed page (anything offering BeautifulSoup's ``find``).
        characteristic: exact label text including the colon, e.g. 'Email:'.

    Returns:
        The stripped text node following the label (possibly an empty string),
        or None when the label is not on the page or is not followed by plain text.
    '''
    # "string=" is the current name of the argument formerly called "text=".
    strong = soup.find('strong', string=characteristic)
    if strong is None:
        return None
    sibling = strong.next_sibling
    # Only text nodes (str subclasses) can be stripped; a tag or a missing
    # sibling is reported as "no value".
    return sibling.strip() if isinstance(sibling, str) else None
# Parse one page once, then read any labelled field from the same soup.
soup = getSoup(links[1])
getNextSiblingOfStrong(soup,'Email:')

'barunik [AT] fsv [DOT] cuni [DOT] cz'

In [None]:
def getMoreCharacteristics(link, characteristics):
    '''
    Download one page and read several labelled fields from it.

    Args:
        link: URL of the page.
        characteristics: iterable of label texts including the colon,
            e.g. ['Phone:', 'Office:'].

    Returns:
        pandas Series indexed by the labels with every ':' removed; the
        values come from getNextSiblingOfStrong.
    '''
    soup = getSoup(link)
    values = {}
    for label in characteristics:
        key = label.replace(':','')
        values[key] = getNextSiblingOfStrong(soup,label)
    return pd.Series(values)

#[getMoreCharacteristics(link,['Phone:','Office:','Position:']) for link in links[:2]]
getMoreCharacteristics(links[1],['Phone:','Office:','Position:'])

Phone           +420(776)259273
Office                      503
Position    Associate Professor
dtype: object

* Let's do an object!

In [None]:
class Person:
    '''
    One person scraped from a personal page.

    The page is downloaded once in the constructor and the parsed tree is kept
    in ``self.soup``, so further fields can be read without another request.

    Attributes:
        soup: parsed page.
        name: text of the first <h2> on the page, stripped (None if absent).
        office: text following the "Office:" label (None if absent).
        phone: text following the "Phone:" label (None if absent).
        characteristics: pandas Series with name, office and phone.
    '''
    def __init__(self,link):
        '''
        Download the page at ``link`` and extract name, office and phone.
        '''
        self.soup = getSoup(link)
        self.name = self.getName()
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')
        self.characteristics = self.getCharacteristics()

    def getNextSiblingOfStrong(self, characteristic):
        '''
        Return the stripped text that directly follows the ``<strong>`` label
        ``characteristic`` (e.g. 'Office:'), or None when the label is missing
        or is not followed by plain text.
        '''
        # "string=" is the current name of the argument formerly called "text=".
        strong = self.soup.find('strong', string=characteristic)
        if strong is None:
            return None
        sibling = strong.next_sibling
        # Only text nodes (str subclasses) can be stripped.
        return sibling.strip() if isinstance(sibling, str) else None

    def getName(self):
        '''
        Return the text of the first <h2> on the page, or None if there is none.
        '''
        heading = self.soup.find('h2')
        if heading is None:
            return None
        # Some headings carry leading whitespace in the markup; strip it so the
        # name does not start with a space.
        return heading.text.strip()

    def getCharacteristics(self):
        '''
        Return name, office and phone as a pandas Series.
        '''
        return pd.Series({
            'name':self.name,
            'office':self.office,
            'phone':self.phone
        })

# Each Person(link) downloads one page; only the first two links are used here (presumably to keep the demo quick).
people = [Person(link) for link in links[:2]]

In [None]:
people[-1].characteristics

name      doc. PhDr. Jozef Baruník Ph.D.
office                               503
phone                    +420(776)259273
dtype: object

In [None]:
[p.name for p in people]

[' Matěj Bajgar M.Sc., DPhil.', 'doc. PhDr. Jozef Baruník Ph.D.']

In [None]:
people[0].soup.find('div', {'class':'col-sm-12'})

<div class="col-sm-12 col-xs-9">
<ul>
<li>
<a href="http://ies.fsv.cuni.cz/default/admin_auth/login/" target="_blank" title="">E-Control</a>
</li>
<li>
<a href="/en/node/263" title="">Contact</a>
</li>
<li>
<a href="http://fsveng.fsv.cuni.cz/" target="_blank" title="">FSV UK</a>
</li>
<li>
<a href="http://dl1.cuni.cz/course/index.php?categoryid=44" target="_blank" title="">Moodle</a>
</li>
<li>
<a href="https://is.cuni.cz/studium/" target="_blank" title="">SIS</a>
</li>
<li>
<a href="http://karolinka.fsv.cuni.cz" target="_blank" title="">Karolinka</a>
</li>
</ul>
</div>

In [None]:
#We still have raw data if needed!
[p.soup.find('h2').text for p in people]

[' Matěj Bajgar M.Sc., DPhil.', 'doc. PhDr. Jozef Baruník Ph.D.']

In [None]:
pd.DataFrame([p.characteristics for p in people])

AttributeError: 'Person' object has no attribute 'characteristics'