# Functions

In [5]:
import unittest
import os
import bs4

def get_files_in_dir(dir_name, extension=None):
    """
    List the entries of a directory, optionally filtered by name suffix.

    Parameters
    ----------
    dir_name : str
        Path of the directory to list.
    extension : str, optional
        When given, only names ending with this string are kept. This is a
        plain suffix match, so "txt" also matches a name such as "notes_txt".

    Returns
    -------
    list of str or None
        Entry names (not full paths) in the order os.listdir yields them,
        or None when dir_name is missing or is not a directory.
    """
    # isdir is False for missing paths, regular files and special files
    # (devices, sockets) alike -- none of which os.listdir can enumerate.
    if not os.path.isdir(dir_name):
        return None
    names = os.listdir(dir_name)
    if extension is None:  # no filter requested
        return names
    return [name for name in names if name.endswith(extension)]

def get_bs4_obj(filename, parser="html.parser"):
    """
    Parse an HTML file into a BeautifulSoup object.

    Parameters
    ----------
    filename : str
        Path of the HTML file to parse.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming one explicitly keeps the
        parse tree the same on every machine and avoids bs4's
        "no parser was explicitly specified" warning.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed document, or None when filename is not an existing
        regular file (missing path or a directory).
    """
    if not os.path.isfile(filename):
        return None
    # Read bytes so BeautifulSoup detects the document's own encoding instead
    # of relying on the platform default used by text-mode open().
    with open(filename, "rb") as f:
        return bs4.BeautifulSoup(f, parser)


    
def get_alum_list(filename):
    """
    Extract the list-item texts from the alumni section of an HTML page.

    The section is located via the first <h2> whose text contains "lumni".
    Every <ul> that is a direct sibling of that heading, up to the next
    sibling <h2>, contributes the text of all <li> elements found inside it.

    Returns a list of strings; the list is empty when the file cannot be
    loaded or no matching heading exists.
    """
    soup = get_bs4_obj(filename)
    if soup is None:
        return []

    alumni_headings = [
        heading for heading in soup.find_all("h2") if "lumni" in heading.text
    ]
    if not alumni_headings:
        return []

    # Walk the siblings after the first matching heading, keeping each <ul>
    # until the next <h2> closes the section.
    section_lists = []
    node = alumni_headings[0].next_sibling
    while node is not None:
        if node.name == "ul":
            section_lists.append(node)
        elif node.name == "h2":
            break
        node = node.next_sibling

    # Flatten: the text of every <li> under every collected <ul>.
    return [
        item.text
        for section_list in section_lists
        for item in section_list.find_all("li")
    ]

# Manual check against the small fixture page; the returned list is the cell output.
get_alum_list('./test_html_files/uol.html')


['Name 1 ', 'Name 2 ', 'Name 3 ', 'Name 4 ']

# Unit tests

In [21]:


class TestListFiles(unittest.TestCase):
    def test_list_xists(self):
        self.assertIsNotNone(get_files_in_dir)
    
    def test_returns_list(self):
        res = get_files_in_dir("/")
        self.assertEqual(  type(res), list)
    
    def test_nonexistent(self):
        res = get_files_in_dir("/i/do/not/exist")
        self.assertIsNone(res)
    
    def test_nofilenames(self):
        res = get_files_in_dir("./test_files/test1.txt")
        self.assertIsNone(res)
        
    def test_get_two(self):
        res = get_files_in_dir("./test_files/")
        self.assertEqual(len(res), 3)
    
    def test_filter_to_two(self):
        res = get_files_in_dir("./test_files/", "txt")
        self.assertEqual(len(res), 2)
    
    def test_filter_to_one(self):
        res = get_files_in_dir("./test_files/", "html")
        self.assertEqual(len(res), 1)
    
    def test_filter_to_one_ext(self):
        res = get_files_in_dir("./test_files/", "html")
        
        self.assertTrue(res[0].endswith("html"))
    
    

    

# Run the tests inside the notebook: argv[0] is only a placeholder program name,
# '-v' turns on verbose output, and exit=False stops unittest from calling sys.exit().
# NOTE(review): unittest.main collects every TestCase in __main__, so classes
# left in the kernel by earlier cell runs are executed as well.
unittest.main(argv=['ingored', '-v'], exit=False)

test_alum_func_exists (__main__.TestExtractAlumni) ... ok
test_alum_func_ret_list (__main__.TestExtractAlumni) ... ok
test_filter_to_one (__main__.TestListFiles) ... ok
test_filter_to_one_ext (__main__.TestListFiles) ... ok
test_filter_to_two (__main__.TestListFiles) ... ok
test_get_two (__main__.TestListFiles) ... ok
test_list_xists (__main__.TestListFiles) ... ok
test_nofilenames (__main__.TestListFiles) ... ok
test_nonexistent (__main__.TestListFiles) ... ok
test_returns_list (__main__.TestListFiles) ... ok

----------------------------------------------------------------------
Ran 10 tests in 0.007s

OK


<unittest.main.TestProgram at 0x7fa60d69bb50>

In [7]:
class TestHTMLParse(unittest.TestCase):
    def test_get_alums_exists(self):
        self.assertIsNotNone(get_alum_list)
    
    def test_returns_list(self):
        res = get_alum_list("something.html")
        self.assertEqual( type(res), list )
    
    def test_load_html_not_none(self):
        res = get_bs4_obj("./test_html_files/uol.html")
        self.assertEqual(type(res), bs4.BeautifulSoup)
    
    def test_ret_4(self):
        res = get_alum_list('./test_html_files/uol.html')
        self.assertEqual(len(res), 4)
    
    def test_uni_london_works(self):
        res = get_alum_list('./test_html_files/uol_real.html')
        self.assertGreater(len(res), 0)
    
    def test_uni_leeds(self):
        res = get_alum_list('./test_html_files/leeds.html')
        self.assertIsNotNone(res)
    
    
        
        
# Run the tests inside the notebook: argv[0] is only a placeholder program name,
# '-v' turns on verbose output, and exit=False stops unittest from calling sys.exit().
# NOTE(review): unittest.main collects every TestCase in __main__, so which
# classes run depends on what the kernel currently has defined.
unittest.main(argv=['ingored', '-v'], exit=False)

test_get_alums_exists (__main__.TestHTMLParse) ... ok
test_load_html_not_none (__main__.TestHTMLParse) ... ok
test_ret_4 (__main__.TestHTMLParse) ... ok
test_returns_list (__main__.TestHTMLParse) ... ok
test_uni_leeds (__main__.TestHTMLParse) ... ok
test_uni_london_works (__main__.TestHTMLParse) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.335s

OK


<unittest.main.TestProgram at 0x7f2207dbd460>

# Data analysis

In [9]:
# Scan every file in the data directory and report those in which
# get_alum_list found at least one alumni entry.
dirname = "../../Data/wikipedia-university-pages/"
files = get_files_in_dir(dirname)
if files is None:
    # get_files_in_dir returns None for a missing or non-directory path;
    # fail with a clear message rather than a TypeError from len(None).
    raise FileNotFoundError("Data directory not found: " + dirname)
print("Found ", len(files))

for f in files:
    # os.path.join stays correct even if dirname loses its trailing slash.
    alums = get_alum_list(os.path.join(dirname, f))
    if len(alums) > 0:
        print("Found ", len(alums), "in",  f)
        


Found  1065
Found  9 in Wye_College.html
Found  21 in Queen's_College,_Taunton.html
Found  5 in St_Dominic's_Sixth_Form_College.html
Found  3 in University_of_Brighton.html
Found  44 in University_of_Ibadan.html
Found  12 in Woodhouse_College.html
Found  2 in Barnet_and_Southgate_College.html
Found  55 in List_of_University_College_London_people_in_the_Law.html
Found  11 in University_of_Worcester.html
Found  24 in Harrow_College.html
Found  47 in UCL_Medical_School.html
Found  2 in St._Matthew's_University.html
Found  10 in University_of_Bedfordshire.html
Found  39 in Ealing,_Hammersmith_and_West_London_College.html
Found  6 in London_Consortium.html
Found  1 in Institute_of_Commonwealth_Studies.html
Found  4 in Havering_Sixth_Form_College.html
Found  9 in College_of_North_West_London.html
Found  11 in University_of_Aberdeen.html
Found  10 in Royal_College_of_Music.html
Found  88 in University_of_the_Arts_London.html
Found  2 in Barts_and_The_London_School_of_Medicine_and_Dentistry.ht