## Test return_cat_pages

In [5]:
def return_cat_pages(categories, return_list = [], parent_list = [-99]):
    
    """
    Returns all pages associated with provided categories.
    Categories should be provided as (a list of) strings: 'LGBT musicians by nationality'
    """
    
    # create wikipedia api instance
    import wikipediaapi
    wiki_en = wikipediaapi.Wikipedia('en')
    
    for category in categories:
        
        # add 'Category:' before category name
        wiki_category = format_wiki_cat(category)
        
        # does the page exist?
        wiki_page_exist = does_wiki_exist(wiki_category)
        
        # Does wikipedia recognize the category?
        if wiki_page_exist == True:
                        
            # return all category members. pages and categories
            catmembers_dict = return_catmembers_dict(wiki_category)
            
            # return list of category members as pages only
            catmembers_list(catmembers_dict, return_list, parent_list)
            
        else:
            print("Does the", wiki_category, "exist?", wiki_page_exist)
     
    return return_list

## Dependencies

In [6]:
def format_wiki_cat(category):
    
    # add 'Category:' to proceed provided string
    format_name = str('Category:' + str(category))
    
    return format_name

In [7]:
def does_wiki_exist(name):
    """
    Given a name, check to see if a wikipedia page with that name exists
    """
    # create instance of wikipedia api
    import wikipediaapi
    wiki_en = wikipediaapi.Wikipedia('en')
    
    # get wikipedia page
    wiki_page = wiki_en.page(name)
    
    # does wikipedia page exist
    wiki_page_exist = wiki_page.exists()
    
    return wiki_page_exist

In [8]:
def return_catmembers_dict(page_name):
    """
    Given page name, return category members as dict
    """
    
    import wikipediaapi
    wiki_en = wikipediaapi.Wikipedia('en')
    
    wiki_page = wiki_en.page(page_name)
    catmembers_dict = wiki_page.categorymembers
    
    return catmembers_dict

In [9]:
def catmembers_list(categorymembers, return_list, parent_list):
    """
    Using a dictionary of wikipedia page category members, return a list of all bottom category members (ns == 0)
    Provide an empty list that will be populated with bottom category members (return_list)
    Provide an empty list that will be populated with parent categories (parent_list)
    """
    
    import wikipediaapi
    wiki_en = wikipediaapi.Wikipedia('en')
    
    seen = set(return_list)  # functionality to prevent duplicates; code source: https://stackoverflow.com/questions/19834806/is-there-a-more-pythonic-way-to-prevent-adding-a-duplicate-to-a-list
    
    for c in categorymembers.values():
        
        # check for duplicates
        if c.title not in seen:
               
            if c.ns == 0:
                return_list.append(c.title)

            else:
                # sometimes category members do not begin with 'Category'; will result in read time error
                if c.title[0:8] != 'Category':
                    continue

                # some artists have categories; not interested in digging deeper
                if c.title == parent_list[-1]:
                    continue

                # some categories are empty; will result in read time error
                wiki_page = wiki_en.page(c.title)
                catmembers_dict = wiki_page.categorymembers
                if catmembers_dict == {}:
                    continue

                # keep track of previous categories - helps with debugging
                parent_list.append(c.title)
                
                # keep digging down
                catmembers_list(c.categorymembers, return_list, parent_list)
            
    return return_list

## Test **Expand outside the test category list**

### Load priors

In [17]:
category = 'Female musicians by nationality'

In [18]:
import wikipediaapi
wiki_en = wikipediaapi.Wikipedia('en')

try:
    # add 'Category:' before category name
    wiki_category = format_wiki_cat(category)
except:
    print("Error! Check format_wiki_cat()")

try:
    # does the page exist?
    wiki_page_exist = does_wiki_exist(wiki_category)
except:
    print("Error! Check does_wiki_exist()")

### Did it work?

In [19]:
wiki_page_exist

True

In [21]:
catmembers_dict = return_catmembers_dict(wiki_category)

In [23]:
return_list = []
parent_list = [-99]

catmember_pages = catmembers_list(catmembers_dict, return_list, parent_list)

### Did it work?

In [24]:
len(catmember_pages)

26337

## Test **Does filter reduce the number of returns**? 

In [25]:
def filter_wiki_cat(return_list, term):
    filtered_list = []
    
    import wikipediaapi
    wiki_en = wikipediaapi.Wikipedia('en')
    
    for name in return_list:
        
        wiki_page = wiki_en.page(name)
        match_count = 0
        
        for category in wiki_page.categories:
            find_term = category.find(term)
            
            if find_term > -1:
                match_count = match_count + 1
                
        if match_count > -1:
            filtered_list.append(name)
            
    return filtered_list

In [26]:
filter_catmember_pages = filter_wiki_cat(catmember_pages, term = 'death')

In [29]:
f'Did the filter work? {len(filter_catmember_pages) < len(catmember_pages)}'

'Did the filter work? False'

In [27]:
len(filter_catmember_pages)

26337