# Workflow: Return list of category member pages

In [73]:
def return_cat_pages(categories, return_list=None, parent_list=None):
    """
    Return the titles of all pages found under the provided categories.

    Parameters
    ----------
    categories : str or iterable of str
        Category name(s) WITHOUT the 'Category:' prefix,
        e.g. ['LGBT musicians by nationality']. A single string is treated
        as a one-element list.
    return_list : list, optional
        List that page titles are appended to. A new empty list is created
        for every call when omitted.
    parent_list : list, optional
        List that records the sub-categories that were descended into
        (debugging aid). A new [-99] list is created for every call when
        omitted.

    Returns
    -------
    list
        `return_list`, extended in place with the collected page titles.

    Notes
    -----
    Failures in the helper functions are reported with print() and the
    offending category is skipped; they are not raised to the caller.
    """
    # Build the containers per call. Mutable defaults in the signature are
    # created once and would silently carry results over between calls.
    if return_list is None:
        return_list = []
    if parent_list is None:
        parent_list = [-99]

    # A bare string would otherwise be iterated character by character.
    if isinstance(categories, str):
        categories = [categories]

    for category in categories:
        try:
            # add 'Category:' before category name
            wiki_category = format_wiki_cat(category)
        except Exception:
            print("Error! Check format_wiki_cat()")
            # nothing usable to look up; move on instead of reusing a stale name
            continue

        try:
            # does the page exist?
            wiki_page_exist = does_wiki_exist(wiki_category)
        except Exception:
            print("Error! Check does_wiki_exist()")
            continue

        # Does wikipedia recognize the category?
        if wiki_page_exist:
            # all category members: pages and sub-categories
            catmembers_dict = return_catmembers_dict(wiki_category)

            # keep pages only; appends to return_list in place
            catmembers_list(catmembers_dict, return_list, parent_list)
        else:
            print("Does the", wiki_category, "exist?", wiki_page_exist)

    return return_list

## Dependencies

In [74]:
def format_wiki_cat(category):
    """
    Return the category name prefixed with 'Category:'.

    The argument is converted with str() first, so non-string values are accepted.
    """
    return f'Category:{category!s}'

In [75]:
def does_wiki_exist(name):
    """
    Tell whether a Wikipedia page titled `name` exists.

    A client for the 'en' wiki is created on every call; the return value is
    whatever the page object's exists() reports.
    """
    import wikipediaapi

    # client -> page handle -> existence check, in one chain
    return wikipediaapi.Wikipedia('en').page(name).exists()

In [76]:
def return_catmembers_dict(page_name):
    """
    Look up `page_name` on the 'en' wiki and return its `categorymembers`.

    The caller in this notebook passes a name that already carries the
    'Category:' prefix.
    """
    import wikipediaapi

    page = wikipediaapi.Wikipedia('en').page(page_name)
    return page.categorymembers

In [77]:
def _collect_member_pages(members, pages, parents, seen_pages, visited_cats):
    """Depth-first walk over `members`; appends unseen ns == 0 titles to `pages`."""
    for member in members.values():
        title = member.title

        if member.ns == 0:
            # record the title in the shared set as well, so a page reached
            # through several categories is appended only once
            if title not in seen_pages:
                seen_pages.add(title)
                pages.append(title)
            continue

        # non-page members that are not categories cannot be descended into
        if not title.startswith('Category'):
            continue

        # never descend into the same category twice: avoids repeated work
        # and endless recursion when categories reference each other
        if title in visited_cats:
            continue

        # read the members once; empty categories have nothing to add
        sub_members = member.categorymembers
        if not sub_members:
            continue

        # keep track of previous categories - helps with debugging
        visited_cats.add(title)
        parents.append(title)

        # keep digging down
        _collect_member_pages(sub_members, pages, parents, seen_pages, visited_cats)


def catmembers_list(categorymembers, return_list, parent_list):
    """
    Collect the titles of all pages (ns == 0) below a set of category members.

    Parameters
    ----------
    categorymembers : dict
        Mapping whose values expose `.title`, `.ns` and `.categorymembers`
        (as the page objects handed out by the wikipedia API do).
    return_list : list
        Extended in place with page titles. Titles already present are not
        added again.
    parent_list : list
        Extended in place with the title of every sub-category that was
        descended into. Categories already listed here are not descended
        into again.

    Returns
    -------
    list
        The same `return_list` object.
    """
    # Both lookup sets are built once and shared by the whole recursion so
    # they stay in sync with what has been appended so far.
    seen_pages = set(return_list)
    visited_cats = set(parent_list)

    _collect_member_pages(categorymembers, return_list, parent_list,
                          seen_pages, visited_cats)

    return return_list

## Test **Expand out of test category**

In [71]:
# Category names are written without the 'Category:' prefix; format_wiki_cat() adds it.
categories = ['Female musicians by nationality', 'LGBT musicians by nationality']

In [80]:
# Collect the page titles found under the categories above, descending into sub-categories.
catmember_pages = return_cat_pages(categories)

Female musicians by nationality
LGBT musicians by nationality


In [81]:
# Report how many page titles were collected.
f'The Wikipedia API returned {len(catmember_pages)} pages (i.e., category members).'

'The Wikipedia API returned 27283 pages (i.e., category members).'

## Test **How long does the workflow take?**

In [39]:
def time_per(operation, yur_list, digits=2):
    """
    Time a single call of `operation(yur_list)` and report the time per item.

    Parameters
    ----------
    operation : callable
        Called exactly once with `yur_list` as its only argument.
    yur_list : sized collection
        The items handed to `operation`; its length is the divisor.
    digits : int, optional
        Number of decimals the reported time is rounded to (default 2).

    Returns
    -------
    str
        A human-readable summary. For an empty list the total elapsed time
        is reported instead of a per-item time.
    """
    import time

    # Count the items before the call, in case the operation mutates the list.
    item_count = len(yur_list)

    # perf_counter is a monotonic, high-resolution clock meant for intervals
    start = time.perf_counter()
    operation(yur_list)
    elapsed = time.perf_counter() - start

    # No items means there is no per-item time; avoid dividing by zero.
    if item_count == 0:
        return f'You processed 0 items. The operation took {round(elapsed, digits)} s in total.'

    seconds_per_item = elapsed / item_count

    return f'You processed {item_count} items. It takes {round(seconds_per_item, digits)} s to process an item in your list.'

In [40]:
# Average wall-clock time per category for the whole workflow.
# NOTE(review): the recorded output reports 7 items, while `categories` as defined
# in this notebook holds 2 - presumably this cell ran against an earlier
# definition (its In[] number precedes that cell's); re-run to refresh the output.
time_per(return_cat_pages, categories)

'You processed 7 items. It takes 0.38 s to process an item in your list.'