# 01 Obtaining Data
In this section the data will get fetched from the Wikipedia API. The data should consist of Indian Movie Titles and their plots as well as American Movie Titles and their plots. 

In [1]:
import pandas as pd
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('en')

## Functions

In [2]:
# getting all pages from category
def get_PagesInCategory(categorymembers, level=0, max_level=1):
    pages = []
    for c in categorymembers.values():

        if not (
                c.title.__contains__("Category:") 
                or c.title.__contains__("List of") 
                or c.title.__contains__("Lists of")
            ):

            pages.append(c)

        if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            pages += get_PagesInCategory(c.categorymembers, level=level + 1, max_level=max_level)
    
    return pages

In [3]:
# takes long since entire page text gets loaded for each page
def get_PlotsFromPages(pages, max_plots, max_length, min_length=0):
    plots = []
    for page in pages:
        sections = page.sections
        if(len(sections) > 0 ):
            
            plot_length = len(sections[0].text)
            if(sections[0].title == "Plot" and plot_length > min_length and plot_length <= max_length):
                plots.append({"title":page.title, "plot":sections[0].text})

                # Max plots
                if len(plots) >= max_plots:
                    break
    
    return plots

## Indian Film Plots

In [4]:
ind_cat = wiki_wiki.page("Category:Indian_films_by_genre")

# Can lead to an error when internet connection gets interrupted
try:
    ind_pages = get_PagesInCategory(ind_cat.categorymembers, level=0, max_level=1)
    print(len(ind_pages))
except:
    print("An error occured. Try again.")

7662


In [5]:
try:
    ind_plots = get_PlotsFromPages(ind_pages, max_plots=2000, max_length=6000, min_length=200)
    len(ind_plots)
except:
    print("An error occured. Try again.")

An error occured. Try again.


In [6]:
ind_df = pd.DataFrame(ind_plots)
ind_df.head()

NameError: name 'ind_plots' is not defined

In [None]:
ind_df.to_csv('../data/indian_plots.csv')

## American Film Plots

In [None]:
amer_cat = wiki_wiki.page("Category:American_films_by_genre")

try:
    amer_pages = get_PagesInCategory(amer_cat.categorymembers, level=0, max_level=1)
    print(len(amer_pages))
except:
    print("An error occured. Try again.")

In [None]:
try:
    amer_plots = get_PlotsFromPages(amer_pages, max_plots=2000, max_length=6000, min_length=200)
    len(amer_plots)
except:
    print("An error occured. Try again.")

In [None]:
amer_df = pd.DataFrame(amer_plots)
amer_df.head()

In [None]:
amer_df.to_csv('../data/american_plots.csv')