# Building Blocks You Can Use In Your Code

## Setup Stuff

In [None]:
## Boring setup stuff
import math
import numpy as np
import pandas as pd
import re # regular expressions for pattern matching
import requests
from nltk import word_tokenize

# Make plots inline
%matplotlib inline

# Make inline plots vector graphics instead of raster graphics
# NOTE(review): newer IPython releases reportedly deprecate importing this
# helper from IPython.display in favor of matplotlib_inline.backend_inline --
# confirm against the installed version before changing the import.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')  # request the 'pdf' and 'svg' figure formats

## Loading the Data

In [None]:
def load_pirates_movie_script(
    url="https://gist.githubusercontent.com/briandk/61d60180accecff51807ffddf0c5f18e/raw/00cfc60c64e75dbd730635850dc393d6c0196243/pirates.txt",
    timeout=30,
):
    """
    Downloads the movie script and returns it as a DataFrame.

    Keyword Arguments:
    url -- address of the plain-text script (defaults to the original gist)
    timeout -- seconds to wait on the server before giving up

    Output:
    A DataFrame with a single column, "line_of_script", where each row is
    a line from the script.

    Raises:
    requests.HTTPError if the server answers with an error status.
    requests.Timeout if the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses; otherwise the text of the error page
    # would silently be loaded as if it were the script.
    response.raise_for_status()
    # Alternative data source in nltk: webtext.raw('pirates.txt').splitlines()
    return pd.DataFrame(
        data=response.text.splitlines(),
        columns=["line_of_script"])

# Download the script once and keep it in a module-level DataFrame; the cells
# below read this name (and the "Example usage" cell reassigns it).
pirates_of_the_caribbean = load_pirates_movie_script()
# Preview the first few rows of the freshly loaded data.
pirates_of_the_caribbean.head()

## Printing out the values of a column

In [None]:
# Print every value in the column, one per line.
# A plain loop is used for the side effect: .apply(print) would also build
# (and, as the last expression in a cell, display) a Series full of None.
for line in pirates_of_the_caribbean["line_of_script"]:
    print(line)

## Regular Expression Patterns for Dialogue and Scenes

In [None]:
# Visualize this regular expression at https://www.debuggex.com/r/iLYFPobXckKrlx4l
#
# Verbose-mode ((?x)) pattern for a line of dialogue shaped like "NAME: words".
# It exposes two named groups: `character_name` and `what_they_say`.
# What the pattern literally accepts:
#   - the name's character class is [A-Z,\s], so commas are accepted in
#     addition to capital letters and whitespace;
#   - there is no leading ^, so the name is only anchored to the start of the
#     line when the pattern is used with re.match (not re.search);
#   - whitespace matched by the class (e.g. just before the colon) is kept as
#     part of `character_name`; whitespace right after the colon is consumed
#     by \s* and is not part of `what_they_say`.
dialogue_pattern = re.compile(r'''(?x)
          (?P<character_name>  # Create a capturing group called `character_name`
              [A-Z,\s]+)       #   Where movie character names contain either ALL CAPS letters or spaces
          :                    # followed by a colon,
          \s*                  # followed by some whitespace,
          (?P<what_they_say>   # Then create a capturing group for what the movie character actually says
              .*               #   Where what they actually say can be any characters 0 or more,
              $)               #   followed by the end of the line
    '''
)

# Visualize this regular expression at https://www.debuggex.com/r/7uNzZYOnuahhyBQt
#
# Verbose-mode ((?x)) pattern for scene headings. Whitespace inside the
# pattern is ignored, so it matches a line that starts with the literal text
# "Scene" immediately followed by a colon, then anything up to the end of the
# line. No case-insensitive flag is set, so the capital "S" is required.
scene_pattern = re.compile(r'''(?x)
    ^Scene # The line begins with scene,
         : # Followed by a colon,
        .* # Followed by any characters,
         $ # Until the end of the line.
    '''
)


## Functions that extract specific parts of text using patterns

In [None]:
def extract_character_name(dialogue_element):
    """
    Look up who is speaking in a dialogue_element dictionary.

    Keyword Arguments:
    dialogue_element -- a dictionary with a "character_name" key, or None

    Output:
    The value stored under "character_name", or None when there is no
    dialogue_element to read from.
    """
    # Guard clause: lines that were not dialogue have no element at all.
    if dialogue_element is None:
        return None
    return dialogue_element["character_name"]

def extract_what_character_says(dialogue_element):
    """
    Look up the spoken text in a dialogue_element dictionary.

    Keyword Arguments:
    dialogue_element -- a dictionary with a "what_they_say" key, or None

    Output:
    The value stored under "what_they_say", or None when there is no
    dialogue_element to read from.
    """
    # Guard clause: lines that were not dialogue have no element at all.
    if dialogue_element is None:
        return None
    return dialogue_element["what_they_say"]

def create_dialogue_element(line, pattern):
    """
    Tries to recognize a line of the script as dialogue.

    Keyword Arguments:
    line -- a line from the script
    pattern -- a regular expression pattern that recognizes dialogue
               (a compiled pattern or a pattern string; it should define
               named groups, since those become the dictionary keys)

    Output:
    If the line matches the pattern, a Python dictionary mapping each named
    group in the pattern to the text it captured.
    Otherwise, None
    """
    # Use the pattern the caller passed in, not a module-level one, so the
    # function works with any dialogue pattern. re.match anchors the match
    # at the start of the line.
    match = re.match(pattern, line)
    if match is None:
        return None
    return match.groupdict()

## Add specific columns to data and return a copy of the data

In [None]:
def add_character_and_speech_columns(pirates_dataframe):
    """
    Takes a DataFrame that *must* include a column called "dialogue_element".
    Tries to apply extraction functions on that column.
    Returns a copy of the data that includes new columns.

    Keyword Arguments:
    pirates_dataframe -- the DataFrame to read "dialogue_element" from

    Output:
    A copy of pirates_dataframe with two extra columns:
    "character" (from extract_character_name) and
    "what_they_say" (from extract_what_character_says).
    The DataFrame that was passed in is not modified.
    """
    # Work on the DataFrame that was passed in rather than a module-level
    # variable, so the function behaves the same for any input.
    dialogue_elements = pirates_dataframe["dialogue_element"]
    return pirates_dataframe.assign(
        character=dialogue_elements.apply(extract_character_name),
        what_they_say=dialogue_elements.apply(extract_what_character_says),
    )

def add_scene_numbers_to_data(pattern, pirates_data):
    """
    Labels every line of the script with a running scene number.

    Keyword Arguments:
    pattern -- a regular expression pattern that recognizes a scene heading
    pirates_data -- a DataFrame with a "line_of_script" column

    Output:
    A copy of pirates_data with a new "scene" column. The counter starts at
    0 and goes up by one on every line that matches the pattern, so the
    matching line itself already carries the new number.
    """
    current_scene = 0
    scene_per_line = []
    for script_line in pirates_data["line_of_script"]:
        starts_new_scene = re.match(pattern, script_line) is not None
        if starts_new_scene:
            current_scene += 1
        scene_per_line.append(current_scene)
    return pirates_data.assign(scene=scene_per_line)

def add_gender_columns_to_data(pirates_data):
    """
    Takes a DataFrame that should have columns for "character" and "what_they_say".
    Tries to apply gender-based functions.
    Returns a copy of the data with new columns of gender information.

    Keyword Arguments:
    pirates_data -- the DataFrame to read "character" and "what_they_say" from

    Output:
    A copy of pirates_data with two extra columns:
    "gender_of_character" (from infer_gender_of_character) and
    "contains_male_pronouns" (from detect_male_pronouns).
    The DataFrame that was passed in is not modified.
    """
    # Work on the DataFrame that was passed in rather than a module-level
    # variable, so the function behaves the same for any input.
    return pirates_data.assign(
        gender_of_character=pirates_data["character"].apply(infer_gender_of_character),
        contains_male_pronouns=pirates_data["what_they_say"].apply(detect_male_pronouns),
    )


# Example usage
# Build the "dialogue_element" column first: add_character_and_speech_columns
# requires it, and no other cell in this notebook creates it. Each element is
# the dictionary returned by create_dialogue_element, or None for lines that
# are not dialogue.
pirates_of_the_caribbean = pirates_of_the_caribbean.assign(
    dialogue_element=pirates_of_the_caribbean["line_of_script"].apply(
        lambda line: create_dialogue_element(line, dialogue_pattern)
    )
)
pirates_of_the_caribbean = add_character_and_speech_columns(pirates_of_the_caribbean)

## Create filters to use on data

In [None]:
# Mask that is True for rows whose "dialogue_element" is not null.
# The "dialogue_element" column must already exist on the DataFrame.
dialogue_filter = pd.notnull(pirates_of_the_caribbean["dialogue_element"]) # True/False array

def create_scene_filter(pattern, pirates_data):
    """
    Builds a True/False mask marking the lines that match a pattern.

    Keyword Arguments:
    pattern -- a regular expression pattern that recognizes a scene heading
    pirates_data -- a DataFrame with a "line_of_script" column

    Output:
    A True/False column with one entry per row of pirates_data: True where
    the line matches the pattern (from the start of the line), False elsewhere.
    """
    def match_scene(script_line):
        # A match object when the line matches, None when it does not.
        return re.match(pattern, script_line)

    match_per_line = pirates_data["line_of_script"].apply(match_scene)
    return pd.notnull(match_per_line)

# Build the mask of scene-heading rows, then use it to select those rows.
scene_filter = create_scene_filter(scene_pattern, pirates_of_the_caribbean)
pirates_of_the_caribbean[scene_filter]  # only the rows whose line matches scene_pattern
pirates_of_the_caribbean[scene_filter].count()  # per-column count of non-null values among those rows

## Print Potentially Useful Information

In [None]:
def print_all_characters_with_speaking_parts(pirates_data):
    """
    Prints each distinct value of the "character" column on its own line.

    Keyword Arguments:
    pirates_data -- a DataFrame with a "character" column

    Output:
    Nothing is returned; the names are only printed.
    """
    for speaker in pirates_data["character"].unique():
        print(speaker)

# Example Usage
# The DataFrame must already have a "character" column at this point.
print_all_characters_with_speaking_parts(pirates_of_the_caribbean)

## Sort data

In [None]:
def sort_characters_by_lines_of_dialogue_per_character(pirates_data):
    """
    Counts lines of dialogue per character, busiest character first.

    Keyword Arguments:
    pirates_data -- a DataFrame with "character" and "what_they_say" columns

    Output:
    A new DataFrame with one row per character (the character is the index).
    Every remaining column holds the count of non-null values for that
    character, and rows are sorted by the "what_they_say" count, largest first.
    """
    # Group the DataFrame that was passed in rather than a module-level
    # variable, so the function behaves the same for any input.
    return (
        pirates_data
            .groupby("character")
            .count()
            .sort_values(by=['what_they_say'], ascending=False)
    )

# Example usage: as the last expression in the cell, the sorted table is displayed.
sort_characters_by_lines_of_dialogue_per_character(pirates_of_the_caribbean)

## Plot Data

In [None]:
# Per-character counts, sorted by the "what_they_say" count in descending order.
lines_per_character = sort_characters_by_lines_of_dialogue_per_character(pirates_of_the_caribbean)
# Bar chart of the "what_they_say" counts, one bar per character.
lines_per_character["what_they_say"].plot(kind='bar')

## Functions that try to infer from gendered language

In [None]:
def detect_male_pronouns(line_of_dialogue):
    """
    Checks a line of dialogue for the male pronouns "he" and "him".

    Keyword Arguments:
    line_of_dialogue -- the text a character says, or a null value

    Output:
    True if the lowercased, tokenized line contains "he" or "him" as a
    whole token; False if it doesn't, or if the line is null.
    """
    # Null lines (rows that are not dialogue) cannot contain a pronoun.
    if not pd.notnull(line_of_dialogue):
        return False
    # Compare whole tokens so that words like "the" or "them" do not count.
    tokens = word_tokenize(line_of_dialogue.lower())
    return any(pronoun in tokens for pronoun in ("he", "him"))

def infer_gender_of_character(character_name):
    """
    Looks up a character's gender by name in a small built-in gender map.

    Keyword Arguments:
    character_name -- the character's name as it appears in the data

    Output:
    The gender recorded in the map for that name, or "unknown" when the
    name is not in the map.

    To cover more characters, add entries to the map in the form
    "(name_of_character)": "(gender)"
    """
    known_genders = {
        "ELIZABETH SWANN": "female",
        "LORD CUTLER BECKETT": "male"
    }
    # dict.get supplies the fallback for any name that is not in the map.
    return known_genders.get(character_name, "unknown")

## Create a complex query

In [None]:
# Lines spoken by characters mapped to 'female' that contain no male pronouns.
# Steps: add the gender columns, filter rows with the query string, then keep
# only the three columns of interest. As the last expression in the cell, the
# resulting table is displayed.
(
    add_gender_columns_to_data(pirates_of_the_caribbean)
        .query("gender_of_character == 'female' and contains_male_pronouns == False")
        [["character", "gender_of_character", "what_they_say"]]
)