In [None]:
# 8.3.10 Parse the Box Office Data

# Remember, there are two main forms the box office data is written in: 
# "$123.4 million" (or billion), and "$123,456,789." 

# We're going to build a regular expression for each form, and then see what forms are left over.



In [1]:
# Create the First Form

# PLAN

# For the first form, our pattern match string will include six elements in the following order:

# 1. A dollar sign
# 2. An arbitrary (but non-zero) number of digits
# 3. An optional decimal point
# 4. An arbitrary (but possibly zero) number of more digits
# 5. A space (maybe more than one)
# 6. The word "million" or "billion"

# We'll translate those rules into a regular expression, step by step.


In [None]:
# Step 1: Start with a dollar sign.

# The dollar sign is a special character in regular expressions, so we'll need to escape it.

# \$

In [None]:
# Step 2: Add an arbitrary (but non-zero) number of digits.

# We'll add the \d character to specify digits only, and the + modifier to capture one or more digits. 
# Our regular expression string now appears as:

# "\$\d+".


In [3]:
# Step 3: Add an optional decimal point.

# Remember, the decimal point is a special character, so it needs to be escaped with a backslash. 
# Since the decimal point is optional, add a question mark modifier after it. 
# Our regular expression string now appears as:

# "\$\d+\.?".


In [None]:
# Step 4: Add an arbitrary (but possibly zero) number of more digits.

# Once again, we'll use the \d character to specify digits only, but now with the * modifier because there may be no 
# more digits after the decimal point. 

# Our regular expression string now appears as: 

# "\$\d+\.?\d*".

In [None]:
# Step 5: Add a space (maybe more than one).

# Now we're going to use the \s character to match whitespace characters. To be safe, we'll match any number of 
# whitespace characters (including none at all) with the * modifier. 

# Our regular expression string now appears as: 

# "\$\d+\.?\d*\s*".

In [None]:
# Step 6: Add the word "million" or "billion."

# Since "million" and "billion" only differ by one letter, we can match it with a character set for the first letter. 

# We specify character sets with square brackets, so we'll add "[mb]illion" to the end of our string. 
# Our finished regular expression string now appears as:

# "\$\d+\.?\d*\s*[mb]illion".


In [None]:
# Create a variable form_one and set it equal to the finished regular expression string. 
# Because we need the backslash escapes to remain intact, we need to prefix the string with an r, making it a raw string.

# form_one = r'\$\d+\.?\d*\s*[mb]illion'

In [None]:
# Before moving on, a note about regex playgrounds. 
# Sites like https://regex101.com let you test your regular expressions against sample text. 

# Try playing around with some of the examples that you have gone over. 
# You will find it helpful to use such a tool when writing regular expressions.


In [None]:
# You might be wondering if we're going to miss any box office values that have uppercase letters. 
# Don't worry—when we use the contains() method, we will specify an option to ignore case.



In [None]:
# INSPECT

# Now, let's count how many box office values match our first form. We'll use the str.contains() method on box_office.

# To ignore whether letters are uppercase or lowercase, add an argument called flags, and set it equal to 
# re.IGNORECASE. 

# Finally, we can call the sum() method to count up the total number that return True. 

# Your code should look like the following:

# box_office.str.contains(form_one, flags=re.IGNORECASE).sum()

In [None]:
# Create the Second Form

# PLAN

# Next, we'll match the numbers of our second form, "$123,456,789." 
# In words, our pattern match string will include the following elements:

# 1. A dollar sign
# 2. A group of one to three digits
# 3. At least one group starting with a comma and followed by exactly three digits


In [None]:
# Step 1: Start with a dollar sign.

# Once again, we need to escape the dollar sign for it to match. 

# Our regular expression string starts like this: 

# "\$".

In [None]:
# Step 2: Add a group of one to three digits.

# We'll use the \d character for digits, but this time, we'll modify it with curly brackets to only match one through 
# three repetitions. 

# Our regular expression string now appears as:

# "\$\d{1,3}".


In [None]:
# Step 3: Match at least one group starting with a comma and followed by exactly three digits.

# To match a comma and exactly three digits, we'll use the string ",\d{3}". 
# To match one or more repetitions of that group, we'll put it inside parentheses, and then put a plus sign after the 
# parentheses: "(,\d{3})+". 

# We'll add one more modification to specify that this is a non-capturing group by inserting a question mark and 
# colon after the opening parenthesis: "(?:,\d{3})+". 

# The use of a non-capturing group isn't strictly necessary here, but it eliminates an unwanted warning message: when a 
# pattern contains a capturing group, the pandas str.contains() method warns that the pattern has match groups. 
# Our finished regular expression string now appears as:

# "\$\d{1,3}(?:,\d{3})+".



In [None]:
# Create another variable form_two and set it equal to the finished regular expression string. 

# Don't forget to make it a raw string so Python keeps the escaped characters.

# Then count up the number of box office values that match this pattern. 

# Don't forget to put an r before the string and set the flags option to include re.IGNORECASE.

In [None]:
# form_two = r'\$\d{1,3}(?:,\d{3})+'
# box_office.str.contains(form_two, flags=re.IGNORECASE).sum()
