# Text Wrangling and Regex

Adapted from Bella Crouch, Lisa Yan, Will Fithian, Joseph Gonzalez, Deborah Nolan, Sam Lau

Updated by Maya Shen

Working with text: applying string methods and regular expressions

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile
import pandas as pd

## Aside: Vectorization Speed

A slight warning...

Inspired by Itamar Turner-Trauring's blog post: https://pythonspeed.com/articles/pandas-vectorization/

In [None]:
import timeit
import string
import random as py_random
from numpy import *

In [None]:
# Build a synthetic benchmark frame: two numeric columns scaled to the range
# 0-100 and one column of random "sentences".
num_rows  = 10000
df = pd.DataFrame({'x': np.random.rand(num_rows)*100,
                   'y': np.random.rand(num_rows)*100,
                   # Each sentence samples (with replacement) from uppercase letters,
                   # digits, and a run of spaces, so whitespace is common and the
                   # string splits into several "words".
                   # The length comes from NumPy's randint (upper bound exclusive),
                   # called through the explicit `np.random` namespace instead of the
                   # bare name `random`, which only resolves via `from numpy import *`.
                   'sentence': [''.join(py_random.choices(string.ascii_uppercase + string.digits + '                    ',
                                                          k=np.random.randint(100, 250)))
                                for i in range(num_rows)]})
df.head()

### Basic

Let's begin with a basic operation: calculating the ratio of two numerical columns in the dataframe. We begin by just looking at how long it takes with vectorization vs without for one run:

In [None]:
# Time a single run of the vectorized ratio: read the clock immediately before
# and after the column-wise expression, then report the difference.
ratio_t0_vec = timeit.default_timer()
100 * (df["x"] / df["y"])  # result is discarded; only the elapsed time matters
ratio_t1_vec = timeit.default_timer()
print(f"Execution time: {ratio_t1_vec-ratio_t0_vec} seconds")

In [None]:
def calc_ratio(row):
    """Return the "x" entry of `row` expressed as a percentage of its "y" entry.

    `row` only needs to support lookup by the keys "x" and "y".
    """
    numerator, denominator = row["x"], row["y"]
    quotient = numerator / denominator
    return 100 * quotient

# Time a single run of the row-by-row version: with axis=1, `df.apply` passes
# each row of `df` to `calc_ratio` in turn.
ratio_t0_nonvec = timeit.default_timer()
df.apply(calc_ratio, axis=1)  # result is discarded; only the elapsed time matters
ratio_t1_nonvec = timeit.default_timer()
print(f"Execution time: {ratio_t1_nonvec-ratio_t0_nonvec} seconds")

In [None]:
# Speedup = (non-vectorized elapsed time) / (vectorized elapsed time), computed
# from the single-run clock readings recorded by the two timing cells.
single_ratio_speedup = (ratio_t1_nonvec-ratio_t0_nonvec) / (ratio_t1_vec-ratio_t0_vec)
print(f"Vectorized code is ~{round(single_ratio_speedup, 4)}x faster than the non-vectorized code")

Now let's take the average time over 100 runs:

In [None]:
# Number of repetitions handed to timeit.timeit (its `number` argument) in the benchmarks
num_runs = 100

In [None]:
def num_vec_fn():
    """Compute 100 * x / y for the whole frame using column-wise arithmetic on `df`."""
    x_over_y = df["x"] / df["y"]
    return 100 * x_over_y

def num_nonvec_fn():
    """Compute the same ratio by applying `calc_ratio` to `df` one row at a time."""
    per_row_ratio = df.apply(calc_ratio, axis=1)
    return per_row_ratio

In [None]:
# timeit.timeit calls num_vec_fn `num_runs` times and returns the total elapsed seconds
ratio_vec_execution_time = timeit.timeit(num_vec_fn, number=num_runs)
print(f"Total execution time for {num_runs} runs: {ratio_vec_execution_time} seconds")
print(f"Average execution time per run: {ratio_vec_execution_time / num_runs} seconds")


In [None]:
# timeit.timeit calls num_nonvec_fn `num_runs` times and returns the total elapsed seconds
ratio_nonvec_execution_time = timeit.timeit(num_nonvec_fn, number=num_runs)
print(f"Total execution time for {num_runs} runs: {ratio_nonvec_execution_time} seconds")
print(f"Average execution time per run: {ratio_nonvec_execution_time / num_runs} seconds")

In [None]:
# Ratio of the average per-run times (non-vectorized / vectorized). Both sides are
# divided by the same num_runs, so this equals the ratio of the total times.
mult_ratio_speedup = (ratio_nonvec_execution_time / num_runs) / (ratio_vec_execution_time / num_runs)
print(f"Vectorized code is ~{round(mult_ratio_speedup, 4)}x faster than the non-vectorized code")

### Strings
Let's try it for strings / string operations now by getting the "sentence" length. Again, let's start by just looking at how long it takes with vectorization vs without for one run:

In [None]:
# Time a single run of the pandas `.str` version: `.str.split()` splits every
# string on whitespace, then `.apply(len)` counts the pieces in each resulting list.
str_t0_vec = timeit.default_timer()
df["sentence"].str.split().apply(len)  # result is discarded; only the elapsed time matters
str_t1_vec = timeit.default_timer()
print(f"Execution time: {str_t1_vec-str_t0_vec} seconds")

In [None]:
def sentence_length(s):
    """Return the number of whitespace-separated tokens in the string `s`."""
    tokens = s.split()
    return len(tokens)

# Time a single run of the plain-Python version: `sentence_length` is applied
# to each string in the "sentence" column.
str_t0_nonvec = timeit.default_timer()
df["sentence"].apply(sentence_length)  # result is discarded; only the elapsed time matters
str_t1_nonvec = timeit.default_timer()
print(f"Execution time: {str_t1_nonvec-str_t0_nonvec} seconds")

In [None]:
# (non-vectorized elapsed time) / (vectorized elapsed time) for the single string runs;
# a value below 1 means the `.str` version was the slower of the two.
single_str_speedup = (str_t1_nonvec-str_t0_nonvec) / (str_t1_vec-str_t0_vec)
print(f"Vectorized code is ~{round(single_str_speedup, 4)}x faster than the non-vectorized code")

Now let's take the average time over 100 runs:

In [None]:
def str_vec_fn():
    """Count the whitespace-separated tokens in each sentence via the pandas `.str` accessor."""
    tokens_per_row = df["sentence"].str.split()
    return tokens_per_row.apply(len)

def str_nonvec_fn():
    """Count the tokens in each sentence by applying `sentence_length` element-wise."""
    sentences = df["sentence"]
    return sentences.apply(sentence_length)

In [None]:
# timeit.timeit calls str_vec_fn `num_runs` times and returns the total elapsed seconds
str_vec_execution_time = timeit.timeit(str_vec_fn, number=num_runs)
print(f"Total execution time for {num_runs} runs: {str_vec_execution_time} seconds")
print(f"Average execution time per run: {str_vec_execution_time / num_runs} seconds")


In [None]:
# timeit.timeit calls str_nonvec_fn `num_runs` times and returns the total elapsed seconds
str_nonvec_execution_time = timeit.timeit(str_nonvec_fn, number=num_runs)
print(f"Total execution time for {num_runs} runs: {str_nonvec_execution_time} seconds")
print(f"Average execution time per run: {str_nonvec_execution_time / num_runs} seconds")


In [None]:
# Ratio of average per-run times (non-vectorized / vectorized); a value below 1
# means the `.str` version was slower on average.
mult_str_speedup = (str_nonvec_execution_time / num_runs) / (str_vec_execution_time / num_runs)
print(f"Vectorized code is ~{round(mult_str_speedup, 4)}x faster than the non-vectorized code")

In [None]:
# The reciprocal comparison: average vectorized time / average non-vectorized time,
# i.e. how many times faster the plain-Python version ran.
mult_str_slowdown = (str_vec_execution_time / num_runs) / (str_nonvec_execution_time / num_runs)
print(f"Non-vectorized code is ~{round(mult_str_slowdown, 4)}x faster than the vectorized code")

---

## Demo 1: Canonicalizing County Names

In [None]:
# Load the two tables that we will try to join on county name
states = pd.read_csv("data/county_and_state.csv")
populations = pd.read_csv("data/county_and_population.csv")

# display() renders a DataFrame in the middle of a cell, so both tables can be
# shown rather than only the cell's final expression
display(states)
display(populations)

Both of these DataFrames share a "County" column. Unfortunately, formatting differences mean that we can't directly merge the two DataFrames on the "County" column.

In [None]:
# Attempt the join on the raw, un-normalized "County" strings
states.merge(populations, left_on="County", right_on="County")

### Using Pandas String Functions

To address this, we can **canonicalize** the "County" string data to apply a common formatting.

In [None]:
def canonicalize_county(county_series):
    """Return a canonical form of each county name so differently formatted names compare equal.

    Parameters
    ----------
    county_series : pandas.Series of str
        County names in arbitrary formatting.

    Returns
    -------
    pandas.Series
        The names lowercased, with spaces and dots removed, "&" spelled out as
        "and", and the words "county" / "parish" dropped.
    """
    # Every pattern below is a literal substring, not a regular expression.
    # Passing regex=False makes that explicit, so "." is never treated as the
    # regex wildcard regardless of the pandas version's default for `regex`.
    return (county_series.str.lower()                     # lowercase
            .str.replace(' ', '', regex=False)            # remove space
            .str.replace('&', 'and', regex=False)         # replace &
            .str.replace('.', '', regex=False)            # remove dot
            .str.replace('county', '', regex=False)       # remove "county"
            .str.replace('parish', '', regex=False)       # remove "parish"
            )

# Preview the canonicalized names for each table (nothing is stored here)
display(canonicalize_county(states["County"]))
display(canonicalize_county(populations["County"]))


In [None]:
# Store the canonical names as a new "Canonical County" column on each table
# (this modifies both DataFrames in place)
states["Canonical County"] = canonicalize_county(states["County"])
populations["Canonical County"] = canonicalize_county(populations["County"])
display(states)
display(populations)

Now, the merge works as expected!

In [None]:
# Join on the shared canonical key instead of the raw "County" strings
states.merge(populations, on="Canonical County")

<br><br><br>

**Return to Lecture**


---

## Demo 2: Extracting Log Data

In [None]:
log_fname = 'data/log.txt'
# readlines() returns a list with one string per line of the file (line endings included)
with open(log_fname, 'r') as f:
    log_lines = f.readlines()
log_lines

Suppose we want to extract the day, month, year, hour, minutes, seconds, and timezone. Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string. That is, slicing by some fixed offset isn't going to work.

In [None]:
# Slice a fixed character range out of the first log line
log_lines[0][20:31] #  20:31 were determined by trial-and-error!

What happens if we use the same range for the next log line?

In [None]:
# The same fixed character range, applied to the second log line
log_lines[1][20:31]

Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.

In [None]:
# Work with only the first log line in the steps that follow
first = log_lines[0]
first

Find the data inside the square brackets by splitting the string at the square brackets.

In [None]:
pertinent = (
    first.split("[")[1] # take the piece that follows the first "["
    .split(']')[0] # then keep only what comes before the first "]" in that piece
) # result: the text enclosed in the square brackets
pertinent

In [None]:
day, month,rest  = pertinent.split('/')       # split on "/": day, month, and everything after the month

print("Day:   ", day)
print("Month: ", month)
print("Rest:  ", rest)

In [None]:
# `rest` is rebound here: after the split it holds only the final ":"-separated piece
year, hour, minute, rest = rest.split(':')    # split on ":": year, hour, minute, and the remainder

print("Year:   ", year)
print("Hour:   ", hour)
print("Minute: ", minute)
print("Rest:   ", rest)

In [None]:
seconds, time_zone = rest.split(' ')          # split on the single space: seconds before it, time zone after it
# show every extracted field together as one tuple
day, month, year, hour, minute, seconds, time_zone

Try doing the same thing using pandas `str` methods:

Solution below.
<details>
    
```python
df = (
    logs.str.split("[")
        .str[1]
        .str.split("]")
        .str[0]
        .str.split("/", expand=True)
        .rename(columns={0: "Day", 1: "Month", 2: "Rest"})
)
df = (
    df.join(df["Rest"].str.split(":", expand=True))
        .drop(columns=["Rest"])
        .rename(columns={0: "Year", 1: "Hour", 2: "Minute", 3: "Rest"})
)
df = (
    df.join(df["Rest"].str.split(" ", expand=True))
        .drop(columns=["Rest"])
        .rename(columns = {0: "Seconds", 1: "Timezone"})
)

print("Final Dataframe")
display(df)
```

</details>

In [None]:
# Read the log file with pandas and keep column 0 as a Series.
# header=None: no row is used as a header, so columns are numbered from 0.
# NOTE(review): presumably no log line contains a tab, so sep="\t" leaves each
# line whole in column 0 — confirm against the data file.
logs = pd.read_csv("data/log.txt", 
                sep="\t", 
                header=None)[0]

print("Original input!")
display(logs)

# finish me: extract the date/time fields from `logs` using pandas `.str` methods


This worked and you will often see code like this in data cleaning pipelines.  

However, **regular expressions** provide a faster and more expressive mechanism to extract strings that match certain patterns. 

<br> <br>

**Return to lecture**

<br><br>


---

# Regular Expressions


## String Extraction with Regex

Python `re.findall` returns a list of all extracted matches:

In [None]:
import re

text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789.";

# three digits, a dash, two digits, a dash, then four digits
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"

# list of every non-overlapping substring of `text` that matches `pattern`
re.findall(pattern, text)

<br/>

Now, let's see vectorized extraction in `pandas`:

 `.str.findall` returns a `Series` of lists of all matches in each record.

In [None]:
# Example records for the "SSN" column: one SSN, no SSN, two SSNs, one SSN.
df_ssn = pd.DataFrame({
    'SSN': [
        '987-65-4321',
        'forty',
        '123-45-6789 bro or 321-45-6789',
        '999-99-9999',
    ],
})
df_ssn

In [None]:
# -> Series of lists: one list of matches per record
# three digits, a dash, two digits, a dash, then four digits
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn['SSN'].str.findall(pattern)

Extracting the last expression

In [None]:
# Find all matches in each record, then keep only the final one
(
    df_ssn['SSN']
    .str.findall(pattern)
    .str[-1] # Get the last element from each list
)

<br><br><br>

**Return to slides**



---

<br> <br>


## Extraction Using Regex Capture Groups

The Python function `re.findall`, in combination with parentheses, returns specific substrings (i.e., **capture groups**) within each matched string, or **match**.

In [None]:
text = """I will meet you at 08:30:00 pm tomorrow"""       
# Raw string (r"...") so that \d reaches the regex engine as written instead of
# being processed as a (invalid) Python string escape sequence.
# Three parenthesized groups capture the two-digit hour, minute, and second.
pattern = r".*(\d\d):(\d\d):(\d\d).*"
# With capture groups in the pattern, findall returns one tuple of groups per match
matches = re.findall(pattern, text)
matches

In [None]:
# matches[0] is the tuple of the three capture groups from the first matched string;
# unpack it into separate names
hour, minute, second = matches[0]
print("Hour:   ", hour)
print("Minute: ", minute)
print("Second: ", second)

<br/>

In `pandas`, we can use `.str.extract` to extract each capture group of **only the first match** of each record into separate columns.

In [None]:
# back to SSNs: redisplay the example frame before extracting from it
df_ssn

In [None]:
# Will extract the first match of all groups:
# one output column per capture group, one row per record
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['SSN'].str.extract(pattern_group_mult)

When debugging my code with the `str` accessors, I often make a separate Series variable so the Python tab-completion tools can find the documentation.

In [None]:
# Bind the column to its own name first so tab completion / help works on `ssns.str`
ssns = df_ssn['SSN']
ssns.str.extract(pattern_group_mult) # <- try shift+tab inside the parens

Alternatively, `.str.extractall` extracts **all matches** of each record into separate columns. Rows are then MultiIndexed by original record index and match index.

In [None]:
# -> DataFrame, one row per match (not per record), one column per capture group
df_ssn['SSN'].str.extractall(pattern_group_mult)

<br><br>

**Return to Slides**

<br><br>


---

## Canonicalization with Regex (sub, replace)

In regular Python, canonicalize with `re.sub` (standing for "substitute"):

In [None]:
text = '<div><td valign="top">Moo</td></div>'
# "<", then one or more characters that are not ">", then ">" — i.e. a single tag
pattern = r"<[^>]+>"
# replace every match with the empty string, which strips the tags from `text`
re.sub(pattern, '', text)

<br/>

In `pandas`, canonicalize with `Series.str.replace`.

In [None]:
# Example frame: a single "Html" column holding three tagged strings
df_html = pd.DataFrame({
    'Html': [
        '<div><td valign="top">Moo</td></div>',
        '<a href="http://ds100.org">Link</a>',
        '<b>Bold text</b>',
    ],
})
df_html

In [None]:
# Series -> Series
# `pattern` is the tag regex assigned in the re.sub example; regex=True tells pandas
# to treat it as a regular expression. .to_frame() turns the result into a DataFrame.
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()

<br><br>

**Return to lecture**

<br><br>

# Bonus material


---


# Revisiting Text Log Processing using Regex

### Python `re` version

In [None]:
line = log_lines[0]
display(line)

# Seven capture groups inside a literal "[ ... ]":
# day / month / year : hour : minute : second, a space, then the time zone
# (the last group takes everything up to a closing "]")
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone

### `pandas` version

In [None]:
# One row per raw log line, in a single "Log" column.
# NOTE(review): this rebinds `df`, which the vectorization benchmark cells use for
# their x / y / sentence frame; re-running those cells after this one will not
# see that frame.
df = pd.DataFrame(log_lines, columns=['Log'])
df

Option 1: `Series.str.findall`

In [None]:
# Seven capture groups: day, month, year, hour, minute, second, time zone
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
# -> Series of lists; each list holds one tuple of groups per match in that log line
df['Log'].str.findall(pattern)

<br/>

Option 2: `Series.str.extractall`

In [None]:
# -> DataFrame with one row per match and one column per capture group
df['Log'].str.extractall(pattern)

Wrangling either of these two results (a Series of lists from Option 1, a DataFrame from Option 2) into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.


||Day|Month|Year|Hour|Minute|Second|Time Zone|
|---|---|---|---|---|---|---|---|
|0|26|Jan|2014|10|47|58|-0800|
|1|2|Feb|2005|17|23|6|-0800|
|2|3|Feb|2006|10|18|37|-0800|


In [None]:
# your code here


<br/><br/>
<br/>

---

# Real World Case Study: Restaurant Data

In this example, we will show how regexes can allow us to track quantitative data across categories defined by the appearance of various text fields.

In this example we'll see how the presence of certain keywords can affect quantitative data:

> **How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?** 
> <br/>
> (e.g., unclean surfaces, vermin, permits, etc.)

In [None]:
# header=0 treats the file's first row as the header; `names` then replaces
# those header labels with our own column names
vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']  # the violation description text
vio.head()

In [None]:
# value_counts() has one entry per distinct description, so its shape tells us
# how many distinct descriptions there are
counts = desc.value_counts()
counts.shape

That's a lot of different descriptions!! Can we **canonicalize** at all? Let's explore two sets of 10 rows.

In [None]:
# first 10 entries; value_counts() sorts by descending count, so these are the most frequent
counts[:10]

In [None]:
# Hmmm... entries 50 through 59 of the same ranking
counts[50:60]

In [None]:
# Use regular expressions to cut out the extra info in square braces.
# The pattern matches optional whitespace followed by a "[ ... ]" group that
# runs to the end of the string. The result is stored as a new column on `vio`.
vio['clean_desc'] = (vio['desc']
             .str.replace(r'\s*\[.*\]$', '', regex=True)
             .str.strip()       # removes leading/trailing whitespace
             .str.lower())      # lowercase, so later keyword checks can use lowercase patterns
vio.head()

In [None]:
# canonicalizing definitely helped: number of distinct cleaned descriptions
vio['clean_desc'].value_counts().shape

In [None]:
# the most frequent cleaned descriptions
vio['clean_desc'].value_counts().head() 

Remember our research question:

> **How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?** 
> <br/>
> (e.g., unclean surfaces, vermin, permits, etc.)

<br/>

Below, we use regular expressions and `df.assign()` ([documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.assign.html?highlight=assign#pandas.DataFrame.assign)) to **method chain** our creation of new boolean features, one per keyword.

In [None]:
# Add one boolean column per keyword family, marking whether the cleaned
# description contains any of that family's keywords.
# Each pattern is a regular expression; the metacharacter | separates alternatives.
with_features = vio.assign(
    is_unclean=vio['clean_desc'].str.contains('clean|sanit'),
    is_high_risk=vio['clean_desc'].str.contains('high risk'),
    is_vermin=vio['clean_desc'].str.contains('vermin'),
    is_surface=vio['clean_desc'].str.contains('wall|ceiling|floor|surface'),
    is_human=vio['clean_desc'].str.contains('hand|glove|hair|nail'),
    is_permit=vio['clean_desc'].str.contains('permit|certif'),
)
with_features.head()

---

### EDA

That's the end of our text wrangling. Now let's do some more analysis to analyze restaurant health as a function of the number of violation keywords.

To do so, we'll first group so that our **granularity** is one inspection for a business on a particular date. This effectively counts the number of violations by keyword for a given inspection.

In [None]:
# Collapse to one row per (bid, date) pair. Summing a boolean flag counts its
# True values, giving the number of violations per keyword for that inspection.
count_features = (with_features
 .groupby(['bid', 'date'])
 .sum(numeric_only=True)   # only numeric columns are summed; text columns are left out
 .reset_index()            # turn the group keys back into ordinary columns
)
count_features.iloc[255:260, :]  # peek at five rows from the middle of the table

Check out our new dataframe in action:

In [None]:
# inspections with more than one vermin-flagged violation
count_features[count_features['is_vermin'] > 1].head(5)

Now we'll reshape this "wide" table into a "tidy" table using a pandas feature called `pd.melt` ([documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.melt.html?highlight=pd%20melt)) which we won't describe in any detail, other than that it's effectively the inverse of `pd.pivot_table`.

Our **granularity** is now a violation type for a given inspection (for a business on a particular date).

In [None]:
# 'bid' and 'date' stay as identifier columns; every other column is unpivoted
# into rows, with its name in 'feature' and its value in 'num_vios'
violation_type_df = pd.melt(count_features, id_vars=['bid', 'date'],
            var_name='feature', value_name='num_vios')

# show a particular inspection's results
# NOTE(review): the date is compared to the integer 20150728 — presumably dates are
# stored as YYYYMMDD integers in the CSV; confirm the column's dtype.
violation_type_df[(violation_type_df['bid'] == 489) & (violation_type_df['date'] == 20150728)]

Remember our research question:

> **How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?** 
> <br/>
> (e.g., unclean surfaces, vermin, permits, etc.)

<br/>

We have the second half of this question! Now let's **join** our table with the inspection scores, located in `inspections.csv`.

In [None]:
# read in the scores
# usecols keeps only the file's first three columns; header=0 together with
# `names` replaces their original header labels with bid / score / date
inspection_df = pd.read_csv('data/inspections.csv',
                  header=0,
                  usecols=[0, 1, 2],
                  names=['bid', 'score', 'date'])
inspection_df.head()

While the inspection scores were stored in a separate file from the violation descriptions, we notice that the **primary key** in inspections is (`bid`, `date`)! So we can reference this key in our join.

In [None]:
# join scores with the table broken down by violation type
# (matches rows on the (bid, date) pair; merge defaults to an inner join)
violation_type_and_scores = (
    violation_type_df
    .merge(inspection_df, on=['bid', 'date'])
)
violation_type_and_scores.head(12)

<br/><br/>

---

Let's plot the distribution of scores, broken down by violation counts, for each inspection feature (`is_unclean`, `is_high_risk`, `is_vermin`, `is_surface`, `is_human`, `is_permit`).

In [None]:
# you will learn this syntax next week. Focus on interpreting for now.
# One box-plot panel per feature (two panels per row), showing the distribution
# of scores at each violation count. The trailing ";" hides the text repr of the
# returned object.
sns.catplot(x='num_vios', y='score',
               col='feature', col_wrap=2,
               kind='box',
               data=violation_type_and_scores);

Above we can observe:
* The inspection score generally goes down with increasing numbers of violations, as expected.
* Depending on the violation keyword, inspection scores on average go down at slightly different rates.
* For example, if a restaurant inspection involved 2 violations with the keyword "vermin", the average score for that inspection would be a little bit below 80.