# Text Wrangling and Regex
Working with text: applying string methods and regular expressions

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile
import pandas as pd

## Demo 1: Canonicalizing County Names

In [2]:
# read both states and population csv files
states = pd.read_csv('county_and_state.csv')
populations = pd.read_csv('county_and_population.csv')


# display allows us to view a DataFrame without returning it as an object
display(states)

display(populations)

Unnamed: 0,County,State
0,De Witt County,IL
1,Lac qui Parle County,MN
2,Lewis and Clark County,MT
3,St John the Baptist Parish,LS


Unnamed: 0,County,Population
0,DeWitt,16798
1,Lac Qui Parle,8067
2,Lewis & Clark,55716
3,St. John the Baptist,43044


Both of these DataFrames share a "County" column. Unfortunately, formatting differences mean that we can't directly merge the two DataFrames using the "County"s.

In [3]:
# apply merage
states.merge(populations, left_on= 'County', right_on= 'County')


Unnamed: 0,County,State,Population


In [4]:
states.head()

Unnamed: 0,County,State
0,De Witt County,IL
1,Lac qui Parle County,MN
2,Lewis and Clark County,MT
3,St John the Baptist Parish,LS


To address this, we can **canonicalize** the "County" string data to apply a common formatting.

In [5]:
# define some str operation need to perform on County col in each dataframe

def str_oper(county_series):
    return (county_series.str.lower().str.replace(' ', '').str.replace('&', 'and').str.replace('.', '') 
            .str.replace('county', '') .str.replace('parish', ''))

In [6]:
# apply that customized function
states["County"] = str_oper(states["County"])
populations["County"] = str_oper(populations["County"])

In [7]:
display(states)

display(populations)


Unnamed: 0,County,State
0,dewitt,IL
1,lacquiparle,MN
2,lewisandclark,MT
3,stjohnthebaptist,LS


Unnamed: 0,County,Population
0,dewitt,16798
1,lacquiparle,8067
2,lewisandclark,55716
3,stjohnthebaptist,43044


Now, the merge works as expected!

In [8]:
# merage both frames
states.merge(populations, left_on="County", right_on="County")

Unnamed: 0,County,State,Population
0,dewitt,IL,16798
1,lacquiparle,MN,8067
2,lewisandclark,MT,55716
3,stjohnthebaptist,LS,43044


## Demo 2: Extracting Log Data

In [9]:
# Load log.txt
with open('log.txt', 'r') as fil:
    file = fil.readlines()

In [10]:
file

['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
 '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
 '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']

Suppose we want to extract the day, month, year, hour, minutes, seconds, and timezone. Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string. That is, slicing by some fixed offset isn't going to work.

In [11]:
# find and print DD/MM/YY from first line, second line

file[0][20:31]

# Too much frustrated


'26/Jan/2014'

In [12]:
file[1][20:31]

'/Feb/2005:1'

Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.

In [13]:
# read 1st line only
line_1 = file[0]
line_1

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

In [14]:
square_brack = line_1.split("[")[1].split(']')[0]                                             # find the text enclosed in square brackets
day, month, rest = square_brack.split('/') 

year, hour, minute, rest = rest.split(':') 

year, hour, minute, rest

('2014', '10', '47', '58 -0800')

In [15]:
# apply string operations and print day, month, year, and time

square_brack = line_1.split("[")[1].split(']')[0]   # find the text enclosed in square brackets
day, month, rest = square_brack.split('/')          # split up the date/month/year
hours,minute,second,rest = rest.split(':')          # split up the hour:minute:second
second, time_zon = rest.split(' ')                 # split the timezone after the blank space
day, month, year, hour, minute, second, time_zon


('26', 'Jan', '2014', '10', '10', '58', '-0800')

This worked, but felt fairly "hacky" – the code above isn't particularly elegant. A much more sophisticated but common approach is to extract the information we need using a *regular expression*.


# Regular Expressions


## String Extraction with Regex

Python `re.findall` returns a list of all extracted matches:

In [33]:
# find out all social sequrity numbers from text below
import re
text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789.";
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
re.findall(pattern,text)


['123-45-6789', '321-45-6789']

<br/>

Now, let's see vectorized extraction in `pandas`:

 `.str.findall` returns a `Series` of lists of all matches in each record.

In [17]:
# convert the ['987-65-4321', 'forty', '123-45-6789 bro or 321-45-6789', '999-99-9999']
# into data frame and extract all Social sequrity numbers
reg_pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn = pd.DataFrame(['987-65-4321', 'forty', '123-45-6789 bro or 321-45-6789', '999-99-9999'],columns=
                      ["Social Security No."])#)

df_ssn["Social Security No."].str.findall(reg_pattern)

df_ssn

Unnamed: 0,Social Security No.
0,987-65-4321
1,forty
2,123-45-6789 bro or 321-45-6789
3,999-99-9999


In [18]:
df_ssn["Social Security No."].str.findall(reg_pattern)

0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: Social Security No., dtype: object

In [19]:
# -> Series of lists
a = df_ssn["Social Security No."].str.findall(reg_pattern)

# Apply str.findall() and remove empty strings from the result
a.drop(1)

0                 [987-65-4321]
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: Social Security No., dtype: object

In [20]:
# find out AA or B from 'AA B B aaaabbbb'

text = "AA B B aaaabbbb"
pat = r'AA|B'
re.findall(pat,text)



['AA', 'B', 'B']

## Extraction Using Regex Capture Groups

The Python function `re.findall`, in combination with parentheses returns specific substrings (i.e., **capture groups**) within each matched string, or **match**.

In [21]:
# find out hour, mint and second
import re
text = """I will meet you at 08:30:00 pm tomorrow"""
    
reg_pattern2 = "(\d\d):(\d\d):(\d\d).*"
pattern_matches = re.findall(reg_pattern2, text)
hour,minute,second = pattern_matches[0]
print(hour,minute,second)
print(f"Hour is {hour} , minute is {minute} , second is {second}")


08 30 00
Hour is 08 , minute is 30 , second is 00


<br/>

In `pandas`, we can use `.str.extract` to extract each capture group of **only the first match** of each record into separate columns.

In [22]:
# back to SSNs
df_ssn

Unnamed: 0,Social Security No.
0,987-65-4321
1,forty
2,123-45-6789 bro or 321-45-6789
3,999-99-9999


In [23]:
# Will extract the first match of all groups
 # 3 groups
group_pattern = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['Social Security No.'].str.extract(group_pattern)

Unnamed: 0,0,1,2
0,987.0,65.0,4321.0
1,,,
2,123.0,45.0,6789.0
3,999.0,99.0,9999.0


Alternatively, `.str.extractall` extracts **all matches** of each record into separate columns. Rows are then MultiIndexed by original record index and match index.

In [24]:
# -> DataFrame, one row per match
df_ssn['Social Security No.'].str.extractall(group_pattern)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,987,65,4321
2,0,123,45,6789
2,1,321,45,6789
3,0,999,99,9999


## Canonicalization with Regex

In regular Python, canonicalize with `re.sub` (standing for "substitute"):

In [25]:
# find out Moo from given text
text = '<div><td valign="top">Moo</td></div>'
pattern = r"<[^>]+>"
re.sub(pattern, '', text)


'Moo'

<br/>

In `pandas`, canonicalize with `Series.str.replace`.

In [26]:
# example dataframe of strings
df_html = pd.DataFrame(['<div><td valign="top">Moo</td></div>',
                   '<a href="http://ds100.org">Link</a>',
                   '<b>Bold text</b>'], columns=['Html'])
df_html

Unnamed: 0,Html
0,"<div><td valign=""top"">Moo</td></div>"
1,"<a href=""http://ds100.org"">Link</a>"
2,<b>Bold text</b>


In [27]:
# find out Moo, Link, and Bold text from each row of df_html dataframe
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()



Unnamed: 0,Html
0,Moo
1,Link
2,Bold text



# Revisiting Text Log Processing using Regex

### Python `re` version

In [28]:
line = file[0]
display(line)

pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

('26', 'Jan', '2014', '10', '47', '58', '-0800')

### `pandas` version

In [29]:
# convert lines of above provided log.txt file into data frame and then find out date and time

data = pd.DataFrame(file, columns=['Log_History'])
data

Unnamed: 0,Log_History
0,169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1,"193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] ""..."
2,"169.237.46.240 - """" [3/Feb/2006:10:18:37 -0800..."


Option 1: `Series.str.findall`

In [30]:
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
data['Log_History'].str.findall(pattern)

0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log_History, dtype: object

<br/>

Option 2: `Series.str.extractall`

In [31]:
# apply extractall function and then some wrangling
df = data['Log_History'].str.extractall(pattern)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,26,Jan,2014,10,47,58,-800
1,0,2,Feb,2005,17,23,6,-800
2,0,3,Feb,2006,10,18,37,-800


Wrangling either of these two DataFrames into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.


||Day|Month|Year|Hour|Minute|Second|Time Zone|
|---|---|---|---|---|---|---|---|
|0|26|Jan|2014|10|47|58|-0800|
|1|2|Feb|2005|17|23|6|-0800|
|2|3|Feb|2006|10|18|37|-0800|


In [32]:
# your code here
final_df = df.copy()
final_df.columns = ["Day","Month","Year","Hour","Minute","Second","Time Zone"]
final_df= final_df.droplevel(level="match")
final_df



Unnamed: 0,Day,Month,Year,Hour,Minute,Second,Time Zone
0,26,Jan,2014,10,47,58,-800
1,2,Feb,2005,17,23,6,-800
2,3,Feb,2006,10,18,37,-800
