# Import Dependencies

In [1]:
import numpy as np
import pandas as pd

# Pandas String Documentation

In [2]:
import webbrowser

# Side effect: launches the default browser on the pandas string documentation.
# new=2 asks for a new tab; the call returns True when a browser was launched.
webbrowser.open('http://pandas.pydata.org/pandas-docs/stable/text.html', new=2)

True

# A String & Regex Pattern to Match

In [3]:
# Sample strings that the demo DataFrame is built from.
string = [
    "fg44h4j",
    "rghi",
    "tr44e",
    "IG4u",
]

# Single-character patterns to match against the samples.
pattern, pattern2, pattern3 = "4", "i", "g"

# Convert to Dataframe

In [4]:
# Wrap the sample strings in a one-column frame so the `.str` accessor can be used on it.
z = pd.DataFrame({"text": string})
z

Unnamed: 0,text
0,fg44h4j
1,rghi
2,tr44e
3,IG4u


# 4 Basic Regex Actions

1. Matching (logical/counting)
2. Subbing
3. Splitting
4. Extracting

## Matching: Logical

In [5]:
# Boolean mask: True for every row whose text contains a "4".
z["text"].str.contains(pat="4")

0     True
1    False
2     True
3     True
Name: text, dtype: bool

In [6]:
# Case-sensitive by default, so only a lowercase "i" counts as a hit.
z["text"].str.contains(pat="i")

0    False
1     True
2    False
3    False
Name: text, dtype: bool

### Ignore Case

In [7]:
# The inline `(?i)` flag makes the regex match "i" and "I" alike.
z["text"].str.contains(pat="(?i)i")

0    False
1     True
2    False
3     True
Name: text, dtype: bool

## Matching: Counting

In [8]:
# Number of "4" characters found in each row.
z["text"].str.count(pat="4")

0    3
1    0
2    2
3    1
Name: text, dtype: int64

In [9]:
# Number of lowercase "i" characters found in each row.
z["text"].str.count(pat="i")

0    0
1    1
2    0
3    0
Name: text, dtype: int64

### Ignore Case

In [10]:
# Same count, but `(?i)` makes it case-insensitive so "I" is counted too.
z["text"].str.count(pat="(?i)i")

0    0
1    1
2    0
3    1
Name: text, dtype: int64

## Subbing

In [11]:
# Substitute every "4" with a visible marker.
z["text"].str.replace(pat="4", repl="<<FOUR>>")

0    fg<<FOUR>><<FOUR>>h<<FOUR>>j
1                            rghi
2             tr<<FOUR>><<FOUR>>e
3                     IG<<FOUR>>u
Name: text, dtype: object

In [12]:
# Substitute every lowercase "i" with a visible marker.
z["text"].str.replace(pat="i", repl="<<AYE>>")

0       fg44h4j
1    rgh<<AYE>>
2         tr44e
3          IG4u
Name: text, dtype: object

### Ignore Case

In [13]:
# `(?i)` is an inline regex flag, so the pattern has to be compiled as a regular expression.
# pandas >= 2.0 treats `pat` as a literal string unless regex=True is passed explicitly;
# without it nothing is replaced (the `regex` keyword exists since pandas 0.23).
z["text"].str.replace("(?i)i", "<<AYE>>", regex=True)

0       fg44h4j
1    rgh<<AYE>>
2         tr44e
3    <<AYE>>G4u
Name: text, dtype: object

## Splitting

In [14]:
# Break each string apart at every "4"; adjacent 4s yield empty pieces.
z["text"].str.split(pat="4")

0    [fg, , h, j]
1          [rghi]
2       [tr, , e]
3         [IG, u]
Name: text, dtype: object

In [15]:
# Break each string apart at every lowercase "g".
z["text"].str.split(pat="g")

0    [f, 44h4j]
1       [r, hi]
2       [tr44e]
3        [IG4u]
Name: text, dtype: object

### Ignore Case

In [16]:
# Multi-character patterns are treated as regular expressions by str.split,
# so the inline `(?i)` flag splits on both "g" and "G".
z["text"].str.split(pat="(?i)g")

0    [f, 44h4j]
1       [r, hi]
2       [tr44e]
3       [I, 4u]
Name: text, dtype: object

### Splitting to Dataframe

In [17]:
# expand=True spreads the pieces over columns; rows with fewer pieces are
# padded with missing values, which are blanked out for display.
(
    z["text"]
    .str.split("4", expand=True)
    .fillna("")
)

Unnamed: 0,0,1,2,3
0,fg,,h,j
1,rghi,,,
2,tr,,e,
3,IG,u,,


## Extracting

In [18]:
# One output row per match of the capture group, indexed by (original row, match number).
z["text"].str.extractall(pat="(4)")

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,4
0,1,4
0,2,4
2,0,4
2,1,4
3,0,4


### Going Wide

In [19]:
# Pivot the match level into columns (one column per match number) and blank the gaps.
(
    z["text"]
    .str.extractall("(4)")
    .unstack()
    .fillna("")
)

Unnamed: 0_level_0,0,0,0
match,0,1,2
0,4,4.0,4.0
2,4,4.0,
3,4,,


In [20]:
# Pull out every lowercase "g"; rows without a match do not appear in the result.
z["text"].str.extractall(pat="(g)")

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,g
1,0,g


### Ignore Case

In [21]:
# Case-insensitive variant: `(?i)` lets the group capture "g" as well as "G".
z["text"].str.extractall(pat="(?i)(g)")

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,g
1,0,g
3,0,G


# Exercises

## String Data

In [22]:
# Four passages of exercise text; single-quoted so the embedded double quotes need no escaping.
wiki = [
    'In theoretical computer science and formal language theory, a regular expression (sometimes called a rational expression)[1][2] is a sequence of characters that define a search pattern, mainly for use in pattern matching with strings, or string matching, i.e. "find and replace"-like operations.',
    'The concept arose in the 1950s, when the American mathematician Stephen Kleene formalized the description of a regular language, and came into common use with the Unix text processing utilities ed, an editor, and grep, a filter.',
    'In modern usage, "regular expressions" are often distinguished from the derived, but fundamentally distinct concepts of regex or regexp, which no longer describe a regular language.',
    'See below for details.',
]
wiki

['In theoretical computer science and formal language theory, a regular expression (sometimes called a rational expression)[1][2] is a sequence of characters that define a search pattern, mainly for use in pattern matching with strings, or string matching, i.e. "find and replace"-like operations.',
 'The concept arose in the 1950s, when the American mathematician Stephen Kleene formalized the description of a regular language, and came into common use with the Unix text processing utilities ed, an editor, and grep, a filter.',
 'In modern usage, "regular expressions" are often distinguished from the derived, but fundamentally distinct concepts of regex or regexp, which no longer describe a regular language.',
 'See below for details.']

## Use `wiki` to make a one-column pandas DataFrame with the column named `wiki`.

In [23]:
# One-column frame holding the exercise passages under the column name "wiki".
x = pd.DataFrame({"wiki": wiki})
x

Unnamed: 0,wiki
0,In theoretical computer science and formal lan...
1,"The concept arose in the 1950s, when the Ameri..."
2,"In modern usage, ""regular expressions"" are oft..."
3,See below for details.


## Which elements contain the expression "regular"?

In [24]:
# Boolean mask of the passages that mention "regular".
x["wiki"].str.contains(pat="regular")

0     True
1     True
2     True
3    False
Name: wiki, dtype: bool

## Which elements contain the expression "regular expression"?

In [25]:
# Boolean mask of the passages that mention the full phrase.
x["wiki"].str.contains(pat="regular expression")

0     True
1    False
2     True
3    False
Name: wiki, dtype: bool

## Get a count of the number of times the expression "regular" appears in each element.

In [26]:
# Occurrences of "regular" per passage.
x["wiki"].str.count(pat="regular")

0    1
1    1
2    2
3    0
Name: wiki, dtype: int64

## Get a count of the number of times the expression "regular expression" appears in each element.

In [27]:
# Occurrences of the full phrase per passage.
x["wiki"].str.count(pat="regular expression")

0    1
1    0
2    1
3    0
Name: wiki, dtype: int64

## Get a count of the number of times the exact (match case) expression "the" appears in each element.

In [28]:
# Case-sensitive count; this also counts "the" inside longer words such as "theoretical".
x["wiki"].str.count(pat="the")

0    2
1    5
2    1
3    0
Name: wiki, dtype: int64

## Get a count of the number of times the expression "the" appears in each element ignoring case.

In [29]:
# Case-insensitive count via the inline `(?i)` flag, so a leading "The" is included.
x["wiki"].str.count(pat="(?i)the")

0    2
1    6
2    1
3    0
Name: wiki, dtype: int64

## Replace the expression "regular expression" with `"<<REGEX>>"`.

In [30]:
# Swap the phrase for a marker; a trailing plural "s" is left in place after the marker.
x["wiki"].str.replace(pat="regular expression", repl="<<REGEX>>")

0    In theoretical computer science and formal lan...
1    The concept arose in the 1950s, when the Ameri...
2    In modern usage, "<<REGEX>>s" are often distin...
3                               See below for details.
Name: wiki, dtype: object

## Replace the exact expression "the" with `"<<ARTICLE>>"`.

In [31]:
# Case-sensitive substitution; "the" inside longer words is replaced as well.
x["wiki"].str.replace(pat="the", repl="<<ARTICLE>>")

0    In <<ARTICLE>>oretical computer science and fo...
1    The concept arose in <<ARTICLE>> 1950s, when <...
2    In modern usage, "regular expressions" are oft...
3                               See below for details.
Name: wiki, dtype: object

## Replace the expression "the" with `"<<ARTICLE>>"` ignoring case.

In [35]:
# `(?i)` is an inline regex flag, so the pattern has to be compiled as a regular expression.
# pandas >= 2.0 treats `pat` as a literal string unless regex=True is passed explicitly;
# without it no "the"/"The" would be replaced (the `regex` keyword exists since pandas 0.23).
x["wiki"].str.replace("(?i)the", "<<ARTICLE>>", regex=True)

0    In <<ARTICLE>>oretical computer science and fo...
1    <<ARTICLE>> concept arose in <<ARTICLE>> 1950s...
2    In modern usage, "regular expressions" are oft...
3                               See below for details.
Name: wiki, dtype: object

## Extract all the occurrences of the exact expression "the" from each element.

**NOTE:** Version 0.18.1 of pandas contains a bug that prevents this from working.  It should be fixed in version 0.18.2: http://stackoverflow.com/a/38617526/1000343

In [None]:
# One row per case-sensitive "the" match, indexed by (passage, match number).
x["wiki"].str.extractall(pat="(the)")

## Extract all the occurrences of the expression "the" from each element ignoring case.

In [None]:
# Case-insensitive variant: the inline `(?i)` flag also captures "The".
x["wiki"].str.extractall(pat="(?i)(the)")

## Split the text on the expression "the".

In [37]:
# Split at every "the" and spread the pieces across columns.
x["wiki"].str.split(pat="the", expand=True)

Unnamed: 0,0,1,2,3,4,5
0,In,oretical computer science and formal language,"ory, a regular expression (sometimes called a ...",,,
1,The concept arose in,"1950s, when",American ma,matician Stephen Kleene formalized,"description of a regular language, and came i...","Unix text processing utilities ed, an editor,..."
2,"In modern usage, ""regular expressions"" are oft...","derived, but fundamentally distinct concepts ...",,,,
3,See below for details.,,,,,


# Challenge

***Hint***: you'll need to use/read [the documentation](http://pandas.pydata.org/pandas-docs/stable/text.html)

## Split on the first occurrence of the expression "the".

In [38]:
# n=1 limits the split to the first "the" (scanning from the left);
# passages without a match get a blank second column.
(
    x["wiki"]
    .str.split("the", n=1, expand=True)
    .fillna("")
)

Unnamed: 0,0,1
0,In,oretical computer science and formal language ...
1,The concept arose in,"1950s, when the American mathematician Stephe..."
2,"In modern usage, ""regular expressions"" are oft...","derived, but fundamentally distinct concepts ..."
3,See below for details.,


## Split on the last occurrence of the expression "the".

In [39]:
# rsplit scans from the right, so n=1 splits on the last "the" only;
# passages without a match get a blank second column.
(
    x["wiki"]
    .str.rsplit("the", n=1, expand=True)
    .fillna("")
)

Unnamed: 0,0,1
0,In theoretical computer science and formal lan...,"ory, a regular expression (sometimes called a ..."
1,"The concept arose in the 1950s, when the Ameri...","Unix text processing utilities ed, an editor,..."
2,"In modern usage, ""regular expressions"" are oft...","derived, but fundamentally distinct concepts ..."
3,See below for details.,


## Switch all lower case letter 'a' to 'e' and vice versa.

In [40]:
# Swap every lowercase 'a' and 'e' in a single pass with a character translation table.
# A one-pass mapping needs no temporary placeholder token, so the result cannot be
# corrupted by text that already happens to contain the placeholder.
x['a'] = x.wiki.str.translate(str.maketrans("ae", "ea"))
x.a

0    In thaoraticel computar scianca end formel len...
1    Tha concapt erosa in tha 1950s, whan tha Amari...
2    In modarn usega, "raguler axprassions" era oft...
3                               Saa balow for dateils.
Name: a, dtype: object