# Import Dependencies

In [1]:
import numpy as np
import pandas as pd

# Pandas String Documentation

In [None]:
import webbrowser
webbrowser.open_new_tab('http://pandas.pydata.org/pandas-docs/stable/text.html')

# A Regex Cheat Sheet

```
NAME                REGEX    WHAT IT DOES                                          
Exception           [^X]     Match everything except `X`                        
Dot                 .        Match any character                                
Case Insensitive    (?i)     Matches irregardless of case; `Foo` & `foO` matched
Digit               \d       Match digits (i.e., [0-9])                         
Non-Digit           \D       Match non-digits (i.e., [^0-9])                    
Word                \w       Match words (i.e., [_a-zA-Z0-9])                   
Non-Word            \W       Match non-words (i.e., [^_a-zA-Z0-9])              
Whitespace          \s       Match whitespace (i.e., [ \t\r\n\f])           
Non-Whitespace      \S       Match non-whitespace (i.e., [^ \t\r\n\f])      
Word Boundary       \b       Match beginning/end of word                        
Non-Word Boundary   \B       Match not beginning/end of word                    
0-1 (Greedy)        x?       Match 0-1 times greedy                             
0-1 (Lazy)          x??      Match 0-1 times lazy                               
>= 0 (Greedy)       x*       Match 0 or more times greedy                       
>= 0 (Lazy)         x*?      Match 0 or more times lazy                         
>= 1 (Greedy)       x+       Match 1 or more times greedy                       
>= 1 (Lazy)         x+?      Match 1 or more times lazy                         
Exactly N           x{4}     Match N times                                      
Min-Max             x{4,8}   Match min-max times                                
> N                 x{9,}    Match N or more times        
```

# Exercises

## String Data

In [3]:
random = [
    "A download file from the http://example.com", 
    "Another url ftp://www.example.com",
    "And https://www.example.net",
    "@hadley (Dr. Wickham) I like #rstats for #ggplot2 work.",
    "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
    "Is -2 an integer?", "-4.3 and 3.33 are not.",
    "123,456 is 0 alot -123456 more than -.2", "and 3456789123 fg for 345.",
    "There is $5.50 for me.", "that's 45.6% of the pizza", 
    "14% is $26 or $25.99",
    "Mr. Bean bought 2 tickets 2-613-213-4567",
    "43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567", 
    "The Rat Race, XX, 12345",
    "Ignore phone numbers(613)2134567",
    "Grab zips with dashes 12345-6789 or no space before12345-6789",  
    "Grab zips with spaces 12345 6789 or no space before12345 6789",
    "I like 1234567 dogs",
    " There is ( $5.50 ) for , me and you (NAME HERE). ", " that's [ 45.6% ] of! the pizza !", 
    "     14% is { $26  } or $25.99 ?", "Oh ;  here's colon : Yippee !",
    "I love chicken [unintelligible]!",
    "Me too Miss Jane! (laughter) It's so good.[interrupting]",
    "Yep it's awesome {reading}.", "Agreed Ms. Jones. {is so much fun}",
    "R uses 1:5 for 1, 2, 3, 4, 5.", 
    "At 3:00 we'll meet up , and we leave by 4:30:20",
    "We'll meet at 6:33 , bring $20,000.", "He ran it in :22.34"
]
random

['A download file from the http://example.com',
 'Another url ftp://www.example.com',
 'And https://www.example.net',
 '@hadley (Dr. Wickham) I like #rstats for #ggplot2 work.',
 'Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio',
 'Is -2 an integer?',
 '-4.3 and 3.33 are not.',
 '123,456 is 0 alot -123456 more than -.2',
 'and 3456789123 fg for 345.',
 'There is $5.50 for me.',
 "that's 45.6% of the pizza",
 '14% is $26 or $25.99',
 'Mr. Bean bought 2 tickets 2-613-213-4567',
 '43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567',
 'The Rat Race, XX, 12345',
 'Ignore phone numbers(613)2134567',
 'Grab zips with dashes 12345-6789 or no space before12345-6789',
 'Grab zips with spaces 12345 6789 or no space before12345 6789',
 'I like 1234567 dogs',
 ' There is ( $5.50 ) for , me and you (NAME HERE). ',
 " that's [ 45.6% ] of! the pizza !",
 '     14% is { $26  }

## Use `'random'` to make a 1 column pandas dataframe with the column named random.

In [4]:
x = pd.DataFrame(random, columns = ['random'])
x

Unnamed: 0,random
0,A download file from the http://example.com
1,Another url ftp://www.example.com
2,And https://www.example.net
3,@hadley (Dr. Wickham) I like #rstats for #ggpl...
4,"Difference between #magrittr and #pipeR, both ..."
5,Is -2 an integer?
6,-4.3 and 3.33 are not.
7,"123,456 is 0 alot -123456 more than -.2"
8,and 3456789123 fg for 345.
9,There is $5.50 for me.


## Remove all Twitter hash tags (#tag).

In [25]:
x.random.str.replace('#\w+', '')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3              @hadley (Dr. Wickham) I like  for  work.
4     Difference between  and , both implement pipel...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Replace all Twitter user names (@name) with `'<<USER>>'`.

In [24]:
x.random.str.replace('@\w+', '<<USER>>')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     <<USER>> (Dr. Wickham) I like #rstats for #ggp...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Replace dollar amounts including dollars and decimals with `'<<$$$>>'`.

In [23]:
x.random.str.replace("\$\d[\d,]*(\.\d+)*", '<<$$$>>')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                              There is <<$$$>> for me.
10                            that's 45.6% of the pizza
11                            14% is <<$$$>> or <<$$$>>
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Count the number of articles (the, a, an) in each string.  Remember to ignore case.  

Don't count words that contain the articles (e.g., `"they"` contains the sub-string 'the'; don't count it).  '\b' may not work unless you make the string raw: http://stackoverflow.com/a/3995242/1000343

In [8]:
x.random.str.count(r"(?i)\b(the|a|an)\b")

0     2
1     0
2     0
3     0
4     0
5     1
6     0
7     0
8     0
9     0
10    1
11    0
12    0
13    0
14    1
15    0
16    0
17    0
18    0
19    0
20    1
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
Name: random, dtype: int64

## Replace 5 or 9 (must contain a dash) digit zip codes with `"<<ZIP>>"`.

In [9]:
x.random.str.replace(r"\b\d{5}(-\d{4})?", '<<ZIP>>')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7             123,456 is 0 alot -<<ZIP>>6 more than -.2
8                          and <<ZIP>>89123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                            The Rat Race, XX, <<ZIP>>
15                   Ignore phone numbers(613)<<ZIP>>67
16    Grab zips with dashes <<ZIP>> or no space befo...
17    Grab zips with spaces <<ZIP>> 6789 or no s

## Replace a name title (Mr., Ms., Mz., Miss, Dr., etc.) followed by a capitalized name (presumably last name) with `"<<TITLE NAME>>"`.

In [10]:
x.random.str.replace("(Miss|((D|M)(s|r|rs|z)).)\\s+[A-Z][a-z']+", '<<TITLE NAME>>')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (<<TITLE NAME>>) I like #rstats for #g...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12       <<TITLE NAME>> bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Replace double spaces with single spaces.

In [11]:
x.random.str.replace("\s+", ' ')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Extract final punctuation mark. 

In [12]:
x.random.str.extractall("([?.!]$)")

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
3,0,.
5,0,?
6,0,.
8,0,.
9,0,.
20,0,!
21,0,?
22,0,!
23,0,!
25,0,.


## Which strings have a comma preceded by a space?  Remove space before commas.

In [13]:
x['commaspace'] = x.random.str.contains('\s+,')
x[x['commaspace']].index.tolist()

[19, 28, 29]

In [14]:
x.random.str.replace("\s+,", ',')

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Replace words starting with capital letters with `"<<CAPS>>"`.

In [15]:
x.random.str.replace(r"\b[A-Z]\w+", '<<CAPS>>')

0           A download file from the http://example.com
1                    <<CAPS>> url ftp://www.example.com
2                      <<CAPS>> https://www.example.net
3     @hadley (<<CAPS>>. <<CAPS>>) I like #rstats fo...
4     <<CAPS>> between #magrittr and #pipeR, both im...
5                               <<CAPS>> -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                             <<CAPS>> is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12    <<CAPS>>. <<CAPS>> bought 2 tickets 2-613-213-...
13    43 <<CAPS>> <<CAPS>>, <<CAPS>> <<CAPS>> <<CAPS...
14          <<CAPS>> <<CAPS>> <<CAPS>>, <<CAPS>>, 12345
15                   <<CAPS>> phone numbers(613)2134567
16    <<CAPS>> zips with dashes 12345-6789 or no spa...
17    <<CAPS>> zips with spaces 12345 6789 or no

## Replace all parenthesis and the text in between with `'<<PLACEHOLDER>>'`.

In [16]:
x.random.str.replace("\\([^)]*?\\)", "<<PLACEHOLDER>>")

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley <<PLACEHOLDER>> I like #rstats for #gg...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15           Ignore phone numbers<<PLACEHOLDER>>2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Remove all non-vowel characters ignoring cases.

In [17]:
x.random.str.replace("(?i)[^aeiou]", "")

0                                        Aooaieoeeaeo
1                                            Aoeueaeo
2                                               Aeaee
3                                         aeiaIieaooo
4     ieeeeeeaiaieoieeieieoeaooaeueieeeeeeaiaieieooio
5                                               Iaiee
6                                                aaeo
7                                              iaooea
8                                                  ao
9                                               eeioe
10                                              aoeia
11                                                 io
12                                             eaouie
13                                              ueoaA
14                                               eaae
15                                            Ioeoeue
16                                       aiiaeooaeeoe
17                                       aiiaeooaeeoe
18                          

## Remove all URLs.


In [18]:
x.random.str.replace("(http[^ ]*)|(ftp[^ ]*)|(www\\.[^ ]*)", "")

0                             A download file from the 
1                                          Another url 
2                                                  And 
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                                     Is -2 an integer?
6                                -4.3 and 3.33 are not.
7               123,456 is 0 alot -123456 more than -.2
8                            and 3456789123 fg for 345.
9                                There is $5.50 for me.
10                            that's 45.6% of the pizza
11                                 14% is $26 or $25.99
12             Mr. Bean bought 2 tickets 2-613-213-4567
13     43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567
14                              The Rat Race, XX, 12345
15                     Ignore phone numbers(613)2134567
16    Grab zips with dashes 12345-6789 or no space b...
17    Grab zips with spaces 12345 6789 or no spa

## Tokenizing: 
1. Remove all characters except letters, apostrophes, & spaces. 
2. Remove leading and trailing spaces.  
3. Split each string into a vector of words (split on one or more spaces).  

In [19]:
x['temp'] = x.random.str.replace("[^A-Za-z' ]", "")
x['temp'] = x.temp.str.replace("^\\s+|\\s+$", "")
x.temp.str.split("\s+")

0        [A, download, file, from, the, httpexamplecom]
1                      [Another, url, ftpwwwexamplecom]
2                             [And, httpswwwexamplenet]
3     [hadley, Dr, Wickham, I, like, rstats, for, gg...
4     [Difference, between, magrittr, and, pipeR, bo...
5                                     [Is, an, integer]
6                                       [and, are, not]
7                                [is, alot, more, than]
8                                        [and, fg, for]
9                                  [There, is, for, me]
10                             [that's, of, the, pizza]
11                                             [is, or]
12                          [Mr, Bean, bought, tickets]
13                    [Butter, Rd, Brossard, QC, KA, P]
14                                 [The, Rat, Race, XX]
15                             [Ignore, phone, numbers]
16    [Grab, zips, with, dashes, or, no, space, before]
17    [Grab, zips, with, spaces, or, no, space, 

## Replace all numbers (including decimal points and negative signs) with `"<<NUMBER>>"`.

In [20]:
x.random.str.replace( "-?\\d[\\d,]*(\\.\\d+)*", "<<NUMBER>>")

0           A download file from the http://example.com
1                     Another url ftp://www.example.com
2                           And https://www.example.net
3     @hadley (Dr. Wickham) I like #rstats for #ggpl...
4     Difference between #magrittr and #pipeR, both ...
5                             Is <<NUMBER>> an integer?
6                    <<NUMBER>> and <<NUMBER>> are not.
7     <<NUMBER>> is <<NUMBER>> alot <<NUMBER>> more ...
8                     and <<NUMBER>> fg for <<NUMBER>>.
9                          There is $<<NUMBER>> for me.
10                      that's <<NUMBER>>% of the pizza
11            <<NUMBER>>% is $<<NUMBER>> or $<<NUMBER>>
12    Mr. Bean bought <<NUMBER>> tickets <<NUMBER>><...
13    <<NUMBER>> Butter Rd, Brossard QC K<<NUMBER>>A...
14                         The Rat Race, XX, <<NUMBER>>
15           Ignore phone numbers(<<NUMBER>>)<<NUMBER>>
16    Grab zips with dashes <<NUMBER>><<NUMBER>> or ...
17    Grab zips with spaces <<NUMBER>> <<NUMBER>

***Note*** *that a number followed by a comma  (e.g., `"33,"` in `"We'll meet at 6:33, bring $20,000."`) might get captured.  We'll learn advanced techniques to handle this in a more advanced session.*
