In [None]:
# 1- Regular Expressions

In [1]:
import re

my_string = "Let's write RegEx!"
result = re.findall(r"\w+", my_string)
print(result)

['Let', 's', 'write', 'RegEx']


In [2]:
# re.split() and re.findall()
# Note: It's important to prefix your regex patterns with r to ensure that your patterns are 
# interpreted in the way you want them to. Else, you may encounter problems to do with escape 
# sequences in strings. 

my_string = "Let's write RegEx!  Won't that be fun?  I sure think so.  Can you find 4 sentences?  Or perhaps, all 19 words?"

# the syntax for the regex library is to always to pass the pattern first, and then the string second.
# Write a pattern to match sentence endings: sentence_endings
sentence_endings = (r"[.?!]")
print(re.split(sentence_endings, my_string))

["Let's write RegEx", "  Won't that be fun", '  I sure think so', '  Can you find 4 sentences', '  Or perhaps, all 19 words', '']


In [3]:
# Find all capitalized words in my_string and print the result
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, my_string))

['Let', 'RegEx', 'Won', 'Can', 'Or']


In [6]:
# Split my_string on spaces and print the result, keeping all punctuation intact.
spaces = r"\s+"
print(re.split(spaces, my_string))

["Let's", 'write', 'RegEx!', "Won't", 'that', 'be', 'fun?', 'I', 'sure', 'think', 'so.', 'Can', 'you', 'find', '4', 'sentences?', 'Or', 'perhaps,', 'all', '19', 'words?']


In [7]:
# Find all digits in my_string and print the result
digits = r"\d+"
print(re.findall(digits, my_string))

['4', '19']


In [9]:
# Word tokenization with NLTK

scene_one = "SCENE 1: [wind] [clop clop clop] \nKING ARTHUR: Whoa there!  [clop clop clop] \nSOLDIER #1: Halt!  Who goes there?\nARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.  King of the Britons, defeator of the Saxons, sovereign of all England!\nSOLDIER #1: Pull the other one!\nARTHUR: I am, ...  and this is my trusty servant Patsy.  We have ridden the length and breadth of the land in search of knights who will join me in my court at Camelot.  I must speak with your lord and master.\nSOLDIER #1: What?  Ridden on a horse?\nARTHUR: Yes!\nSOLDIER #1: You're using coconuts!\nARTHUR: What?\nSOLDIER #1: You've got two empty halves of coconut and you're bangin' 'em together.\nARTHUR: So?  We have ridden since the snows of winter covered this land, through the kingdom of Mercea, through--\nSOLDIER #1: Where'd you get the coconuts?\nARTHUR: We found them.\nSOLDIER #1: Found them?  In Mercea?  The coconut's tropical!\nARTHUR: What do you mean?\nSOLDIER #1: Well, this is a temperate zone.\nARTHUR: The swallow may fly south with the sun or the house martin or the plover may seek warmer climes in winter, yet these are not strangers to our land?\nSOLDIER #1: Are you suggesting coconuts migrate?\nARTHUR: Not at all.  They could be carried.\nSOLDIER #1: What?  A swallow carrying a coconut?\nARTHUR: It could grip it by the husk!\nSOLDIER #1: It's not a question of where he grips it!  It's a simple question of weight ratios!  A five ounce bird could not carry a one pound coconut.\nARTHUR: Well, it doesn't matter.  Will you go and tell your master that Arthur from the Court of Camelot is here.\nSOLDIER #1: Listen.  In order to maintain air-speed velocity, a swallow needs to beat its wings forty-three times every second, right?\nARTHUR: Please!\nSOLDIER #1: Am I right?\nARTHUR: I'm not interested!\nSOLDIER #2: It could be carried by an African swallow!\nSOLDIER #1: Oh, yeah, an African swallow maybe, but not a European swallow.  That's my point.\nSOLDIER #2: Oh, yeah, I agree with that.\nARTHUR: Will you ask your master if he wants to join my court at Camelot?!\nSOLDIER #1: But then of course a-- African swallows are non-migratory.\nSOLDIER #2: Oh, yeah...\nSOLDIER #1: So they couldn't bring a coconut back anyway...  [clop clop clop] \nSOLDIER #2: Wait a minute!  Supposing two swallows carried it together?\nSOLDIER #1: No, they'd have to have it on a line.\nSOLDIER #2: Well, simple!  They'd just use a strand of creeper!\nSOLDIER #1: What, held under the dorsal guiding feathers?\nSOLDIER #2: Well, why not?\n"

# Import necessary modules
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)

['SCENE 1: [wind] [clop clop clop] \nKING ARTHUR: Whoa there!', '[clop clop clop] \nSOLDIER #1: Halt!', 'Who goes there?', 'ARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.', 'King of the Britons, defeator of the Saxons, sovereign of all England!', 'SOLDIER #1: Pull the other one!', 'ARTHUR: I am, ...  and this is my trusty servant Patsy.', 'We have ridden the length and breadth of the land in search of knights who will join me in my court at Camelot.', 'I must speak with your lord and master.', 'SOLDIER #1: What?', 'Ridden on a horse?', 'ARTHUR: Yes!', "SOLDIER #1: You're using coconuts!", 'ARTHUR: What?', "SOLDIER #1: You've got two empty halves of coconut and you're bangin' 'em together.", 'ARTHUR: So?', "We have ridden since the snows of winter covered this land, through the kingdom of Mercea, through--\nSOLDIER #1: Where'd you get the coconuts?", 'ARTHUR: We found them.', 'SOLDIER #1: Found them?', 'In Mercea?', "The coconut's tropical!", 'ARTHUR: What 

In [11]:
# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
tokenized_sent = word_tokenize(sentences[3])
print(tokenized_sent)

['ARTHUR', ':', 'It', 'is', 'I', ',', 'Arthur', ',', 'son', 'of', 'Uther', 'Pendragon', ',', 'from', 'the', 'castle', 'of', 'Camelot', '.']


In [12]:
# Make a set of unique tokens in the entire scene: unique_tokens
unique_tokens = set(word_tokenize(scene_one))
print(unique_tokens)

{'Will', 'Camelot', 'winter', 'its', 'in', 'there', 'Supposing', 'Saxons', ',', 'Who', 'dorsal', 'Ridden', 'Pull', 'sovereign', 'with', 'grips', 'ask', 'England', 'SOLDIER', 'of', '.', 'he', 'held', 'why', 'an', 'join', 'times', 'back', "'", 'our', 'suggesting', 'Where', 'search', 'Are', 'ounce', 'You', "'m", 'use', 'ARTHUR', 'climes', "'ve", 'carrying', '1', 'get', 'on', 'this', 'Well', 'strand', 'breadth', 'migrate', 'wings', 'Please', 'defeator', 'guiding', 'In', 'other', 'could', '...', 'warmer', 'minute', 'by', 'So', 'swallow', 'mean', 'these', 'halves', 'are', 'be', 'line', 'but', 'point', 'bird', 'five', 'Whoa', '[', 'kingdom', 'maybe', 'must', 'yet', 'question', 'horse', 'under', 'bring', 'Pendragon', 'length', 'beat', ']', 'weight', 'pound', 'go', 'plover', 'from', 'covered', 'A', 'temperate', 'does', 'or', 'air-speed', 'do', 'that', 'What', 'wants', 'lord', 'seek', 'and', 'sun', 'is', 'you', 'speak', 'Court', 'just', 'creeper', ':', 'castle', 'King', 'matter', 'order', 'cours

In [15]:
# More regex with re.search()

# Search for the first occurrence of "coconuts" in scene_one: match
match = re.search("coconuts", scene_one)

# Print the start and end indexes of match
print(match.start(), match.end())

580 588


In [24]:
# a regular expression to search for anything in square brackets: pattern1
pattern1 = r"\[.*]"

# Use re.search to find the first text in square brackets
print(re.search(pattern1, scene_one))

<re.Match object; span=(9, 32), match='[wind] [clop clop clop]'>


In [34]:
# Find the script notation at the beginning of the fourth sentence and print it
pattern2 = r"[\w]+:"


<re.Match object; span=(0, 7), match='ARTHUR:'>


In [35]:
# Regex groups using or |, define with (), chr ranges []
# find best tokenizer, want to retain sentence punctuation as separate tokens, but have '#1' remain a single token.
# Unlike the syntax for the regex library, with nltk_tokenize() we pass the pattern as the second argument.
my_string = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"
from nltk.tokenize import regexp_tokenize

pattern3 = r"(\w+|#\d|\?|!)"
regexp_tokenize(my_string, pattern3)

['SOLDIER',
 '#1',
 'Found',
 'them',
 '?',
 'In',
 'Mercea',
 '?',
 'The',
 'coconut',
 's',
 'tropical',
 '!']

In [36]:
# Twitter, nltk.tokenize.TweetTokenizer class

from nltk.tokenize import TweetTokenizer
tweets = ['This is the best #nlp exercise ive found online! #python',
 '#NLP is super fun! <3 #learning',
 'Thanks @datacamp :) #nlp #python']

# Define a regex pattern to find hashtags: pattern4
pattern4 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern4)
print(hashtags)



['#nlp', '#python']


In [37]:
# Write a pattern that matches both mentions (@) and hashtags
pattern5 = r"(#\w+|@\w+)"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern5)
print(mentions_hashtags)

['@datacamp', '#nlp', '#python']


In [41]:
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

[['This', 'is', 'the', 'best', '#nlp', 'exercise', 'ive', 'found', 'online', '!', '#python'], ['#NLP', 'is', 'super', 'fun', '!', '<3', '#learning'], ['Thanks', '@datacamp', ':)', '#nlp', '#python']]
