# regex exercise

In [13]:
import re

# Sample text for the regex exercise: capitalised first names and their ages,
# which the next cell extracts with re.findall.
exampleString = '''
Jessica is 15 years old, and Daniel is 27 years old.
Edward is 97 years old, and his grandfather, Oscar, is 102. 
'''

In [15]:
# Ages are runs of one to three digits; names are a capital letter followed by
# zero or more lowercase letters.
age_pattern = re.compile(r'\d{1,3}')
name_pattern = re.compile(r'[A-Z][a-z]*')

ages = age_pattern.findall(exampleString)
names = name_pattern.findall(exampleString)

# One list per line, ages first.
print(ages, names, sep='\n')

['15', '27', '97', '102']
['Jessica', 'Daniel', 'Edward', 'Oscar']


# main code

In [8]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

'''
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3rd person	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when

'''

# Raw text of two State of the Union addresses from the NLTK corpus: the 2005
# speech is used to train the sentence tokenizer, the 2006 speech is the text
# that gets analysed.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Constructing PunktSentenceTokenizer with text trains it on that text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Sentences of the 2006 speech; the chunking cells below iterate over this.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

0.04020979020979021 0.07357859531772576 0.03836930455635491 5720 299 230 22
0.002972027972027972 0.0033444816053511705 0.002951484965873455 5720 299 17 1
0.0008741258741258741 0.006688963210702341 0.0005534034311012728 5720 299 5 2
0.0024475524475524478 0.016722408026755852 0.0016602102933038186 5720 299 14 5
0.0013986013986013986 0.0033444816053511705 0.0012912746725696365 5720 299 8 1
0.0034965034965034965 0.0033444816053511705 0.003504888396974728 5720 299 20 1
0.017132867132867134 0.04013377926421405 0.01586423169156982 5720 299 98 12
0.005244755244755245 0.006688963210702341 0.005165098690278546 5720 299 30 2
0.005944055944055944 0.026755852842809364 0.004796163069544364 5720 299 34 8
0.04493006993006993 0.043478260869565216 0.04501014572957019 5720 299 257 13
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.016258741258741258 0.0802675585284281 0.012728278915329275 5720 299 93 24
0.001048951048951049 0.0033444816053511705 0.0009223390518354548 5720 299 6 1
0.029545

In [9]:
def process_content(sentences=None, show=False):
    """POS-tag every sentence and chunk it with a regular-expression grammar.

    The grammar groups, in this order, any number of adverbs (``RB``, ``RBR``,
    ``RBS``), any number of verbs (``VB`` and its variants), one or more
    singular proper nouns (``NNP``) and an optional singular noun (``NN``)
    into a node labelled ``Chunk``.

    Args:
        sentences: Iterable of sentence strings to process. When omitted, the
            notebook-level ``tokenized`` list is used.
        show: When true, print the chunk tree of each sentence.

    Returns:
        None. Any exception raised while processing is caught and only its
        message is printed, so a failure stops the run early without a
        traceback.
    """
    try:
        if sentences is None:
            sentences = tokenized

        # The grammar is constant, so compile the parser once instead of once
        # per sentence.
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)

            if show:
                print(chunked)

    except Exception as e:
        print(str(e))


# Pass show=True to print the chunk tree of every sentence.
process_content()


In [10]:
# Chunk grammar: optional adverbs, optional verbs, one or more proper nouns and
# an optional noun, grouped under the label "Chunk". It is the same for every
# sentence, so the parser is built once, outside the loop.
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)

    # Uncomment the line below to draw each sentence's chunk tree.
    # NOTE(review): presumably this opens one GUI window per sentence and
    # waits for it to be closed - confirm before running on the full speech.
    # chunked.draw()

In [11]:
# Chunk grammar: optional adverbs, optional verbs, one or more proper nouns and
# an optional noun, grouped under the label "Chunk". It is the same for every
# sentence, so the parser is built once, outside the loop.
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)

    # Uncomment to print every subtree of the parse (the whole sentence tree
    # as well as each Chunk node).
    # for subtree in chunked.subtrees():
    #     print(subtree)

In [12]:
# Chunk grammar: optional adverbs, optional verbs, one or more proper nouns and
# an optional noun, grouped under the label "Chunk". It is the same for every
# sentence, so the parser is built once, outside the loop.
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)

    # Print only the nodes produced by the grammar, skipping the sentence root.
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)

    # Alternatives: print(chunked) shows the full tree; chunked.draw() renders it.

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN)
(Chunk Capitol/NNP dome/NN)
(Chunk have/VBP served/VBN America/NNP)
(Chunk Tonight/NNP)
(Chunk Union/NNP)
(Chunk Applause/NNP)
(Chunk United/NNP)
(Chunk America/NNP)
(Chunk Applause/NNP)
(Chunk America/NNP)
(Chunk September/NNP)
(Chunk Dictatorships/NNP shelter/NN)
(Chunk Applause/NNP)
(Chunk Afghanistan/NNP)
(