### GET TEMPLATES CONTAINING AT LEAST 2 NOUNS

### GET DICTIONARIES POSTAG --> WORDS

### GET DICTIONARIES WORDS --> POSTAG


In [113]:
import nltk
import keras.preprocessing
import codecs
import re
import pickle
import random

In [98]:
# Read the whole corpus file into memory as a single UTF-8 decoded string.
with codecs.open('data/frost/input.txt', mode='rb', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [99]:
#CLEAN TEXT
def simple_clean(string):
    """Normalise raw corpus text and split it into one cleaned string per line.

    Steps, in order:

    1. Each newline becomes the sentinel ``KKK`` so that line boundaries
       survive the character filtering. A literal ``KKK`` already present in
       the input is therefore also treated as a line break.
    2. Every character other than Hangul syllables, ASCII letters, digits,
       parentheses, comma, ``!``, ``?``, apostrophe and backtick is replaced
       by a space. Hyphens, colons, semicolons and periods are removed here.
    3. Apostrophes are deleted, so contractions collapse into one token
       (``Don't`` -> ``Dont``).
    4. Each run of digits becomes the token ``EOS`` (marks numeric titles).
    5. Comma, ``!``, ``?`` and both parentheses are padded with spaces so
       they become separate tokens.
    6. Runs of two or more whitespace characters collapse to one space.

    Args:
        string: Raw text, possibly spanning several lines.

    Returns:
        A list of strings, one per input line, obtained by splitting the
        cleaned text on the sentinel. Segments may keep a leading or trailing
        space and may be empty, so callers should split on spaces and drop
        empty tokens.
    """
    string = re.sub(r"\n", " KKK ", string)  # mark line boundaries
    string = re.sub(r"[^가-힣A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'", "", string)

    # NOTE(review): earlier contraction rules ('s, 'd, n't, I've, 'll) and the
    # rules for '-', ':', ';' and '.' ran after the two substitutions above
    # and could never match, so they are omitted. If Treebank-style tokens
    # such as "n't" are wanted, they must be produced before apostrophes are
    # stripped.

    string = re.sub(r"[0-9]+", "EOS", string)  # EOS tag for numeric titling

    # Plain string replacement: a regex replacement of " \( " would emit a
    # literal backslash and produce tokens like "\(" instead of "(".
    for mark in (",", "!", "?", "(", ")"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().split('KKK')

In [100]:
#GET TOKENIZED LINES
def get_token_sent(data):
    """Tokenise raw text into a list of token lists, one per non-empty line.

    The text is cleaned with ``simple_clean``. Each cleaned line is split on
    single spaces, empty fragments are discarded, and lines that end up with
    no tokens at all are skipped.
    """
    token_lines = []
    for line in simple_clean(data):
        tokens = [piece for piece in line.split(' ') if piece != '']
        if tokens:
            token_lines.append(tokens)
    return token_lines

#GET POSTAG FOR SENTENCE
def get_pos_tag(sent):
    """Return the POS tags of a tokenised sentence, in token order.

    ``sent`` is handed to ``nltk.pos_tag``; only the tag of each resulting
    (token, tag) pair is kept.
    """
    return [tag for _token, tag in nltk.pos_tag(sent)]

In [101]:
# Tokenise the whole corpus: `dat` is a list of token lists, one per
# non-empty line of the input text.
dat=get_token_sent(data)

In [102]:
# POS-tag every tokenised line and keep the tag sequence ("template") of each
# line that contains at least two common nouns (NN and/or NNS combined).
# Iterating over `dat` directly replaces an index loop that was terminated by
# a bare `except`, which would also have hidden unrelated errors.
dataset = []
for r in dat:
    tag = get_pos_tag(r)
    if tag.count('NN') + tag.count('NNS') > 1:
        dataset.append(tag)

In [103]:
# How many singular nouns (tag NN) do we have in line 2 (index 1)?
get_pos_tag(dat[1]).count('NN')

0

In [104]:
# Tagged on its own, "fly" comes out as a noun only, although it can also be
# a verb.
nltk.pos_tag(['fly'])

[('fly', 'NN')]

In [105]:
# Tagging a word on its own yields a single tag, so gather every tag each
# word receives across the corpus instead.
# counts[word][tag] -> number of times `word` was tagged `tag` in `dat`.
from collections import defaultdict, Counter
import itertools
tagged_sents = list(map(nltk.pos_tag, dat))
counts = defaultdict(Counter)
for word, pos in itertools.chain.from_iterable(tagged_sents):
    counts[word][pos] += 1

In [106]:
# Across the corpus, "fly" is tagged both as a noun and as a verb.
# `counts` is a defaultdict, so looking up an unseen word inserts an empty
# Counter for it.
counts['fly']

Counter({'NN': 13, 'VB': 6})

In [107]:
# "report" never occurs in the corpus (empty Counter). "kind" is only ever
# tagged as a noun here, though it can presumably also be an adjective.
print (counts['report'])
print(counts['kind'])

# TODO: add more examples from a larger tagged corpus, e.g. nltk.corpus.brown
# or "pbt" (presumably the Penn Treebank, PTB — confirm).

Counter()
Counter({'NN': 16})


In [17]:
# Disabled experiment: extend `counts` with tags from the Brown corpus.
# The code is wrapped in a string literal, so it is not executed when the
# cell runs as written.
# NOTE(review): if enabled as-is, it would tag the whole corpus as one long
# token sequence and rebind `tagged_sents` — confirm that is intended.
"""from nltk.corpus import brown
print (brown.words())
tagged_sents = nltk.pos_tag(brown.words())
for word, pos in tagged_sents:
     counts[word][pos] += 1"""

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [108]:
# Build the two lookup tables from the per-word tag counts:
#   tag2words[tag]  -> lower-cased words that received `tag`
#   word2tags[word] -> tags seen for the lower-cased `word`
tag2words = defaultdict(list)
word2tags = defaultdict(list)
for word, tag in counts.items():
    lowered = word.lower()
    # Only the 8 most frequent tags of each surface form are considered.
    for pos_t, _ in tag.most_common(8):
        # Differently-cased forms ("Fly" / "fly") fold to the same key, so
        # skip pairs already recorded to avoid duplicate entries. Both tables
        # are always updated together, so checking one of them is enough.
        if pos_t not in word2tags[lowered]:
            word2tags[lowered].append(pos_t)
            tag2words[pos_t].append(lowered)

In [109]:
# First ten words filed under the plural-noun tag (NNS). Entries reflect the
# tagger's output, so mis-tagged words can show up here.
tag2words['NNS'][:10]

['wishes',
 'trees',
 'know',
 'knew',
 'walls',
 'raspberries',
 'fences',
 'vines',
 'woods',
 'chops']

In [110]:
# Tags recorded for "fly". `word2tags` is a defaultdict, so an unknown word
# returns (and stores) an empty list.
word2tags["fly"]

['NN', 'VB']

In [92]:
# First kept template: the POS-tag sequence of the first line that has at
# least two NN/NNS tags.
dataset[0]

['CD', 'IN', 'PRP$', 'NNS', 'VBZ', 'IN', 'DT', 'JJ', 'NNS', ',']

In [115]:
# Group the templates by the kind of noun pair they offer. Each entry is
# (template, [index_a, index_b]) where the indices point at two noun slots:
#   'NN-NN'   -> two randomly chosen NN positions (more than one NN present)
#   'NN-NNS'  -> the only NN and the only NNS position, in that order
#   'NNS-NNS' -> two randomly chosen NNS positions (more than one NNS present)
NN2postag = defaultdict(list)
for sent in dataset:
    singular = [pos for pos, label in enumerate(sent) if label == 'NN']
    plural = [pos for pos, label in enumerate(sent) if label == 'NNS']
    if len(singular) > 1:
        NN2postag['NN-NN'].append((sent, random.sample(singular, 2)))
    if len(singular) == 1 and len(plural) == 1:
        NN2postag['NN-NNS'].append((sent, [singular[0], plural[0]]))
    if len(plural) > 1:
        NN2postag['NNS-NNS'].append((sent, random.sample(plural, 2)))
    

In [96]:
# Print NLTK's reference listing of the Penn Treebank tags with example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [123]:
# Persist the three lookup structures together as one pickled list, in the
# order [NN2postag, tag2words, word2tags]. Pickle protocol 2 is requested
# explicitly (presumably so the file can also be loaded from Python 2 — confirm).
payload = [NN2postag, tag2words, word2tags]
with open('postag_dict.p', 'wb') as out_file:
    pickle.dump(payload, out_file, protocol=2)

In [117]:
# Number of templates that offer a pair of singular-noun (NN) slots.
len(NN2postag['NN-NN'])

2750

In [120]:
# Example entry: (tag template, [index of the NN, index of the NNS]).
NN2postag['NN-NNS'][0]

(['CC', 'VBD', 'DT', 'NN', 'CC', 'DT', 'JJ', 'NNS', ','], [3, 7])

In [122]:
# Number of templates with exactly one NN and exactly one NNS.
len(NN2postag['NN-NNS'])

714

In [121]:
# Total number of templates kept (lines with at least two NN/NNS tags).
len(dataset)

3721