In [3]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg, PlaintextCorpusReader
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.lm import  MLE, Vocabulary
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import json

In [4]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text) #divide text into sentences
tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sents] #tokenize each sentence into words

In [5]:
padded_sent=list(pad_both_ends(tokenized_text[0], n=2))  #pad_both_ends adds padding to the beginning and end of the sentence
bigrams=list(ngrams(padded_sent, n=2)) #ngrams creates bi-grams from the padded sentence
padded_sent=list(pad_both_ends(tokenized_text[0], n=3)) #pad_both_ends adds padding to the beginning and end of the sentence. Qui mettiamo n=3 perch√® ci servono due parole precedenti
trigrams=list(ngrams(padded_sent, n=3)) #ngrams creates tri-grams from the padded sentence
print("Padded sentence n=3: ", padded_sent) #print padded sentence
print("Bigrams: ", bigrams) #print bi-grams
print("Trigrams: ", trigrams) #print tri-grams

Padded sentence n=3:  ['<s>', '<s>', '[', 'the', 'man', 'who', 'was', 'thursday', 'by', 'g.', 'k.', 'chesterton', '1908', ']', 'to', 'edmund', 'clerihew', 'bentley', 'a', 'cloud', 'was', 'on', 'the', 'mind', 'of', 'men', ',', 'and', 'wailing', 'went', 'the', 'weather', ',', 'yea', ',', 'a', 'sick', 'cloud', 'upon', 'the', 'soul', 'when', 'we', 'were', 'boys', 'together', '.', '</s>', '</s>']
Bigrams:  [('<s>', '['), ('[', 'the'), ('the', 'man'), ('man', 'who'), ('who', 'was'), ('was', 'thursday'), ('thursday', 'by'), ('by', 'g.'), ('g.', 'k.'), ('k.', 'chesterton'), ('chesterton', '1908'), ('1908', ']'), (']', 'to'), ('to', 'edmund'), ('edmund', 'clerihew'), ('clerihew', 'bentley'), ('bentley', 'a'), ('a', 'cloud'), ('cloud', 'was'), ('was', 'on'), ('on', 'the'), ('the', 'mind'), ('mind', 'of'), ('of', 'men'), ('men', ','), (',', 'and'), ('and', 'wailing'), ('wailing', 'went'), ('went', 'the'), ('the', 'weather'), ('weather', ','), (',', 'yea'), ('yea', ','), (',', 'a'), ('a', 'sick'),

In [6]:
n = 2
training_ngrams, padded_sents = padded_everygram_pipeline(n,tokenized_text)  #padded_everygram_pipeline creates a training set of n-grams and a list of padded sentences
print("Training n-grams: ", training_ngrams) #print training n-grams
print("Padded sentences: ", padded_sents) #print padded sentences


Training n-grams:  <generator object padded_everygram_pipeline.<locals>.<genexpr> at 0x0000022F305AF340>
Padded sentences:  <itertools.chain object at 0x0000022F6175F640>


In [7]:
# we want a 3-gram model
n = 3
training_ngrams, padded_sents = padded_everygram_pipeline(n,
tokenized_text)
model = MLE(n)
model.fit(training_ngrams, padded_sents)

In [8]:
text1= nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
text2= nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
text3=nltk.corpus.gutenberg.raw('shakespeare-macbeth.txt')

sents1 = nltk.sent_tokenize(text1)
sents2 = nltk.sent_tokenize(text2)
sents3 = nltk.sent_tokenize(text3)
print ('first two sententes of Caesar\n')
print  (sents1[0],sents1[0])
print ('first two sententes of Hamlet\n')
print  (sents2[0],sents2[0])
print ('first two sententes of Macbeth\n')
print  (sents3[0],sents3[0])
sents=sents1+sents2+sents3
print  ('n. sententes in Caesar: ',len(sents1),' sentences\n')
print  ('n. sententes in Hamlet: ',len(sents2),' sentences\n')
print  ('n. sententes in Macbeth: ',len(sents3),' sentences\n')
print  ('Total n. of sentences in our Shakespere corpus: ',len(sents),' sentences\n')
#sents.append(nltk.sent_tokenize(text2))
#sents.append(nltk.sent_tokenize(text3))

tokenized_text = [list(map(str.lower, word_tokenize(sent))) 
                  for sent in sents]



# we want a 3-gram model
n = 3
training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(training_ngrams, padded_sents)
print(model.vocab)
print('*** Checking for some words in the vocabulary... ***\n')
print (model.vocab.lookup('brute'))
print (model.vocab.lookup('lord'))
print (model.vocab.lookup('thane'))
print (model.vocab.lookup(['aliens', 'from', 'Mars']))
print (model.counts['brute'])
print (model.counts['lord'])
print (model.counts['thane'])
print('*** Now using the model... ***\n')
print('*** scores ***\n')
print ('brute', model.score('brute'))
print ('lord', model.score('lord'))
print ('thane', model.score('thane'))
print ('aliens', model.score('aliens'))
# P('is'|'lord')
print (model.score('is', 'lord'.split()))
print (model.score('of', 'the tragedie'.split()))
# P('question'|'is the')
print (model.score('question', 'is the'.split()))
print('*** perplexity ***\n')
test1 = [('and', 'make', 'your')]
test2 = [('into', 'the'), ('sea', 'and')]
test3 = [('inter', 'will', 'win'), ('the', 'champions', 'league')]
test4 = [('into', 'the', 'sky')]
test5 = [('into', 'the', 'sea')]
print (test1, model.perplexity(test1))
print (test2, model.perplexity(test2))
print (test3, model.perplexity(test3))
print (test4, model.perplexity(test4))
print (test5, model.perplexity(test5))
print(model.generate(10))
print(model.generate(10,random_seed=3))
print(model.generate(10,random_seed=5))
print(model.generate(10,random_seed=3))
print(model.generate(10,random_seed=5))


first two sententes of Caesar

[The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. [The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus.
first two sententes of Hamlet

[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus.
first two sententes of Macbeth

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus.
n. sententes in Caesar:  1592  sentences

n. sententes in Hamlet:  2355  sentences

n. sententes in Macbeth:  1465  sentences

Total n. of sentences in our Shakespere corpus:  5412  sentences

<Vocabulary with cutoff=1 unk_label='<UNK>' and 7806 items>
*** Checking for some words in the vocabulary... ***

brute
lord
thane
('<UNK>', 'from', '<UNK>')
1
293
25
*** Now using the model... ***

*** scores ***

brute 9.46880030300161e-06
lord 0.0027743584887794715
thane 0.000236

In [9]:
text1= nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
text2= nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
text3=nltk.corpus.gutenberg.raw('shakespeare-macbeth.txt')

sents1 = nltk.sent_tokenize(text1)
sents2 = nltk.sent_tokenize(text2)
sents3 = nltk.sent_tokenize(text3)
sents=sents1+sents2+sents3
tokenized_text = [list(map(str.lower, word_tokenize(sent))) 
                  for sent in sents]



# we want a 3-gram model
n = 3
training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(training_ngrams, padded_sents)
print(model.generate(20, random_seed=0))
print(model.generate(20, random_seed=1))

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed):
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)


print (generate_sent(model, 20, random_seed=0))
print (generate_sent(model, 20, random_seed=1))
print (generate_sent(model, 20, random_seed=2))
print (generate_sent(model, 20, random_seed=3))
print (generate_sent(model, 20, random_seed=4))
print (generate_sent(model, 20, random_seed=5))




['that', 'they', 'follow', ',', 'did', 'loose', 'his', 'lustre', ':', 'i', 'will', 'not', 'come', 'to', 'my', 'father', 'much', 'offended', 'qu', '.']
[':', 'tis', 'true', ',', 'octa', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
that they follow, did loose his lustre: i will not come to my father much offended qu.
: tis true, octa.
where we are, my lord bru.


intended towards him?


In [13]:
detokenize = TreebankWordDetokenizer().detokenize
def generate_sent(model, num_words, random_seed):
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)  


df = pd.read_csv('../Donald-Tweets!.csv')
df.head()
trump_corpus = list(df['Tweet_Text'].apply(word_tokenize))
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)
trump_model = MLE(n)
trump_model.fit(train_data, padded_sents)
print(generate_sent(trump_model, num_words=20, random_seed=0))
print(generate_sent(trump_model, num_words=20, random_seed=1))
print(generate_sent(trump_model, num_words=20, random_seed=2))




picks it up! Democrats numbers are down big in new Quinnipiac poll just released . Wow . Unbelievable crowd
17 other people!
via my Facebook page in St. Joseph, Michigan . Streaming live - join us today because of my constant


In [None]:
def read_Corpus(corpus_root):
    tweetsfiles = PlaintextCorpusReader(corpus_root, '.*')
    tweetlist=[]
    for fileid in tweetsfiles.fileids():
        with open(corpus_root+fileid) as user_file:
            parsed_json = json.load(user_file)
            tweets=parsed_json.get('data')
            for tweet in tweets:
                tweetlist.append(tweet.get('text'))
                #print (tweet.get('text'))
    return tweetlist


corpus_folder='../tweets_itapol/'
tweets = read_Corpus(corpus_folder)
print ('Some tweets:\n')
print (tweets[0])
print (tweets[1])
print (tweets[2])

tokenized_text = [list(map(str.lower, word_tokenize(tweet))) 
                  for tweet in tweets]

# we want a 3-gram model
n = 3
training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(training_ngrams, padded_sents)

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed):
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

print ('\nGenerating some tweets with 3-grams...\n')
print ('\n ***', generate_sent(model, 20, random_seed=10))
print ('\n ***',generate_sent(model, 20, random_seed=11))
print ('\n ***',generate_sent(model, 20, random_seed=12))
print ('\n ***',generate_sent(model, 20, random_seed=13))
print ('\n ***',generate_sent(model, 20, random_seed=14))
print ('\n ***',generate_sent(model, 20, random_seed=15))

n = 4
training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(training_ngrams, padded_sents)

print ('\nGenerating some tweets with 4-grams...\n')
print ('\n ***', generate_sent(model, 20, random_seed=10))
print ('\n ***',generate_sent(model, 20, random_seed=11))
print ('\n ***',generate_sent(model, 20, random_seed=12))
print ('\n ***',generate_sent(model, 20, random_seed=13))
print ('\n ***',generate_sent(model, 20, random_seed=14))
print ('\n ***',generate_sent(model, 20, random_seed=15))

FileNotFoundError: [Errno 2] No such file or directory: '../tweets_itapoltweets_1662817574_0'