In [10]:
import re
import os
import string
from nltk import sent_tokenize, word_tokenize, download
from IPython.display import clear_output
from collections import Counter
import gc
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
"""
# Save objects
with open('object_name', 'wb') as f:
    pickle.dump(object_variable, f)

# Load objects
with open('object_name', 'rb') as f:
    object_variable = pickle.load(f)
"""
from pprint import pprint
print("Imports Completed")

Imports Completed


In [36]:
download('punkt')

[nltk_data] Downloading package punkt to /home/datum/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def clean_text(text):
    """ 
    1. Remove html like text from europarl e.g. <Chapter 1>
    2. Remove line breaks
    3. Reduce all whitespaces to 1
    4. turn everything to lower case
    """
    clean = re.compile('<.*?>')
    
    out = text.replace('\n', ' ') # Remove line breaks
    out = re.sub(clean, ' ', out) # Remove tagged text e.g. <Chapter 1>
    out = re.sub(' +', ' ', out) # Reduce whitespace down to one
    
    out = out.lower() # Turn everything to lower case
    
    return out

In [39]:
abs_path = os.getcwd()
path = abs_path + '/en/'
text = ''
corpus_clean = []
sentences = []
corpus_original = ''
total = len(os.listdir(path)) # Total files
count = 0

for file in os.listdir(path):
    f = open(path + file, 'r', encoding="utf-8")
    file_text = f.read()
    corpus_original = corpus_original + file_text
    f.close()

    regex = re.compile('[%s]' % re.escape(string.punctuation))
    file_sentences = [regex.sub('', sent).strip() for sent in sent_tokenize(clean_text(file_text))]

    corpus_clean = corpus_clean + file_sentences
    count += 1
    
    clear_output(wait = True)
    print('File ' + file + ' finished. Completed ' + str(round(count*100/total,2)) + '%')

File ep-99-04-14.txt finished. Completed 100.0%


In [None]:
"""
demo corpus file:
------------------------------------------------------------------
original text: <class 'str'>
<CHAPTER ID="005">
Voting time
<SPEAKER ID="104" NAME="President">
The next item is the vote.
<P>
(For the results and other details on the vote: see Minutes)
------------------------------------------------------------------
Its clean_text(text): <class 'str'>
 voting time the next item is the vote. (for the results and other details on the vote: see minutes) 
------------------------------------------------------------------
Its sent_tokenize(clean_text(text)): <class 'list'>
[' voting time the next item is the vote.', '(for the results and other details on the vote: see minutes)']
------------------------------------------------------------------
Its sentences = [sent.strip() for sent in sent_tokenize(clean_text(text))]: <class 'list'>
['voting time the next item is the vote.', '(for the results and other details on the vote: see minutes)']
------------------------------------------------------------------
"""

In [40]:
# Save the basic objects:
with open('corpus_original', 'wb') as f:
    pickle.dump(corpus_original, f)
with open('corpus_clean', 'wb') as f:
    pickle.dump(corpus_clean, f)

In [41]:
regex = re.compile('[%s]' % re.escape(string.punctuation))
corpus_clean_string = regex.sub('', clean_text(corpus_original))

print('-------------------------')
print('corpus_clean_string created.')
print('-------------------------')

with open('corpus_clean_string', 'wb') as f:
    pickle.dump(corpus_clean_string, f)

del corpus_original

-------------------------
corpus_clean_string created.
-------------------------


In [42]:
gc.collect()

317

In [44]:
AllWords = word_tokenize(corpus_clean_string)
print('-------------------------')
print('Words Tokenized.')
print('-------------------------')

vocabulary = set(AllWords)
print('Vocabulary Created.')
print('-------------------------')

WordCounts = Counter(AllWords)
print('WordCounts Calculated.')
print('-------------------------')

with open('AllWords', 'wb') as f:
    pickle.dump(AllWords, f) 

with open('vocabulary', 'wb') as f:
    pickle.dump(vocabulary, f) 
    
with open('WordCounts', 'wb') as f:
    pickle.dump(WordCounts, f) 

del corpus_clean_string, AllWords

-------------------------
Words Tokenized.
-------------------------
Vocabulary Created.
-------------------------
WordCounts Calculated.
-------------------------


In [46]:
gc.collect()

0

#### Early starting point: Instead of all the above..

In [130]:
corpus_clean = None
WordCounts = None
vocabulary = None

# Load objects

with open('corpus_clean', 'rb') as f:
    corpus_clean = pickle.load(f)

with open('vocabulary', 'rb') as f:
    vocabulary = pickle.load(f)
    
with open('WordCounts', 'rb') as f:
    WordCounts = pickle.load(f)
    
with open('AllWords', 'rb') as f:
    AllWords = pickle.load(f)

In [131]:
# Ignore low frequency words
valid_vocabulary = [k for k,v in WordCounts.items() if v > 10]
invalid_vocabulary = [k for k,v in WordCounts.items() if v <= 10]
print("valid voc", len(valid_vocabulary))
print("invalid voc", len(invalid_vocabulary))

with open('valid_vocabulary', 'wb') as f:
    pickle.dump(valid_vocabulary, f) 
    
with open('invalid_vocabulary', 'wb') as f:
    pickle.dump(invalid_vocabulary, f) 

valid voc 32678
invalid voc 109653


In [132]:
# Replace OOV words in sentences

def split_sentence(sentence):
    PATTERN = '\w+|\(|\)|\.|\,'
    tokenizer = RegexpTokenizer(pattern=PATTERN) 
    return tokenizer.tokenize(sentence)
dummy_count = 0
total = len(corpus_clean)
for i in range(0,len(corpus_clean)):
    sentence = ''.join(corpus_clean[i]) # make it string
    splitted_sent = split_sentence(sentence)    
    new_sent = []
    for word in splitted_sent:
        new_word = 'UNK' if word not in valid_vocabulary else word
        new_sent.append(new_word)
        corpus_clean[i] = new_sent
    clear_output(wait = True)
    print('Sentences processed ' + str(i+1) + ' out of ' + str(total))
    dummy_count = dummy_count + 1
    if 1000 < dummy_count:
        break

# Have it here, in order to not to forget to save after the big computation burden.
with open('corpus_clean_no_OOV', 'wb') as f:
    pickle.dump(corpus_clean, f)

Sentences processed 1001 out of 2217535


In [None]:
# Deprecated (slower)
#
# Replace OOV words in sentences
# total = len(corpus_clean)
# for i in range(0,len(corpus_clean)):
#    for word in valid_vocabulary:
#        corpus_clean[i].replace(word, 'UNK')
#   clear_output(wait = True)
#   print('Sentences processed ' + str(i+1) + ' out of ' + str(total) )

In [137]:
# Creating the n-grams is actually not needed!! Only counting them is!!
#
# tokens = AllWords
# bigrams = [ gram for gram in ngrams(tokens, 2) ]
# trigrams = [ gram for gram in ngrams(tokens, 3) ]
# #pprint(bigrams)
# with open('bigrams', 'wb') as f:
#     pickle.dump(bigrams, f)
# with open('trigrams', 'wb') as f:
#    pickle.dump(bigrams, f)

In [153]:
# Creating the n-grams is actually not needed!! Only counting them is!!
#
# import sys, numpy as np
# print("AllWords:", np.round(sys.getsizeof(AllWords)/1024/1024/1024,2)," GBs")
# print("Bigrams:", np.round(sys.getsizeof(bigrams)/1024/1024/1024,2)," GBs")
# print("Trigrams:", np.round(sys.getsizeof(trigrams)/1024/1024/1024,2)," GBs")

#### Regular Starting Point

In [2]:
corpus_clean = None
WordCounts = None
vocabulary = None
corpus_clean_no_OOV = None

# Load objects

#with open('corpus_clean', 'rb') as f:
#    corpus_clean = pickle.load(f)

#with open('vocabulary', 'rb') as f:
#    vocabulary = pickle.load(f)
    
#with open('WordCounts', 'rb') as f:
#    WordCounts = pickle.load(f)
    
#with open('AllWords', 'rb') as f:
#    AllWords = pickle.load(f)
    
with open('corpus_clean_no_OOV', 'rb') as f:
    corpus_clean_no_OOV = pickle.load(f)

In [4]:
print(corpus_clean_no_OOV.__class__)
print(corpus_clean_no_OOV[0])
print(corpus_clean_no_OOV[0].__class__)
print(corpus_clean_no_OOV[0][0].__class__)

<class 'list'>
['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote']
<class 'list'>
<class 'str'>


In [5]:
gc.collect()

0

#### Counting n-grams

In [12]:
# Single sentence:
unigram_counter = Counter()
unigram_counter.update([gram for gram in ngrams(corpus_clean_no_OOV[0], 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(corpus_clean_no_OOV[0])
pprint(unigram_counter)

['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote']
Counter({('the',): 2,
         ('voting',): 1,
         ('time',): 1,
         ('next',): 1,
         ('item',): 1,
         ('is',): 1,
         ('vote',): 1})


In [13]:
unigram_counter = Counter()

for sent in corpus_clean_no_OOV:   
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(unigram_counter)

Counter({(' ',): 52946256,
         ('e',): 33777881,
         ('t',): 26587881,
         ('o',): 21527226,
         ('i',): 21506507,
         ('a',): 20612020,
         ('n',): 19922991,
         ('s',): 17243500,
         ('r',): 16797840,
         ('h',): 12350861,
         ('l',): 10311449,
         ('c',): 8999365,
         ('d',): 8582719,
         ('u',): 7962412,
         ('m',): 7568070,
         ('p',): 6582836,
         ('f',): 5723232,
         ('g',): 4587912,
         ('w',): 4270584,
         ('y',): 3872688,
         ('b',): 3728599,
         ('v',): 2886483,
         ('k',): 1351319,
         ('x',): 470542,
         ('0',): 340337,
         ('j',): 333662,
         ('q',): 279629,
         ('2',): 198022,
         ('1',): 190021,
         ('z',): 143583,
         ('9',): 116594,
         ('5',): 82137,
         ('3',): 70323,
         ('4',): 62072,
         ('7',): 52231,
         ('6',): 51093,
         ('8',): 50665,
         ('’',): 43298,
         ('–',): 37207,

         ('suppressed',): 1,
         ('sufficiently',): 1,
         ('extract',): 1,
         ('employs',): 1,
         ('400',): 1,
         ('rid',): 1,
         ('hates',): 1,
         ('vacuums',): 1,
         ('continent',): 1,
         ('brief',): 1,
         ('interested',): 1,
         ('cynicism',): 1,
         ('ukraine',): 1,
         ('domain',): 1,
         ('issued',): 1,
         ('unreserved',): 1,
         ('win',): 1,
         ('biological',): 1,
         ('phenomenon',): 1,
         ('slowed',): 1,
         ('currency',): 1,
         ('banning',): 1,
         ('fantasy',): 1,
         ('rich',): 1,
         ('pipe',): 1,
         ('dream',): 1,
         ('mans',): 1,
         ('lenient',): 1,
         ('prefer',): 1,
         ('speeches',): 1,
         ('finishing',): 1,
         ('tempting',): 1,
         ('extension',): 1,
         ('thanking',): 1,
         ('persuade',): 1,
         ('tabling',): 1,
         ('spirit',): 1,
         ('logical',): 1,
         ('c

In [14]:
bigram_counter = Counter()

for sent in corpus_clean_no_OOV:
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(bigram_counter)

Counter({('e', ' '): 11223123,
         (' ', 't'): 9330653,
         ('t', 'h'): 8004076,
         ('s', ' '): 6677124,
         (' ', 'a'): 6175428,
         ('h', 'e'): 5795503,
         ('t', ' '): 5578523,
         ('n', ' '): 5326519,
         ('i', 'n'): 4930933,
         ('o', 'n'): 4476913,
         ('r', 'e'): 4444196,
         (' ', 'i'): 4398256,
         ('d', ' '): 4184667,
         ('a', 'n'): 4014833,
         (' ', 'o'): 3784388,
         ('a', 't'): 3526038,
         ('e', 'r'): 3484424,
         ('e', 'n'): 3422394,
         ('t', 'i'): 3276372,
         ('i', 's'): 3055452,
         (' ', 'c'): 3036269,
         ('e', 's'): 2991913,
         ('y', ' '): 2948593,
         (' ', 'w'): 2944595,
         (' ', 's'): 2851128,
         ('r', ' '): 2809384,
         ('n', 't'): 2719524,
         ('o', 'r'): 2693077,
         ('o', ' '): 2683901,
         (' ', 'p'): 2624085,
         ('i', 't'): 2602072,
         ('n', 'd'): 2549329,
         ('i', 'o'): 2450433,
         

         ('t', '3'): 16,
         ('v', 'w'): 16,
         ('8', 'r'): 16,
         ('<s>', 'τ'): 16,
         ('\xad', 'b'): 16,
         ('i', '3'): 16,
         ('ž', 'a'): 16,
         ('ï', '<e>'): 16,
         ('â', '4'): 16,
         ('<s>', 'for'): 15,
         ('have', 'been'): 15,
         ('<s>', 'as'): 15,
         ('in', 'particular'): 15,
         ('say', 'that'): 15,
         ('not', 'to'): 15,
         ('a', 'very'): 15,
         ('the', 'environment'): 15,
         ('<s>', 'so'): 15,
         ('we', 'need'): 15,
         ('going', 'to'): 15,
         ('such', 'as'): 15,
         ('z', 'ü'): 15,
         ('ś', 'w'): 15,
         ('9', 'y'): 15,
         ('ū', 'n'): 15,
         ('g', '4'): 15,
         ('š', 'o'): 15,
         ('r', 'ô'): 15,
         ('€', '1'): 15,
         ('o', '”'): 15,
         ('h', 'è'): 15,
         ('c', '3'): 15,
         ('j', 'z'): 15,
         ('k', 'ü'): 15,
         ('2', 'f'): 15,
         ('–', 'c'): 15,
         ('–', 'a'): 15,
      

         ('for', 'europe'): 3,
         ('of', 'an'): 3,
         ('this', 'time'): 3,
         ('for', 'them'): 3,
         ('say', 'in'): 3,
         ('would', 'save'): 3,
         ('a', 'bit'): 3,
         ('for', 'me'): 3,
         ('the', 'previous'): 3,
         ('energy', 'committee'): 3,
         ('environment', 'committees'): 3,
         ('however', 'i'): 3,
         ('in', 'plenary'): 3,
         ('are', 'in'): 3,
         ('can', 'have'): 3,
         ('to', 'accept'): 3,
         ('the', 'choice'): 3,
         ('matter', 'for'): 3,
         ('individual', 'countries'): 3,
         ('and', 'safety'): 3,
         ('the', 'environmental'): 3,
         ('including', 'the'): 3,
         ('storage', 'of'): 3,
         ('maastricht', 'treaty'): 3,
         ('soulier', 'is'): 3,
         ('union', 'to'): 3,
         ('of', 'safety'): 3,
         ('safety', 'in'): 3,
         ('improve', 'the'): 3,
         ('and', 'not'): 3,
         ('ones', '<e>'): 3,
         ('to', 'invest'): 3,

         ('UNK', 'are'): 2,
         ('governments', 'of'): 2,
         ('like', 'other'): 2,
         ('certain', 'amount'): 2,
         ('amount', 'of'): 2,
         ('am', 'afraid'): 2,
         ('afraid', 'that'): 2,
         ('as', 'with'): 2,
         ('them', 'but'): 2,
         ('a', 'serious'): 2,
         ('we', 'always'): 2,
         ('which', 'support'): 2,
         ('just', 'think'): 2,
         ('think', 'if'): 2,
         ('were', 'to'): 2,
         ('fusion', 'power'): 2,
         ('same', 'status'): 2,
         ('waste', 'time'): 2,
         ('possible', 'for'): 2,
         ('longer', '<e>'): 2,
         ('not', 'true'): 2,
         ('president', 'you'): 2,
         ('vote', 'at'): 2,
         ('people', 'who'): 2,
         ('up', 'in'): 2,
         ('socialist', 'group'): 2,
         ('has', 'felt'): 2,
         ('as', 'rapporteur'): 2,
         ('be', 'voting'): 2,
         ('green', 'group'): 2,
         ('rollcall', 'votes'): 2,
         ('on', 'every'): 2,
       

         ('the', 'objectives'): 1,
         ('objectives', 'of'): 1,
         ('treaty', '<e>'): 1,
         ('the', 'conclusion'): 1,
         ('conclusion', 'is'): 1,
         ('that', 'every'): 1,
         ('every', 'effort'): 1,
         ('effort', 'should'): 1,
         ('environment', 'and'): 1,
         ('existing', 'plants'): 1,
         ('plants', 'and'): 1,
         ('if', 'individual'): 1,
         ('countries', 'wish'): 1,
         ('is', 'their'): 1,
         ('their', 'affair'): 1,
         ('affair', '<e>'): 1,
         ('the', 'task'): 1,
         ('task', 'of'): 1,
         ('eus', 'efforts'): 1,
         ('concentrated', 'on'): 1,
         ('environmental', 'aspect'): 1,
         ('energy', 'issue'): 1,
         ('issue', 'in'): 1,
         ('words', 'energy'): 1,
         ('energy', 'savings'): 1,
         ('savings', 'efficiency'): 1,
         ('efficiency', 'renewable'): 1,
         ('president', 'thank'): 1,
         ('thank', 'you'): 1,
         ('you', 'mr'): 1,

         ('contained', 'in'): 1,
         ('shall', 'mention'): 1,
         ('mention', 'just'): 1,
         ('just', 'two'): 1,
         ('two', '<e>'): 1,
         ('in', 'recital'): 1,
         ('recital', 'f'): 1,
         ('f', 'it'): 1,
         ('that', 'whereas'): 1,
         ('environment', 'UNK'): 1,
         ('be', 'dissociated'): 1,
         ('dissociated', 'from'): 1,
         ('sector', 'first'): 1,
         ('first', 'as'): 1,
         ('regards', 'the'): 1,
         ('emissions', 'to'): 1,
         ('it', 'contributes'): 1,
         ('contributes', '<e>'): 1,
         ('simply', 'untrue'): 1,
         ('<s>', 'several'): 1,
         ('several', 'studies'): 1,
         ('studies', 'dating'): 1,
         ('dating', 'back'): 1,
         ('the', 'early'): 1,
         ('early', '1990s'): 1,
         ('1990s', 'clearly'): 1,
         ('clearly', 'show'): 1,
         ('show', 'that'): 1,
         ('that', 'co2'): 1,
         ('emissions', 'caused'): 1,
         ('caused', 'by'

         ('present', 'there'): 1,
         ('there', 'exists'): 1,
         ('exists', 'no'): 1,
         ('which', 'an'): 1,
         ('a', 'maximum'): 1,
         ('maximum', 'credible'): 1,
         ('credible', 'nuclear'): 1,
         ('nuclear', 'accident'): 1,
         ('accident', 'can'): 1,
         ('be', 'excluded'): 1,
         ('excluded', '<e>'): 1,
         ('so', 'every'): 1,
         ('every', 'nuclear'): 1,
         ('europe', 'concerns'): 1,
         ('will', 'produce'): 1,
         ('produce', 'highly'): 1,
         ('highly', 'dangerous'): 1,
         ('dangerous', 'radiation'): 1,
         ('radiation', 'for'): 1,
         ('years', 'has'): 1,
         ('resolved', 'at'): 1,
         ('yet', 'we'): 1,
         ('we', 'happily'): 1,
         ('happily', 'go'): 1,
         ('on', 'producing'): 1,
         ('producing', '<e>'): 1,
         ('a', 'curious'): 1,
         ('curious', 'inhumane'): 1,
         ('inhumane', 'immoral'): 1,
         ('immoral', 'generation'):

         ('4th', 'century'): 1,
         ('century', 'bc'): 1,
         ('bc', 'met'): 1,
         ('met', 'a'): 1,
         ('a', 'philosopher'): 1,
         ('philosopher', 'who'): 1,
         ('also', 'greek'): 1,
         ('greek', 'and'): 1,
         ('and', 'asked'): 1,
         ('asked', 'him'): 1,
         ('him', 'not'): 1,
         ('to', 'deprive'): 1,
         ('deprive', 'him'): 1,
         ('him', 'of'): 1,
         ('now', 'we'): 1,
         ('we', 'too'): 1,
         ('too', 'todays'): 1,
         ('todays', 'mankind'): 1,
         ('mankind', 'in'): 1,
         ('20th', 'century'): 1,
         ('century', 'and'): 1,
         ('the', 'dawn'): 1,
         ('dawn', 'on'): 1,
         ('century', 'we'): 1,
         ('also', 'ask'): 1,
         ('ask', 'not'): 1,
         ('be', 'deprived'): 1,
         ('deprived', 'of'): 1,
         ('UNK', 'us'): 1,
         ('up', 'but'): 1,
         ('UNK', 'up'): 1,
         ('up', 'thermal'): 1,
         ('thermal', 'UNK'): 1,
      

         ('save', 'us'): 1,
         ('us', 'work'): 1,
         ('work', 'here'): 1,
         ('other', 'respects'): 1,
         ('respects', 'i'): 1,
         ('support', 'your'): 1,
         ('very', 'clear'): 1,
         ('would', 'suggest'): 1,
         ('for', 'referral'): 1,
         ('committee', 'is'): 1,
         ('is', 'rejected'): 1,
         ('rejected', 'i'): 1,
         ('shall', 'assume'): 1,
         ('assume', 'that'): 1,
         ('has', 'endorsed'): 1,
         ('endorsed', 'my'): 1,
         ('for', 'how'): 1,
         ('we', 'handle'): 1,
         ('handle', 'the'): 1,
         ('i', 'rise'): 1,
         ('rise', 'to'): 1,
         ('speak', 'in'): 1,
         ('block', 'voting'): 1,
         ('change', 'to'): 1,
         ('rules', 'introduced'): 1,
         ('introduced', 'some'): 1,
         ('ago', 'specifically'): 1,
         ('specifically', 'to'): 1,
         ('this', 'sort'): 1,
         ('of', 'situation'): 1,
         ('situation', '<e>'): 1,
         ('i

         ('its', 'more'): 1,
         ('more', 'widespread'): 1,
         ('widespread', 'generalization'): 1,
         ('generalization', 'was'): 1,
         ('was', 'proposed'): 1,
         ('proposed', 'in'): 1,
         ('in', '1997'): 1,
         ('1997', 'emphasis'): 1,
         ('emphasis', 'should'): 1,
         ('should', 'instead'): 1,
         ('instead', 'have'): 1,
         ('been', 'placed'): 1,
         ('placed', 'on'): 1,
         ('on', 'intraeuropean'): 1,
         ('intraeuropean', 'rationalization'): 1,
         ('rationalization', 'instead'): 1,
         ('of', 'concentrating'): 1,
         ('concentrating', 'so'): 1,
         ('much', 'on'): 1,
         ('on', 'questions'): 1,
         ('questions', 'of'): 1,
         ('of', 'export'): 1,
         ('export', 'and'): 1,
         ('with', 'external'): 1,
         ('external', 'partners'): 1,
         ('partners', '<e>'): 1,
         ('case', 'this'): 1,
         ('this', 'rationalization'): 1,
         ('rationaliz

         ('ø', 'v'): 1,
         ('g', 'ο'): 1,
         ('š', 'č'): 1,
         ('m', 'å'): 1,
         ('i', '0'): 1,
         ('o', '…'): 1,
         ('e', 'ú'): 1,
         ('‘', '7'): 1,
         ('j', 'š'): 1,
         ('r', '…'): 1,
         ('’', '”'): 1,
         ('°', 'f'): 1,
         ('ě', '<e>'): 1,
         ('a', 'ğ'): 1,
         ('ğ', 'ı'): 1,
         ('ı', 'ş'): 1,
         ('b', 'å'): 1,
         ('β', 's'): 1,
         ('a', 'ǔ'): 1,
         ('ǔ', ' '): 1,
         ('1', '¼'): 1,
         ('¼', ' '): 1,
         ('ă', 'a'): 1,
         ('5', 'x'): 1,
         ('¡', 'v'): 1,
         ('ß', 'e'): 1,
         ('6', 'ι'): 1,
         ('å', 'õ'): 1,
         ('õ', '÷'): 1,
         ('÷', 'á'): 1,
         ('é', 'ó'): 1,
         ('ó', 'ô'): 1,
         ('ô', 'þ'): 1,
         ('þ', ' '): 1,
         ('ś', ' '): 1,
         ('ź', 'k'): 1,
         ('t', 'ł'): 1,
         ('u', '…'): 1,
         ('ο', 'ε'): 1,
         ('é', '1'): 1,
         ('\xa0', 'κ'): 1,
         ('

In [15]:
trigram_counter = Counter()
for sent in corpus_clean_no_OOV:
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(trigram_counter)

Counter({(' ', 't', 'h'): 6404023,
         ('t', 'h', 'e'): 5233083,
         ('h', 'e', ' '): 4286478,
         ('i', 'o', 'n'): 2272538,
         ('o', 'n', ' '): 2088896,
         (' ', 'o', 'f'): 2064051,
         (' ', 't', 'o'): 2055812,
         (' ', 'i', 'n'): 2013701,
         ('o', 'f', ' '): 2013283,
         ('t', 'o', ' '): 1944162,
         (' ', 'a', 'n'): 1925573,
         (' ', 'c', 'o'): 1871495,
         ('n', 'd', ' '): 1854576,
         ('a', 'n', 'd'): 1774673,
         ('i', 's', ' '): 1616892,
         ('e', 'n', 't'): 1610778,
         ('t', 'i', 'o'): 1531212,
         ('i', 'n', 'g'): 1528626,
         ('i', 'n', ' '): 1501874,
         ('n', 'g', ' '): 1435304,
         ('e', 'd', ' '): 1409658,
         ('e', ' ', 't'): 1398688,
         ('a', 't', ' '): 1378510,
         (' ', 'r', 'e'): 1295758,
         ('e', 's', ' '): 1286140,
         ('n', ' ', 't'): 1231034,
         ('r', 'e', ' '): 1208188,
         ('e', ' ', 'a'): 1167275,
         (' ', 'p', 

         ('o', 'i', 'c'): 12277,
         ('m', ' ', 'r'): 12232,
         ('m', 'o', 'm'): 12232,
         ('o', 'b', 'i'): 12217,
         ('y', ' ', ' '): 12194,
         ('n', 'u', 't'): 12187,
         ('y', ' ', '2'): 12173,
         ('m', 'a', 'c'): 12155,
         ('n', 'h', 'a'): 12153,
         ('f', ' ', 'j'): 12145,
         (' ', 'j', 'a'): 12144,
         ('o', 's', 'p'): 12131,
         ('b', 's', 'o'): 12127,
         ('r', 'y', 'i'): 12104,
         ('h', 'a', 'm'): 12104,
         ('e', 'p', 's'): 12102,
         ('e', 'w', 's'): 12048,
         ('w', ' ', 'f'): 12044,
         ('y', 'e', 'r'): 12041,
         ('o', 's', 'o'): 12040,
         ('b', 's', 'i'): 12032,
         ('k', ' ', 'm'): 12017,
         ('p', ' ', 'b'): 12013,
         ('r', ' ', '1'): 11999,
         ('h', 'u', 'n'): 11990,
         ('l', 'o', 't'): 11984,
         ('t', 'h', '<e>'): 11979,
         ('a', 'i', 't'): 11975,
         ('s', 'e', 'p'): 11972,
         ('o', 'd', 'y'): 11960,
        

         ('<s>', 's', 'v'): 1660,
         ('x', 'i', 'n'): 1658,
         ('s', 'k', 'y'): 1654,
         (' ', '4', '5'): 1653,
         ('l', 'e', '’'): 1653,
         ('x', ' ', 's'): 1651,
         ('e', 'j', 'u'): 1648,
         ('p', ' ', 'l'): 1643,
         ('s', 'o', 'i'): 1643,
         ('s', 'd', 'i'): 1640,
         ('5', '0', '1'): 1640,
         ('a', 'r', 'v'): 1639,
         ('5', '0', '2'): 1637,
         ('<s>', 'i', 'm'): 1633,
         ('o', 'f', 'u'): 1633,
         ('e', 'b', 'e'): 1629,
         ('o', 'l', 'm'): 1628,
         ('0', 's', ' '): 1628,
         ('d', 'u', 'p'): 1626,
         ('n', 'u', 'o'): 1620,
         (' ', 'z', 'a'): 1618,
         ('j', 'a', 'c'): 1616,
         ('0', '1', '5'): 1616,
         ('t', 'v', 'i'): 1615,
         ('e', 'b', 'u'): 1615,
         ('1', ' ', ' '): 1614,
         ('w', ' ', ' '): 1614,
         ('w', 's', 'p'): 1613,
         ('u', 'a', 'c'): 1612,
         ('h', 'n', 'y'): 1609,
         ('e', 'r', 'z'): 1608,
    

         ('1', '9', '1'): 429,
         (' ', '8', '6'): 429,
         ('e', 'u', 'a'): 428,
         ('v', '<e>', '<e>'): 428,
         (' ', 'm', ' '): 428,
         ('t', 'p', 'u'): 428,
         ('t', 'z', 'i'): 428,
         ('6', ' ', 'r'): 427,
         ('u', 'd', 'u'): 427,
         ('y', 'p', 'a'): 427,
         ('s', 'o', 'y'): 427,
         ('2', '0', 't'): 426,
         ('a', 'l', 'q'): 426,
         ('h', ' ', '–'): 426,
         (' ', '9', '8'): 425,
         ('n', 't', 'd'): 425,
         ('8', '9', '8'): 425,
         ('r', 'ü', 'b'): 425,
         ('g', 'y', 'z'): 425,
         ('5', '3', '0'): 424,
         ('d', 'b', 'o'): 424,
         ('h', 'l', ' '): 424,
         ('y', ' ', 'z'): 423,
         ('i', 'c', 'p'): 423,
         ('z', 'o', ' '): 423,
         ('m', 'v', 'e'): 423,
         ('m', 'l', 'e'): 422,
         ('i', 'r', 'b'): 422,
         ('a', 'e', 'g'): 421,
         ('9', ' ', 'n'): 421,
         ('o', 'k', 'l'): 421,
         ('0', '3', '4'): 420,
    

         ('g', ' ', 'x'): 235,
         ('8', '0', '1'): 235,
         ('n', 's', 'b'): 235,
         ('d', 'i', 'x'): 235,
         ('e', 'm', 'l'): 234,
         ('m', 'i', 'o'): 234,
         ('1', '0', 't'): 234,
         ('y', 'f', 'l'): 234,
         ('q', ' ', 'h'): 234,
         ('5', 't', 'h'): 234,
         ('v', ' ', 'h'): 234,
         ('l', 'm', 'l'): 234,
         ('t', 'i', 'k'): 234,
         ('l', 'y', 'r'): 234,
         ('f', 'd', ' '): 234,
         ('y', '\xa0', '2'): 234,
         ('g', 'p', ' '): 234,
         ('\xa0', 'o', 'c'): 234,
         ('3', ' ', 'l'): 233,
         ('d', 'p', 'a'): 233,
         ('n', 'd', 'g'): 233,
         ('9', '0', '1'): 233,
         ('4', '9', '7'): 232,
         ('l', 'd', 'b'): 232,
         ('d', 'd', 'o'): 232,
         (' ', 'q', 'a'): 232,
         ('l', 'ó', 'p'): 232,
         ('ó', 'p', 'e'): 232,
         ('c', 'm', 'o'): 232,
         ('g', 'g', 'f'): 232,
         ('n', 'd', '’'): 232,
         ('1', '\xa0', 'm'): 232,

         ('n', 'y', 'l'): 93,
         ('e', 'd', '\xa0'): 93,
         (' ', 'r', 'f'): 93,
         ('h', 'k', 'i'): 93,
         (' ', 's', 'f'): 93,
         ('2', '3', '6'): 93,
         ('a', 'l', '\xa0'): 93,
         ('r', '\xa0', '7'): 93,
         ('7', ' ', '6'): 93,
         ('r', 'i', 'r'): 93,
         ('c', 'f', 'r'): 93,
         ('2', '6', '5'): 93,
         ('d', 'r', 'í'): 93,
         ('r', 'í', 'g'): 93,
         ('í', 'g', 'u'): 93,
         ('a', 't', 'w'): 92,
         ('n', 't', 'b'): 92,
         ('d', 'a', 'ń'): 92,
         ('r', 'x', 'i'): 92,
         ('y', 'a', 'c'): 92,
         ('k', 'e', 'f'): 92,
         ('a', 'd', 'g'): 92,
         ('a', 'o', 'b'): 92,
         ('n', 'a', 'ń'): 92,
         ('9', '5', '8'): 92,
         ('2', '9', '5'): 92,
         ('r', 'k', 'r'): 92,
         ('8', '4', '9'): 92,
         ('l', 'w', 'o'): 92,
         ('a', 'z', 'l'): 92,
         ('c', 'i', '<e>'): 92,
         ('2', '3', '8'): 92,
         ('y', 't', 't'): 92,

         ('‘', 'b', 'a'): 45,
         (' ', '‘', 'j'): 45,
         ('c', 'r', 'd'): 45,
         (' ', 'v', 'ă'): 45,
         ('v', 'ă', 'l'): 45,
         ('ă', 'l', 'e'): 45,
         ('‘', '’', ' '): 45,
         ('j', 'a', 'f'): 45,
         ('k', 'i', 'b'): 45,
         (' ', 'l', 'f'): 45,
         ('é', 'r', 'o'): 45,
         ('u', 'z', 'k'): 45,
         ('2', '4', '<e>'): 45,
         ('j', 'i', 'r'): 45,
         ('m', '9', '9'): 45,
         (' ', 'l', 'r'): 45,
         (' ', 'v', 'ä'): 45,
         ('5', '4', '1'): 45,
         ('l', 'á', 's'): 45,
         ('ő', 'k', 'é'): 45,
         ('l', 'n', 's'): 45,
         ('7', '3', '8'): 45,
         ('d', 'f', 's'): 45,
         ('a', 'i', 'e'): 45,
         (' ', 'k', '9'): 45,
         ('s', ' ', 'à'): 44,
         ('1', '8', 'm'): 44,
         ('m', 'e', 'u'): 44,
         ('j', ' ', 'c'): 44,
         ('<s>', 'i', 'g'): 44,
         ('3', 'y', 'e'): 44,
         ('v', 'i', '<e>'): 44,
         ('5', '3', '8'): 44,
    

         ('<s>', '<s>', '·'): 22,
         ('<s>', '·', ' '): 22,
         (' ', 'e', '4'): 22,
         ('a', 'v', 'k'): 22,
         ('g', 'i', 'p'): 22,
         ('\xa0', 'k', 'h'): 22,
         ('i', 'u', 'd'): 22,
         ('v', 'a', '\xa0'): 22,
         ('o', 's', 'z'): 22,
         (' ', '2', 'c'): 22,
         ('s', 'p', 'b'): 22,
         ('t', 'i', 'á'): 22,
         ('g', 'h', 'm'): 22,
         ('f', 'i', 'p'): 22,
         ('ô', 'ï', ' '): 22,
         ('r', 'm', 'r'): 22,
         ('m', 'c', 'v'): 22,
         ('3', '8', 'a'): 22,
         ('u', 'x', '0'): 22,
         ('j', 'ú', 'c'): 22,
         ('ú', 'c', 'a'): 22,
         ('<s>', '<s>', 'and'): 21,
         ('n', 'á', 'r'): 21,
         (' ', 'z', 'ī'): 21,
         ('o', 'p', '1'): 21,
         ('g', 'r', 'é'): 21,
         ('b', 'b', '<e>'): 21,
         ('a', 'v', 's'): 21,
         ('e', 'k', 'c'): 21,
         ('k', 'c', 'y'): 21,
         ('<s>', '4', '0'): 21,
         ('3', '0', 'y'): 21,
         ('í', 'k'

         ('m', 's', 'm'): 11,
         ('w', 'f', 'i'): 11,
         ('y', 'i', 'b'): 11,
         ('<s>', 'b', '5'): 11,
         ('l', 'a', 'ţ'): 11,
         ('ţ', 'i', ' '): 11,
         ('i', 'k', 'l'): 11,
         ('ó', ' ', 'i'): 11,
         (' ', 'z', 'l'): 11,
         ('3', '\xa0', '5'): 11,
         ('t', 't', 'f'): 11,
         ('w', 'l', '<e>'): 11,
         ('5', '1', 's'): 11,
         ('o', 'n', 'ç'): 11,
         ('à', ' ', 'p'): 11,
         ('s', 'y', 'u'): 11,
         (' ', 'é', 'c'): 11,
         ('o', '”', ' '): 11,
         ('1', 'a', 'v'): 11,
         ('r', 'x', '0'): 11,
         (' ', 'c', '3'): 11,
         ('z', 'e', 'b'): 11,
         ('q', 'a', 'd'): 11,
         ('j', ' ', 'l'): 11,
         ('ó', ' ', 'm'): 11,
         ('l', 'ò', ' '): 11,
         (' ', 'ε', 'μ'): 11,
         ('b', ' ', '1'): 11,
         ('ï', ' ', 't'): 11,
         ('<s>', 'u', 'e'): 11,
         ('f', 'e', 'f'): 11,
         (' ', 'h', 'b'): 11,
         ('p', 'y', 'c'): 11,
 

         ('5', '°', 'c'): 6,
         ('<s>', 'o', 'x'): 6,
         ('’', ' ', '1'): 6,
         ('4', '7', '\xa0'): 6,
         ('u', 'é', 'n'): 6,
         ('e', 'x', 'k'): 6,
         ('y', 'u', 'b'): 6,
         ('b', 'c', 'h'): 6,
         ('z', 'c', 'a'): 6,
         ('<s>', '6', '7'): 6,
         ('9', '9', 'r'): 6,
         ('e', 'v', '1'): 6,
         ('0', '7', 's'): 6,
         ('<s>', '4', '6'): 6,
         ('c', 'á', 'd'): 6,
         ('a', 'ń', '<e>'): 6,
         ('<s>', '5', '3'): 6,
         ('m', 'f', 'l'): 6,
         ('a', 't', 'é'): 6,
         ('n', 's', 'q'): 6,
         (' ', 'j', '5'): 6,
         ('h', 'd', 'z'): 6,
         ('0', 'g', 'k'): 6,
         ('j', 't', 'y'): 6,
         ('o', 'x', 'o'): 6,
         ('x', 'o', 'f'): 6,
         ('r', 'e', '”'): 6,
         ('ü', 'r', 'm'): 6,
         ('e', 'm', 'd'): 6,
         ('z', 'u', 'b'): 6,
         ('k', '3', '<e>'): 6,
         ('h', 'g', '<e>'): 6,
         ('p', '7', 't'): 6,
         ('7', 't', 'a'): 

         ('0', 'd', 'r'): 4,
         ('a', 'z', 'r'): 4,
         ('8', '\xa0', '\xa0'): 4,
         ('<s>', '5', '6'): 4,
         ('–', 's', 't'): 4,
         ('2', 'e', 'q'): 4,
         ('s', 'r', 'f'): 4,
         ('m', 'c', '<e>'): 4,
         ('t', 'm', 'r'): 4,
         ('1', 'd', '<e>'): 4,
         ('o', 'u', 'é'): 4,
         ('7', 'p', 'a'): 4,
         ('b', 'p', 't'): 4,
         ('i', 'k', 'w'): 4,
         ('2', '5', 'g'): 4,
         ('ü', 'l', '<e>'): 4,
         ('d', '\xa0', 'e'): 4,
         ('j', 's', 'h'): 4,
         ('g', ' ', 'α'): 4,
         ('s', 'é', '\xa0'): 4,
         ('í', 'a', '\xa0'): 4,
         ('g', '\xa0', 'i'): 4,
         ('z', '\xa0', 'm'): 4,
         ('e', 'z', 'm'): 4,
         ('4', '4', 'd'): 4,
         ('4', '3', 'd'): 4,
         ('a', 'g', '’'): 4,
         (' ', 'd', 'ǐ'): 4,
         ('d', 'ǐ', 'm'): 4,
         ('ǐ', 'm', 'a'): 4,
         ('b', '3', '5'): 4,
         ('p', 'y', 'k'): 4,
         ('6', '\xa0', 'h'): 4,
         ('

         ('æ', 's', 't'): 3,
         ('s', 'n', '’'): 3,
         ('g', 'm', 'd'): 3,
         ('r', 'y', '2'): 3,
         ('b', 'm', '<e>'): 3,
         ('–', ' ', '9'): 3,
         ('p', '\xa0', '8'): 3,
         (' ', '€', '3'): 3,
         ('r', 'l', 'c'): 3,
         ('m', 'u', 'q'): 3,
         ('0', '0', 'd'): 3,
         ('–', 'i', 't'): 3,
         ('c', '6', '\xa0'): 3,
         ('3', '7', 'a'): 3,
         ('g', '\xa0', '4'): 3,
         ('p', 'r', 'y'): 3,
         ('l', 'w', 'h'): 3,
         ('g', '8', 's'): 3,
         ('g', 'p', '4'): 3,
         ('r', 'o', '3'): 3,
         ('é', 'g', ' '): 3,
         ('i', '5', ' '): 3,
         (' ', 'β', '7'): 3,
         ('β', '7', '0'): 3,
         ('m', 't', '’'): 3,
         (' ', 'm', 'ǎ'): 3,
         ('m', 'ǎ', 'n'): 3,
         ('ǎ', 'n', 'e'): 3,
         ('o', 'n', '…'): 3,
         ('i', '3', '1'): 3,
         ('c', 'h', '–'): 3,
         ('h', '–', 't'): 3,
         ('a', 'b', '’'): 3,
         ('ć', ' ', 'e'): 3,
   

         ('b', 'h', 'l'): 2,
         ('c', 'r', 'ú'): 2,
         ('r', 'á', 'l'): 2,
         ('g', 'c', 'r'): 2,
         ('m', 'h', 'ó'): 2,
         ('ú', 'n', 't'): 2,
         ('a', 'í', 'l'): 2,
         ('e', 'á', ' '): 2,
         ('4', '0', 'º'): 2,
         ('g', 'j', 'e'): 2,
         (' ', 'ο', 'c'): 2,
         ('ο', 'c', '<e>'): 2,
         ('1', '9', 'a'): 2,
         ('ń', ' ', 'f'): 2,
         ('ń', ' ', 'u'): 2,
         ('ž', ' ', 'p'): 2,
         ('p', '\xa0', '6'): 2,
         ('s', 'c', 'c'): 2,
         ('2', '\xa0', 'e'): 2,
         ('c', '6', '–'): 2,
         ('r', 'v', 'u'): 2,
         ('3', '1', 'b'): 2,
         (' ', 'é', 'p'): 2,
         ('s', 'é', 'c'): 2,
         ('i', 'z', 'v'): 2,
         ('m', 'ö', '<e>'): 2,
         (' ', '£', ' '): 2,
         ('o', 'k', 'v'): 2,
         ('o', 'c', 'á'): 2,
         ('d', 't', 'í'): 2,
         ('g', 'e', '2'): 2,
         ('n', 't', '4'): 2,
         ('4', 'i', 'f'): 2,
         ('5', 'i', ' '): 2,
    

         ('e', 'm', 'ä'): 2,
         ('3', 'l', 'i'): 2,
         ('e', 'g', 'p'): 2,
         ('ñ', 'a', 'l'): 2,
         ('1', 'ι', 'ν'): 2,
         ('2', '4', 's'): 2,
         (' ', 'r', 'à'): 2,
         ('r', 'à', 'd'): 2,
         ('à', 'd', 'i'): 2,
         ('\xa0', '\xa0', 'o'): 2,
         ('r', ' ', 'ş'): 2,
         (' ', 'ş', 'a'): 2,
         ('ş', 'a', 'h'): 2,
         ('o', 'p', 'v'): 2,
         ('p', 'v', 'a'): 2,
         ('g', 'c', 'e'): 2,
         ('d', 'n', 'r'): 2,
         ('r', 'ö', 't'): 2,
         ('p', 's', 'n'): 2,
         ('v', 'ä', 'r'): 2,
         ('\xa0', 'c', 'm'): 2,
         ('2', '9', 'm'): 2,
         ('c', 'd', 't'): 2,
         ('c', '4', '4'): 2,
         (' ', 's', 'û'): 2,
         ('s', 'û', 'r'): 2,
         ('û', 'r', 'e'): 2,
         (' ', 'n', '5'): 2,
         ('n', '5', '5'): 2,
         ('i', 'f', 'α'): 2,
         ('f', 'α', ' '): 2,
         ('3', 's', '<e>'): 2,
         ('<s>', '<s>', '—'): 2,
         ('<s>', '—', ' '): 

         ('<s>', 'the', 'aim'): 1,
         ('aim', 'of', 'the'): 1,
         ('treaty', 'is', 'to'): 1,
         ('is', 'to', 'promote'): 1,
         ('to', 'promote', 'nuclear'): 1,
         ('promote', 'nuclear', 'energy'): 1,
         ('it', 'should', 'thus'): 1,
         ('should', 'thus', 'be'): 1,
         ('thus', 'be', 'revised'): 1,
         ('be', 'revised', 'in'): 1,
         ('revised', 'in', 'such'): 1,
         ('such', 'a', 'way'): 1,
         ('a', 'way', 'that'): 1,
         ('way', 'that', 'health'): 1,
         ('that', 'health', 'and'): 1,
         ('safety', 'are', 'the'): 1,
         ('are', 'the', 'prime'): 1,
         ('the', 'prime', 'objectives'): 1,
         ('prime', 'objectives', '<e>'): 1,
         ('objectives', '<e>', '<e>'): 1,
         ('the', 'resources', 'should'): 1,
         ('resources', 'should', 'be'): 1,
         ('used', 'for', 'work'): 1,
         ('for', 'work', 'on'): 1,
         ('work', 'on', 'the'): 1,
         ('the', 'environmental', 

         ('the', 'byproducts', 'of'): 1,
         ('byproducts', 'of', 'their'): 1,
         ('of', 'their', 'operation'): 1,
         ('their', 'operation', 'even'): 1,
         ('operation', 'even', 'under'): 1,
         ('even', 'under', 'accidental'): 1,
         ('under', 'accidental', 'conditions'): 1,
         ('accidental', 'conditions', '<e>'): 1,
         ('that', 'is', 'surely'): 1,
         ('is', 'surely', 'quite'): 1,
         ('surely', 'quite', 'wrong'): 1,
         ('quite', 'wrong', 'for'): 1,
         ('wrong', 'for', 'i'): 1,
         ('for', 'i', 'wonder'): 1,
         ('wonder', 'how', 'the'): 1,
         ('how', 'the', 'radioactivity'): 1,
         ('the', 'radioactivity', 'from'): 1,
         ('radioactivity', 'from', 'UNK'): 1,
         ('from', 'UNK', 'can'): 1,
         ('UNK', 'can', 'be'): 1,
         ('can', 'be', 'measured'): 1,
         ('be', 'measured', 'off'): 1,
         ('measured', 'off', 'the'): 1,
         ('off', 'the', 'coasts'): 1,
         ('

         ('would', 'have', 'written'): 1,
         ('have', 'written', 'a'): 1,
         ('written', 'a', 'similar'): 1,
         ('a', 'similar', 'report'): 1,
         ('similar', 'report', '<e>'): 1,
         ('me', 'say', 'the'): 1,
         ('say', 'the', 'following'): 1,
         ('the', 'following', 'to'): 1,
         ('following', 'to', 'mr'): 1,
         ('to', 'mr', 'weber'): 1,
         ('mr', 'weber', 'if'): 1,
         ('weber', 'if', 'it'): 1,
         ('if', 'it', 'really'): 1,
         ('it', 'really', 'were'): 1,
         ('really', 'were', 'true'): 1,
         ('were', 'true', 'that'): 1,
         ('true', 'that', 'nuclear'): 1,
         ('energy', 'does', 'not'): 1,
         ('does', 'not', 'save'): 1,
         ('not', 'save', 'on'): 1,
         ('save', 'on', 'co2'): 1,
         ('on', 'co2', 'then'): 1,
         ('co2', 'then', 'can'): 1,
         ('then', 'can', 'he'): 1,
         ('can', 'he', 'explain'): 1,
         ('he', 'explain', 'why'): 1,
         ('explai

         ('with', 'long', 'UNK'): 1,
         ('long', 'UNK', 'into'): 1,
         ('UNK', 'into', 'ones'): 1,
         ('into', 'ones', 'with'): 1,
         ('ones', 'with', 'short'): 1,
         ('with', 'short', 'UNK'): 1,
         ('short', 'UNK', '<e>'): 1,
         ('way', 'the', 'UNK'): 1,
         ('UNK', 'of', 'wastes'): 1,
         ('of', 'wastes', 'for'): 1,
         ('wastes', 'for', 'final'): 1,
         ('for', 'final', 'storage'): 1,
         ('final', 'storage', 'could'): 1,
         ('storage', 'could', 'be'): 1,
         ('could', 'be', 'limited'): 1,
         ('be', 'limited', 'and'): 1,
         ('limited', 'and', 'the'): 1,
         ('and', 'the', 'packaging'): 1,
         ('the', 'packaging', 'of'): 1,
         ('packaging', 'of', 'each'): 1,
         ('of', 'each', 'UNK'): 1,
         ('each', 'UNK', 'could'): 1,
         ('UNK', 'could', 'be'): 1,
         ('could', 'be', 'finally'): 1,
         ('be', 'finally', 'decided'): 1,
         ('finally', 'decided', 'o

         ('remaining', 'resources', 'of'): 1,
         ('resources', 'of', 'this'): 1,
         ('of', 'this', 'earth'): 1,
         ('this', 'earth', '<e>'): 1,
         ('earth', '<e>', '<e>'): 1,
         ('we', 'need', 'only'): 1,
         ('need', 'only', 'look'): 1,
         ('only', 'look', 'at'): 1,
         ('look', 'at', 'afghanistan'): 1,
         ('at', 'afghanistan', 'at'): 1,
         ('afghanistan', 'at', 'UNK'): 1,
         ('at', 'UNK', 'at'): 1,
         ('UNK', 'at', 'kurdistan'): 1,
         ('at', 'kurdistan', 'at'): 1,
         ('kurdistan', 'at', 'the'): 1,
         ('at', 'the', 'gulf'): 1,
         ('the', 'gulf', 'the'): 1,
         ('gulf', 'the', 'wars'): 1,
         ('the', 'wars', 'being'): 1,
         ('wars', 'being', 'waged'): 1,
         ('being', 'waged', 'there'): 1,
         ('waged', 'there', 'are'): 1,
         ('there', 'are', 'about'): 1,
         ('are', 'about', 'energy'): 1,
         ('about', 'energy', '<e>'): 1,
         ('if', 'we', 'serio

         ('be', 'industrially', 'reused'): 1,
         ('industrially', 'reused', 'and'): 1,
         ('reused', 'and', 'of'): 1,
         ('and', 'of', 'solid'): 1,
         ('of', 'solid', 'urban'): 1,
         ('the', 'house', 'will'): 1,
         ('house', 'will', 'accept'): 1,
         ('will', 'accept', 'this'): 1,
         ('accept', 'this', 'suggestion'): 1,
         ('this', 'suggestion', '<e>'): 1,
         ('suggestion', '<e>', '<e>'): 1,
         ('should', 'also', 'like'): 1,
         ('the', 'rapporteur', 'on'): 1,
         ('rapporteur', 'on', 'her'): 1,
         ('on', 'her', 'excellent'): 1,
         ('her', 'excellent', 'report'): 1,
         ('excellent', 'report', 'with'): 1,
         ('with', 'its', 'many'): 1,
         ('its', 'many', 'good'): 1,
         ('many', 'good', 'recommendations'): 1,
         ('good', 'recommendations', 'on'): 1,
         ('recommendations', 'on', 'promoting'): 1,
         ('on', 'promoting', 'the'): 1,
         ('promoting', 'the', 'us

         ('through', 'a', 'very'): 1,
         ('a', 'very', 'difficult'): 1,
         ('very', 'difficult', 'period'): 1,
         ('difficult', 'period', 'has'): 1,
         ('period', 'has', 'the'): 1,
         ('has', 'the', 'potential'): 1,
         ('the', 'potential', 'through'): 1,
         ('potential', 'through', 'biomass'): 1,
         ('through', 'biomass', 'to'): 1,
         ('biomass', 'to', 'contribute'): 1,
         ('to', 'contribute', 'positively'): 1,
         ('contribute', 'positively', 'to'): 1,
         ('positively', 'to', 'rebalancing'): 1,
         ('to', 'rebalancing', 'the'): 1,
         ('rebalancing', 'the', 'renewable'): 1,
         ('the', 'renewable', 'energy'): 1,
         ('renewable', 'energy', 'deficit'): 1,
         ('energy', 'deficit', 'within'): 1,
         ('deficit', 'within', 'the'): 1,
         ('<s>', 'despite', 'considerable'): 1,
         ('despite', 'considerable', 'advantages'): 1,
         ('considerable', 'advantages', 'in'): 1,
     

         ('best', 'to', 'keep'): 1,
         ('to', 'keep', 'you'): 1,
         ('keep', 'you', 'advised'): 1,
         ('you', 'advised', '<e>'): 1,
         ('advised', '<e>', '<e>'): 1,
         ('i', 'am', 'really'): 1,
         ('am', 'really', 'sorry'): 1,
         ('really', 'sorry', 'but'): 1,
         ('sorry', 'but', 'we'): 1,
         ('but', 'we', 'have'): 1,
         ('we', 'have', 'made'): 1,
         ('have', 'made', 'a'): 1,
         ('made', 'a', 'serious'): 1,
         ('a', 'serious', 'mistake'): 1,
         ('serious', 'mistake', '<e>'): 1,
         ('<s>', 'that', 'really'): 1,
         ('that', 'really', 'is'): 1,
         ('really', 'is', 'true'): 1,
         ('is', 'true', '<e>'): 1,
         ('<s>', 'in', 'one'): 1,
         ('in', 'one', 'go'): 1,
         ('one', 'go', 'we'): 1,
         ('go', 'we', 'have'): 1,
         ('we', 'have', 'named'): 1,
         ('have', 'named', 'four'): 1,
         ('named', 'four', 'regions'): 1,
         ('four', 'regions', 'i

         ('to', 'the', 'armaments'): 1,
         ('the', 'armaments', 'industry'): 1,
         ('armaments', 'industry', 'to'): 1,
         ('industry', 'to', 'favour'): 1,
         ('to', 'favour', 'a'): 1,
         ('favour', 'a', 'transatlantic'): 1,
         ('a', 'transatlantic', 'union'): 1,
         ('transatlantic', 'union', 'rather'): 1,
         ('union', 'rather', 'than'): 1,
         ('rather', 'than', 'a'): 1,
         ('than', 'a', 'european'): 1,
         ('a', 'european', 'merger'): 1,
         ('european', 'merger', 'in'): 1,
         ('merger', 'in', 'this'): 1,
         ('this', 'case', 'an'): 1,
         ('case', 'an', 'UNK'): 1,
         ('an', 'UNK', 'merger'): 1,
         ('UNK', 'merger', 'in'): 1,
         ('merger', 'in', 'the'): 1,
         ('field', 'of', 'aeronautics'): 1,
         ('of', 'aeronautics', 'and'): 1,
         ('aeronautics', 'and', 'electronics'): 1,
         ('and', 'electronics', '<e>'): 1,
         ('electronics', '<e>', '<e>'): 1,
        

         ('running', 'nuclear', 'installations'): 1,
         ('nuclear', 'installations', 'towards'): 1,
         ('installations', 'towards', 'european'): 1,
         ('towards', 'european', 'citizens'): 1,
         ('european', 'citizens', 'when'): 1,
         ('citizens', 'when', 'it'): 1,
         ('comes', 'to', 'nuclear'): 1,
         ('to', 'nuclear', 'safety'): 1,
         ('nuclear', 'safety', '<e>'): 1,
         ('<s>', '<s>', 'other'): 1,
         ('<s>', 'other', 'reports'): 1,
         ('other', 'reports', 'will'): 1,
         ('reports', 'will', 'make'): 1,
         ('possible', 'to', 'look'): 1,
         ('look', 'at', 'the'): 1,
         ('at', 'the', 'rights'): 1,
         ('wrongs', 'of', 'various'): 1,
         ('of', 'various', 'different'): 1,
         ('various', 'different', 'types'): 1,
         ('different', 'types', 'of'): 1,
         ('sources', 'in', 'europe'): 1,
         ('here', 'we', 'have'): 1,
         ('have', 'to', 'accept'): 1,
         ('to', 'acc

         (' ', 'г', 'а'): 1,
         ('г', 'а', 'з'): 1,
         ('а', 'з', 'е'): 1,
         ('з', 'е', 'т'): 1,
         ('е', 'т', 'а'): 1,
         ('т', 'а', ' '): 1,
         ('а', ' ', 'i'): 1,
         ('u', 's', '8'): 1,
         ('s', '8', '5'): 1,
         ('u', 'r', '3'): 1,
         ('m', 'p', 'â'): 1,
         ('b', 'â', ' '): 1,
         ('s', 'à', ' '): 1,
         ('<s>', '3', 'g'): 1,
         ('<s>', 'q', 'm'): 1,
         (' ', 'α', 'c'): 1,
         ('α', 'c', 'p'): 1,
         ('c', 'p', 'ε'): 1,
         ('p', 'ε', 'u'): 1,
         (' ', '5', '¾'): 1,
         ('5', '¾', ' '): 1,
         (' ', '3', '¾'): 1,
         ('3', '¾', ' '): 1,
         ('9', '7', 'd'): 1,
         ('7', 'd', 'r'): 1,
         ('x', 's', '4'): 1,
         ('s', '4', 'a'): 1,
         ('<s>', 'g', '8'): 1,
         ('i', 'd', '7'): 1,
         ('d', '7', '0'): 1,
         ('é', 'o', 'l'): 1,
         ('í', 'm', 'o'): 1,
         ('4', 'm', 'm'): 1,
         ('6', 'm', 'm'): 1,
        

         ('l', '3', 'o'): 1,
         ('3', 'o', 'u'): 1,
         ('l', '4', 'w'): 1,
         ('6', '5', 'g'): 1,
         ('5', 'g', 'u'): 1,
         ('l', 'e', '6'): 1,
         ('e', '6', 'i'): 1,
         ('d', 's', '7'): 1,
         ('s', '7', 'r'): 1,
         ('d', '2', 'e'): 1,
         ('2', 'e', 'v'): 1,
         ('ö', 'r', 'z'): 1,
         (' ', 'p', 'ì'): 1,
         ('p', 'ì', 'a'): 1,
         ('ì', 'a', 'o'): 1,
         ('μ', 'ο', ' '): 1,
         ('r', '°', 's'): 1,
         ('°', 's', 'e'): 1,
         ('n', '2', 'n'): 1,
         ('2', 'n', '3'): 1,
         ('4', '’', 'h'): 1,
         ('’', 'h', 'e'): 1,
         ('w', 't', 'c'): 1,
         ('9', '1', 'å'): 1,
         ('1', 'å', 'c'): 1,
         ('7', '8', 'n'): 1,
         ('p', 'd', 'y'): 1,
         ('3', '0', 'g'): 1,
         ('o', 'w', '”'): 1,
         ('w', '”', ' '): 1,
         ('p', 'a', '”'): 1,
         ('a', '”', '<e>'): 1,
         ('u', 'q', 't'): 1,
         (' ', 'b', 'ú'): 1,
         ('b

         ('m', 'y', 'f'): 1,
         ('l', 'g', 'ä'): 1,
         ('g', 'ä', 'u'): 1,
         ('ä', 'u', ' '): 1,
         ('<s>', '0', '8'): 1,
         ('s', 'è', 'l'): 1,
         ('4', '0', '£'): 1,
         ('0', '£', '5'): 1,
         ('h', 'g', 'w'): 1,
         ('s', 'v', '<e>'): 1,
         (' ', 'm', 'q'): 1,
         ('m', 'q', 'm'): 1,
         ('q', 'm', ' '): 1,
         ('d', 'j', 'z'): 1,
         ('j', 'z', 'a'): 1,
         ('é', 'f', 'u'): 1,
         ('g', 'i', 'é'): 1,
         ('ó', 'l', 'o'): 1,
         ('u', 't', 'k'): 1,
         ('b', 'á', 't'): 1,
         (' ', '‘', '\xa0'): 1,
         ('‘', '\xa0', '…'): 1,
         (' ', 'κ', 'e'): 1,
         ('κ', 'e', 'm'): 1,
         ('ä', 'r', 'j'): 1,
         ('a', 'r', 'ö'): 1,
         ('r', 'ö', ' '): 1,
         ('r', 't', 'ő'): 1,
         ('t', 'ő', ' '): 1,
         ('a', 't', '2'): 1,
         ('u', '\xa0', ' '): 1,
         (' ', 'i', '9'): 1,
         ('i', '9', '4'): 1,
         ('n', 'z', 'w'): 1,
 