In [2]:
import re
import os
import string
from nltk import sent_tokenize, word_tokenize, download
from IPython.display import clear_output
from collections import Counter
import gc
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
"""
# Save objects
with open('object_name', 'wb') as f:
    pickle.dump(object_variable, f)

# Load objects
with open('object_name', 'rb') as f:
    object_variable = pickle.load(f)
"""
from pprint import pprint
print("Imports Completed")

Imports Completed


In [36]:
download('punkt')

[nltk_data] Downloading package punkt to /home/datum/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def clean_text(text):
    """ 
    1. Remove html like text from europarl e.g. <Chapter 1>
    2. Remove line breaks
    3. Reduce all whitespaces to 1
    4. turn everything to lower case
    """
    clean = re.compile('<.*?>')
    
    out = text.replace('\n', ' ') # Remove line breaks
    out = re.sub(clean, ' ', out) # Remove tagged text e.g. <Chapter 1>
    out = re.sub(' +', ' ', out) # Reduce whitespace down to one
    
    out = out.lower() # Turn everything to lower case
    
    return out

In [39]:
abs_path = os.getcwd()
path = abs_path + '/en/'
text = ''
corpus_clean = []
sentences = []
corpus_original = ''
total = len(os.listdir(path)) # Total files
count = 0

for file in os.listdir(path):
    f = open(path + file, 'r', encoding="utf-8")
    file_text = f.read()
    corpus_original = corpus_original + file_text
    f.close()

    regex = re.compile('[%s]' % re.escape(string.punctuation))
    file_sentences = [regex.sub('', sent).strip() for sent in sent_tokenize(clean_text(file_text))]

    corpus_clean = corpus_clean + file_sentences
    count += 1
    
    clear_output(wait = True)
    print('File ' + file + ' finished. Completed ' + str(round(count*100/total,2)) + '%')

File ep-99-04-14.txt finished. Completed 100.0%


In [None]:
"""
demo corpus file:
------------------------------------------------------------------
original text: <class 'str'>
<CHAPTER ID="005">
Voting time
<SPEAKER ID="104" NAME="President">
The next item is the vote.
<P>
(For the results and other details on the vote: see Minutes)
------------------------------------------------------------------
Its clean_text(text): <class 'str'>
 voting time the next item is the vote. (for the results and other details on the vote: see minutes) 
------------------------------------------------------------------
Its sent_tokenize(clean_text(text)): <class 'list'>
[' voting time the next item is the vote.', '(for the results and other details on the vote: see minutes)']
------------------------------------------------------------------
Its sentences = [sent.strip() for sent in sent_tokenize(clean_text(text))]: <class 'list'>
['voting time the next item is the vote.', '(for the results and other details on the vote: see minutes)']
------------------------------------------------------------------
"""

In [40]:
# Save the basic objects:
with open('corpus_original', 'wb') as f:
    pickle.dump(corpus_original, f)
with open('corpus_clean', 'wb') as f:
    pickle.dump(corpus_clean, f)

In [41]:
regex = re.compile('[%s]' % re.escape(string.punctuation))
corpus_clean_string = regex.sub('', clean_text(corpus_original))

print('-------------------------')
print('corpus_clean_string created.')
print('-------------------------')

with open('corpus_clean_string', 'wb') as f:
    pickle.dump(corpus_clean_string, f)

del corpus_original

-------------------------
corpus_clean_string created.
-------------------------


In [42]:
gc.collect()

317

In [44]:
AllWords = word_tokenize(corpus_clean_string)
print('-------------------------')
print('Words Tokenized.')
print('-------------------------')

vocabulary = set(AllWords)
print('Vocabulary Created.')
print('-------------------------')

WordCounts = Counter(AllWords)
print('WordCounts Calculated.')
print('-------------------------')

with open('AllWords', 'wb') as f:
    pickle.dump(AllWords, f) 

with open('vocabulary', 'wb') as f:
    pickle.dump(vocabulary, f) 
    
with open('WordCounts', 'wb') as f:
    pickle.dump(WordCounts, f) 

del corpus_clean_string, AllWords

-------------------------
Words Tokenized.
-------------------------
Vocabulary Created.
-------------------------
WordCounts Calculated.
-------------------------


In [46]:
gc.collect()

0

#### Early starting point: Instead of all the above..

In [130]:
corpus_clean = None
WordCounts = None
vocabulary = None

# Load objects

with open('corpus_clean', 'rb') as f:
    corpus_clean = pickle.load(f)

with open('vocabulary', 'rb') as f:
    vocabulary = pickle.load(f)
    
with open('WordCounts', 'rb') as f:
    WordCounts = pickle.load(f)
    
with open('AllWords', 'rb') as f:
    AllWords = pickle.load(f)

In [131]:
# Ignore low frequency words
valid_vocabulary = [k for k,v in WordCounts.items() if v > 10]
invalid_vocabulary = [k for k,v in WordCounts.items() if v <= 10]
print("valid voc", len(valid_vocabulary))
print("invalid voc", len(invalid_vocabulary))

with open('valid_vocabulary', 'wb') as f:
    pickle.dump(valid_vocabulary, f) 
    
with open('invalid_vocabulary', 'wb') as f:
    pickle.dump(invalid_vocabulary, f) 

valid voc 32678
invalid voc 109653


In [132]:
# Replace OOV words in sentences

def split_sentence(sentence):
    PATTERN = '\w+|\(|\)|\.|\,'
    tokenizer = RegexpTokenizer(pattern=PATTERN) 
    return tokenizer.tokenize(sentence)
dummy_count = 0
total = len(corpus_clean)
for i in range(0,len(corpus_clean)):
    sentence = ''.join(corpus_clean[i]) # make it string
    splitted_sent = split_sentence(sentence)    
    new_sent = []
    for word in splitted_sent:
        new_word = 'UNK' if word not in valid_vocabulary else word
        new_sent.append(new_word)
        corpus_clean[i] = new_sent
    clear_output(wait = True)
    print('Sentences processed ' + str(i+1) + ' out of ' + str(total))
    dummy_count = dummy_count + 1
    if 1000 < dummy_count:
        break

# Have it here, in order to not to forget to save after the big computation burden.
with open('corpus_clean_no_OOV', 'wb') as f:
    pickle.dump(corpus_clean, f)

Sentences processed 1001 out of 2217535


In [None]:
# Deprecated (slower)
#
# Replace OOV words in sentences
# total = len(corpus_clean)
# for i in range(0,len(corpus_clean)):
#    for word in valid_vocabulary:
#        corpus_clean[i].replace(word, 'UNK')
#   clear_output(wait = True)
#   print('Sentences processed ' + str(i+1) + ' out of ' + str(total) )

In [137]:
# Creating the n-grams is actually not needed!! Only counting them is!!
#
# tokens = AllWords
# bigrams = [ gram for gram in ngrams(tokens, 2) ]
# trigrams = [ gram for gram in ngrams(tokens, 3) ]
# #pprint(bigrams)
# with open('bigrams', 'wb') as f:
#     pickle.dump(bigrams, f)
# with open('trigrams', 'wb') as f:
#    pickle.dump(bigrams, f)

In [153]:
# Creating the n-grams is actually not needed!! Only counting them is!!
#
# import sys, numpy as np
# print("AllWords:", np.round(sys.getsizeof(AllWords)/1024/1024/1024,2)," GBs")
# print("Bigrams:", np.round(sys.getsizeof(bigrams)/1024/1024/1024,2)," GBs")
# print("Trigrams:", np.round(sys.getsizeof(trigrams)/1024/1024/1024,2)," GBs")

#### Regular Starting Point

In [3]:
corpus_clean = None
WordCounts = None
vocabulary = None
corpus_clean_no_OOV = None

# Load objects

#with open('corpus_clean', 'rb') as f:
#    corpus_clean = pickle.load(f)

#with open('vocabulary', 'rb') as f:
#    vocabulary = pickle.load(f)
    
#with open('WordCounts', 'rb') as f:
#    WordCounts = pickle.load(f)
    
with open('AllWords', 'rb') as f:
    AllWords = pickle.load(f)
    
with open('corpus_clean_no_OOV', 'rb') as f:
    corpus_clean_no_OOV = pickle.load(f)

In [4]:
print(corpus_clean_no_OOV.__class__)
print(corpus_clean_no_OOV[0])
print(corpus_clean_no_OOV[0].__class__)
print(corpus_clean_no_OOV[0][0].__class__)

<class 'list'>
['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote']
<class 'list'>
<class 'str'>


In [9]:
print(AllWords.__class__)
print(AllWords[0])
print(AllWords[0].__class__)
print(AllWords[0:10])

<class 'list'>
voting
<class 'str'>
['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote', 'for', 'the']


In [5]:
gc.collect()

0

#### Counting n-grams

In [12]:
# Single sentence, for testing corpus_clean_no_OOV:
unigram_counter = Counter()
unigram_counter.update([gram for gram in ngrams(corpus_clean_no_OOV[0], 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(corpus_clean_no_OOV[0])
pprint(unigram_counter)

['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote']
Counter({('the',): 2,
         ('voting',): 1,
         ('time',): 1,
         ('next',): 1,
         ('item',): 1,
         ('is',): 1,
         ('vote',): 1})


In [10]:
# Single sentence for testing AllWords:
unigram_counter = Counter()
unigram_counter.update([gram for gram in ngrams(AllWords[0:10], 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(AllWords[0:10])
pprint(unigram_counter)

['voting', 'time', 'the', 'next', 'item', 'is', 'the', 'vote', 'for', 'the']
Counter({('the',): 3,
         ('voting',): 1,
         ('time',): 1,
         ('next',): 1,
         ('item',): 1,
         ('is',): 1,
         ('vote',): 1,
         ('for',): 1})


In [16]:
unigram_counter = Counter()
sample = corpus_clean_no_OOV[0:1000]

for sent in sample:   
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(unigram_counter)

Counter({('the',): 1574,
         ('of',): 839,
         ('to',): 749,
         ('and',): 619,
         ('in',): 527,
         ('a',): 420,
         ('is',): 407,
         ('that',): 395,
         ('we',): 313,
         ('i',): 303,
         ('for',): 279,
         ('it',): 247,
         ('energy',): 241,
         ('this',): 229,
         ('on',): 224,
         ('nuclear',): 216,
         ('be',): 209,
         ('not',): 190,
         ('which',): 176,
         ('european',): 162,
         ('are',): 158,
         ('UNK',): 157,
         ('as',): 153,
         ('have',): 138,
         ('with',): 131,
         ('by',): 117,
         ('should',): 115,
         ('mr',): 108,
         ('report',): 106,
         ('would',): 104,
         ('at',): 101,
         ('power',): 100,
         ('also',): 95,
         ('all',): 86,
         ('but',): 86,
         ('will',): 81,
         ('has',): 80,
         ('from',): 77,
         ('our',): 76,
         ('an',): 74,
         ('can',): 74,
         (

         ('per',): 5,
         ('cycle',): 5,
         ('sweden',): 5,
         ('majority',): 5,
         ('noone',): 5,
         ('find',): 5,
         ('expect',): 5,
         ('vital',): 5,
         ('giving',): 5,
         ('term',): 5,
         ('true',): 5,
         ('element',): 5,
         ('neither',): 5,
         ('huge',): 5,
         ('materials',): 5,
         ('throughout',): 5,
         ('management',): 5,
         ('plant',): 5,
         ('project',): 5,
         ('consideration',): 5,
         ('share',): 5,
         ('concerning',): 5,
         ('despite',): 5,
         ('normally',): 5,
         ('communitys',): 5,
         ('direction',): 5,
         ('mainly',): 5,
         ('implementation',): 5,
         ('appropriate',): 5,
         ('noon',): 5,
         ('financial',): 5,
         ('oil',): 5,
         ('tax',): 5,
         ('million',): 5,
         ('kind',): 5,
         ('ecu',): 5,
         ('detailed',): 5,
         ('value',): 5,
         ('target',): 5,

         ('thermie',): 1,
         ('regrets',): 1,
         ('phare',): 1,
         ('tacis',): 1,
         ('extravagant',): 1,
         ('machine',): 1,
         ('patch',): 1,
         ('scrapheap',): 1,
         ('binding',): 1,
         ('2000',): 1,
         ('drastically',): 1,
         ('values',): 1,
         ('pillar',): 1,
         ('reallocate',): 1,
         ('putting',): 1,
         ('2050',): 1,
         ('earliest',): 1,
         ('obstacle',): 1,
         ('aids',): 1,
         ('lyrical',): 1,
         ('talked',): 1,
         ('raise',): 1,
         ('sights',): 1,
         ('considerably',): 1,
         ('3040',): 1,
         ('focus',): 1,
         ('heavy',): 1,
         ('penalties',): 1,
         ('pollution',): 1,
         ('simple',): 1,
         ('mechanism',): 1,
         ('squandering',): 1,
         ('kinds',): 1,
         ('praise',): 1,
         ('examination',): 1,
         ('obtained',): 1,
         ('cleaner',): 1,
         ('economy',): 1,
         

In [17]:
bigram_counter = Counter()
sample = corpus_clean_no_OOV[0:1000]

for sent in sample:
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(bigram_counter)

Counter({('of', 'the'): 208,
         ('in', 'the'): 142,
         ('<s>', 'i'): 109,
         ('the', 'european'): 96,
         ('to', 'the'): 94,
         ('it', 'is'): 90,
         ('<s>', 'the'): 87,
         ('nuclear', 'power'): 82,
         ('for', 'the'): 76,
         ('on', 'the'): 70,
         ('<s>', 'it'): 68,
         ('<s>', 'we'): 66,
         ('that', 'the'): 64,
         ('and', 'the'): 63,
         ('mr', 'president'): 60,
         ('<s>', 'mr'): 58,
         ('renewable', 'energy'): 57,
         ('energy', 'sources'): 51,
         ('to', 'be'): 50,
         ('of', 'nuclear'): 45,
         ('is', 'a'): 44,
         ('nuclear', 'energy'): 44,
         ('<s>', 'in'): 42,
         ('european', 'union'): 42,
         ('by', 'the'): 41,
         ('that', 'is'): 40,
         ('we', 'are'): 39,
         ('member', 'states'): 39,
         ('i', 'would'): 38,
         ('the', 'commission'): 38,
         ('with', 'the'): 37,
         ('<s>', 'that'): 37,
         ('and', 'i'): 

         ('is', 'already'): 2,
         ('community', 'plan'): 2,
         ('plan', 'of'): 2,
         ('action', '<e>'): 2,
         ('available', 'which'): 2,
         ('that', 'context'): 2,
         ('european', 'level'): 2,
         ('and', 'this'): 2,
         ('towards', 'a'): 2,
         ('substantial', 'reduction'): 2,
         ('in', 'addition'): 2,
         ('themselves', '<e>'): 2,
         ('the', 'application'): 2,
         ('application', 'of'): 2,
         ('raw', 'materials'): 2,
         ('disarmament', 'and'): 2,
         ('peace', '<e>'): 2,
         ('the', 'specifications'): 2,
         ('efforts', 'towards'): 2,
         ('is', 'important'): 2,
         ('particularly', 'to'): 2,
         ('to', 'deal'): 2,
         ('deal', 'with'): 2,
         ('we', 'wish'): 2,
         ('the', 'debates'): 2,
         ('institutional', 'bodies'): 2,
         ('and', 'let'): 2,
         ('commission', 'for'): 2,
         ('community', 'action'): 2,
         ('that', 'has'): 2,


         ('age', 'but'): 1,
         ('but', 'their'): 1,
         ('their', 'ageing'): 1,
         ('ageing', 'must'): 1,
         ('not', 'weaken'): 1,
         ('weaken', 'their'): 1,
         ('a', 'safety'): 1,
         ('safety', 'point'): 1,
         ('eu', 'must'): 1,
         ('must', 'use'): 1,
         ('use', 'its'): 1,
         ('its', 'knowhow'): 1,
         ('knowhow', 'and'): 1,
         ('and', 'expertise'): 1,
         ('expertise', 'to'): 1,
         ('to', 'help'): 1,
         ('help', 'the'): 1,
         ('cis', 'in'): 1,
         ('in', 'solving'): 1,
         ('solving', 'their'): 1,
         ('safety', 'problems'): 1,
         ('matter', 'was'): 1,
         ('was', 'referred'): 1,
         ('referred', 'to'): 1,
         ('energy', 'charter'): 1,
         ('charter', '<e>'): 1,
         ('resources', 'set'): 1,
         ('set', 'aside'): 1,
         ('aside', 'for'): 1,
         ('purpose', 'must'): 1,
         ('however', 'be'): 1,
         ('taken', 'away'): 1

         ('provided', 'on'): 1,
         ('the', 'advantages'): 1,
         ('advantages', 'and'): 1,
         ('and', 'disadvantages'): 1,
         ('disadvantages', 'when'): 1,
         ('when', 'exporting'): 1,
         ('exporting', 'nuclear'): 1,
         ('is', 'far'): 1,
         ('more', 'important'): 1,
         ('examine', 'alternative'): 1,
         ('points', 'i'): 1,
         ('i', 'quite'): 1,
         ('quite', 'definitely'): 1,
         ('definitely', 'reject'): 1,
         ('reject', 'the'): 1,
         ('will', 'support'): 1,
         ('support', 'all'): 1,
         ('amendments', 'relating'): 1,
         ('to', 'abandoning'): 1,
         ('abandoning', 'nuclear'): 1,
         ('president', 'once'): 1,
         ('again', 'here'): 1,
         ('the', 'difficulties'): 1,
         ('difficulties', 'is'): 1,
         ('debate', 'calm'): 1,
         ('calm', 'and'): 1,
         ('and', 'collected'): 1,
         ('collected', 'and'): 1,
         ('and', 'avoid'): 1,
       

         ('i', 'very'): 1,
         ('very', 'much'): 1,
         ('much', 'welcome'): 1,
         ('has', 'recognised'): 1,
         ('recognised', 'this'): 1,
         ('need', 'and'): 1,
         ('and', 'incorporated'): 1,
         ('incorporated', 'parliaments'): 1,
         ('parliaments', 'proposals'): 1,
         ('proposals', 'in'): 1,
         ('entirely', 'agree'): 1,
         ('commissions', 'view'): 1,
         ('the', 'priorities'): 1,
         ('priorities', 'currently'): 1,
         ('currently', 'set'): 1,
         ('set', 'for'): 1,
         ('are', 'incompatible'): 1,
         ('incompatible', 'with'): 1,
         ('longterm', 'requirements'): 1,
         ('requirements', 'of'): 1,
         ('paper', 'the'): 1,
         ('is', 'presenting'): 1,
         ('presenting', 'a'): 1,
         ('a', 'really'): 1,
         ('really', 'god'): 1,
         ('god', 'analysis'): 1,
         ('situation', 'although'): 1,
         ('say', 'i'): 1,
         ('have', 'wished'): 1,
   

         ('co2', 'issue'): 1,
         ('issue', 'seriously'): 1,
         ('seriously', 'and'): 1,
         ('taking', 'an'): 1,
         ('active', 'responsibility'): 1,
         ('responsibility', 'for'): 1,
         ('for', 'avoiding'): 1,
         ('avoiding', 'the'): 1,
         ('threat', 'of'): 1,
         ('of', 'climate'): 1,
         ('climate', 'change'): 1,
         ('the', 'maximum'): 1,
         ('maximum', 'use'): 1,
         ('be', 'ensured'): 1,
         ('ensured', 'if'): 1,
         ('if', 'member'): 1,
         ('states', 'continue'): 1,
         ('be', 'allowed'): 1,
         ('allowed', 'to'): 1,
         ('pursue', 'an'): 1,
         ('an', 'independent'): 1,
         ('independent', 'energy'): 1,
         ('policy', 'based'): 1,
         ('on', 'their'): 1,
         ('own', 'circumstances'): 1,
         ('circumstances', 'and'): 1,
         ('and', 'conditions'): 1,
         ('the', 'subsidiarity'): 1,
         ('subsidiarity', 'principle'): 1,
         ('princ

         ('at', 'issue'): 1,
         ('issue', 'and'): 1,
         ('happen', 'in'): 1,
         ('understand', 'the'): 1,
         ('point', 'you'): 1,
         ('you', 'make'): 1,
         ('make', '<e>'): 1,
         ('was', 'trying'): 1,
         ('to', 'expedite'): 1,
         ('expedite', 'the'): 1,
         ('the', 'business'): 1,
         ('business', 'of'): 1,
         ('am', 'absolutely'): 1,
         ('absolutely', 'certain'): 1,
         ('certain', 'that'): 1,
         ('i', 'operated'): 1,
         ('operated', 'within'): 1,
         ('rules', '<e>'): 1,
         ('you', 'read'): 1,
         ('read', 'rule'): 1,
         ('1155', 'you'): 1,
         ('will', 'find'): 1,
         ('is', 'entitled'): 1,
         ('to', 'block'): 1,
         ('fact', 'there'): 1,
         ('no', 'need'): 1,
         ('give', 'notice'): 1,
         ('notice', 'of'): 1,
         ('that', 'but'): 1,
         ('services', 'through'): 1,
         ('through', 'this'): 1,
         ('week', 'have')

         ('situation', 'determined'): 1,
         ('determined', 'by'): 1,
         ('of', 'foreign'): 1,
         ('foreign', 'troops'): 1,
         ('troops', 'not'): 1,
         ('in', 'bosnia'): 1,
         ('bosnia', 'but'): 1,
         ('all', 'over'): 1,
         ('balkan', 'area'): 1,
         ('area', 'either'): 1,
         ('either', 'in'): 1,
         ('of', 'multinational'): 1,
         ('multinational', 'forces'): 1,
         ('forces', 'as'): 1,
         ('of', 'albania'): 1,
         ('albania', 'or'): 1,
         ('military', 'bases'): 1,
         ('bases', 'and'): 1,
         ('and', 'facilities'): 1,
         ('facilities', '<e>'): 1,
         ('agreements', 'between'): 1,
         ('area', 'instead'): 1,
         ('of', 'supporting'): 1,
         ('the', 'creation'): 1,
         ('new', 'zones'): 1,
         ('zones', 'of'): 1,
         ('of', 'influence'): 1,
         ('influence', 'and'): 1,
         ('of', 'interests'): 1,
         ('interests', 'alien'): 1,
     

In [18]:
trigram_counter = Counter()
sample = corpus_clean_no_OOV[0:1000]
for sent in sample:
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
pprint(trigram_counter)

Counter({('<s>', '<s>', 'i'): 109,
         ('<s>', '<s>', 'the'): 87,
         ('<s>', '<s>', 'it'): 68,
         ('<s>', '<s>', 'we'): 66,
         ('<s>', '<s>', 'mr'): 58,
         ('<s>', 'mr', 'president'): 52,
         ('<s>', '<s>', 'in'): 42,
         ('the', 'european', 'union'): 39,
         ('renewable', 'energy', 'sources'): 38,
         ('<s>', '<s>', 'that'): 37,
         ('<s>', 'it', 'is'): 35,
         ('<s>', '<s>', 'this'): 35,
         ('energy', '<e>', '<e>'): 32,
         ('nuclear', 'power', 'stations'): 26,
         ('of', 'the', 'european'): 25,
         ('report', '<e>', '<e>'): 24,
         ('<s>', '<s>', 'if'): 23,
         ('<s>', '<s>', 'and'): 21,
         ('<s>', 'that', 'is'): 20,
         ('<s>', '<s>', 'but'): 20,
         ('the', 'european', 'parliament'): 19,
         ('<s>', '<s>', 'there'): 19,
         ('mr', 'president', 'i'): 18,
         ('policy', '<e>', '<e>'): 17,
         ('of', 'nuclear', 'power'): 17,
         ('of', 'renewable', 'energ

         ('<s>', '<s>', 'similarly'): 2,
         ('for', 'a', 'common'): 2,
         ('at', 'this', 'point'): 2,
         ('practice', '<e>', '<e>'): 2,
         ('small', 'and', 'mediumsized'): 2,
         ('the', 'house', 'is'): 2,
         ('people', 'in', 'the'): 2,
         ('<s>', 'i', 'do'): 2,
         ('here', 'in', 'parliament'): 2,
         ('responsible', 'for', 'the'): 2,
         ('protests', 'about', 'the'): 2,
         ('about', 'the', 'use'): 2,
         ('<s>', '<s>', 'nobody'): 2,
         ('to', 'the', 'question'): 2,
         ('wish', 'to', 'see'): 2,
         ('on', 'this', '<e>'): 2,
         ('which', 'has', 'been'): 2,
         ('within', 'the', 'european'): 2,
         ('debate', '<e>', '<e>'): 2,
         ('a', 'certain', 'amount'): 2,
         ('certain', 'amount', 'of'): 2,
         ('i', 'am', 'afraid'): 2,
         ('am', 'afraid', 'that'): 2,
         ('afraid', 'that', 'the'): 2,
         ('that', 'the', 'same'): 2,
         ('sources', 'as', 'with'): 

         ('deserve', 'safety', 'levels'): 1,
         ('safety', 'levels', 'which'): 1,
         ('levels', 'which', 'are'): 1,
         ('which', 'are', 'equivalent'): 1,
         ('are', 'equivalent', 'to'): 1,
         ('equivalent', 'to', 'ours'): 1,
         ('to', 'ours', '<e>'): 1,
         ('ours', '<e>', '<e>'): 1,
         ('if', 'we', 'just'): 1,
         ('we', 'just', 'get'): 1,
         ('just', 'get', 'up'): 1,
         ('get', 'up', 'and'): 1,
         ('up', 'and', 'go'): 1,
         ('and', 'go', 'we'): 1,
         ('go', 'we', 'would'): 1,
         ('would', 'also', 'be'): 1,
         ('also', 'be', 'replaced'): 1,
         ('be', 'replaced', 'in'): 1,
         ('replaced', 'in', 'that'): 1,
         ('in', 'that', 'domain'): 1,
         ('that', 'domain', 'and'): 1,
         ('domain', 'and', 'it'): 1,
         ('is', 'our', 'very'): 1,
         ('our', 'very', 'own'): 1,
         ('very', 'own', 'safety'): 1,
         ('own', 'safety', 'which'): 1,
         ('which

         ('development', 'and', 'puts'): 1,
         ('and', 'puts', 'forward'): 1,
         ('puts', 'forward', 'some'): 1,
         ('forward', 'some', 'suggestions'): 1,
         ('some', 'suggestions', 'in'): 1,
         ('suggestions', 'in', 'reply'): 1,
         ('in', 'reply', '<e>'): 1,
         ('reply', '<e>', '<e>'): 1,
         ('<s>', 'the', 'committee'): 1,
         ('committee', 'on', 'energys'): 1,
         ('on', 'energys', 'report'): 1,
         ('energys', 'report', 'presented'): 1,
         ('report', 'presented', 'by'): 1,
         ('presented', 'by', 'mr'): 1,
         ('soulier', 'on', 'the'): 1,
         ('commission', 'communication', 'is'): 1,
         ('communication', 'is', 'a'): 1,
         ('is', 'a', 'balanced'): 1,
         ('a', 'balanced', 'one'): 1,
         ('balanced', 'one', '<e>'): 1,
         ('my', 'opinion', 'it'): 1,
         ('opinion', 'it', 'succeeds'): 1,
         ('it', 'succeeds', 'in'): 1,
         ('succeeds', 'in', 'avoiding'): 1,
   

         ('we', 'need', 'work'): 1,
         ('need', 'work', '<e>'): 1,
         ('<s>', 'i', 'refer'): 1,
         ('refer', 'to', 'high'): 1,
         ('to', 'high', 'technology'): 1,
         ('high', 'technology', '<e>'): 1,
         ('is', 'the', 'nuclear'): 1,
         ('nuclear', 'industry', 'ladies'): 1,
         ('industry', 'ladies', 'and'): 1,
         ('and', 'gentlemen', 'which'): 1,
         ('gentlemen', 'which', 'has'): 1,
         ('which', 'has', 'enabled'): 1,
         ('has', 'enabled', 'our'): 1,
         ('enabled', 'our', 'industries'): 1,
         ('our', 'industries', 'greatly'): 1,
         ('industries', 'greatly', 'to'): 1,
         ('greatly', 'to', 'improve'): 1,
         ('to', 'improve', 'technology'): 1,
         ('improve', 'technology', '<e>'): 1,
         ('<s>', '<s>', 'china'): 1,
         ('<s>', 'china', 'now'): 1,
         ('china', 'now', 'has'): 1,
         ('now', 'has', '2'): 1,
         ('has', '2', '100'): 1,
         ('2', '100', 'nuclea

         ('of', 'view', 'which'): 1,
         ('view', 'which', 'go'): 1,
         ('which', 'go', 'beyond'): 1,
         ('go', 'beyond', 'shortterm'): 1,
         ('and', 'gentlemen', 'what'): 1,
         ('gentlemen', 'what', 'i'): 1,
         ('what', 'i', 'have'): 1,
         ('to', 'say', 'represents'): 1,
         ('say', 'represents', 'my'): 1,
         ('represents', 'my', 'personal'): 1,
         ('my', 'personal', 'view'): 1,
         ('personal', 'view', 'and'): 1,
         ('view', 'and', 'not'): 1,
         ('and', 'not', 'the'): 1,
         ('not', 'the', 'view'): 1,
         ('the', 'view', 'of'): 1,
         ('view', 'of', 'my'): 1,
         ('my', 'group', '<e>'): 1,
         ('soulier', 'for', 'what'): 1,
         ('for', 'what', 'he'): 1,
         ('what', 'he', 'has'): 1,
         ('he', 'has', 'written'): 1,
         ('has', 'written', 'in'): 1,
         ('written', 'in', 'his'): 1,
         ('in', 'his', 'report'): 1,
         ('<s>', '<s>', 'his'): 1,
         (

         ('in', 'the', 'production'): 1,
         ('production', 'of', 'mox'): 1,
         ('of', 'mox', 'fuels'): 1,
         ('mox', 'fuels', 'gives'): 1,
         ('fuels', 'gives', 'it'): 1,
         ('gives', 'it', 'a'): 1,
         ('it', 'a', 'technological'): 1,
         ('a', 'technological', 'lead'): 1,
         ('technological', 'lead', 'UNK'): 1,
         ('lead', 'UNK', 'anywhere'): 1,
         ('UNK', 'anywhere', 'else'): 1,
         ('anywhere', 'else', 'in'): 1,
         ('else', 'in', 'the'): 1,
         ('application', 'of', 'this'): 1,
         ('of', 'this', 'technical'): 1,
         ('this', 'technical', 'knowledge'): 1,
         ('technical', 'knowledge', 'to'): 1,
         ('knowledge', 'to', 'produce'): 1,
         ('to', 'produce', 'fuels'): 1,
         ('produce', 'fuels', 'from'): 1,
         ('fuels', 'from', 'UNK'): 1,
         ('from', 'UNK', 'raw'): 1,
         ('UNK', 'raw', 'materials'): 1,
         ('raw', 'materials', 'of'): 1,
         ('materials', 

         ('that', 'involves', '<e>'): 1,
         ('involves', '<e>', '<e>'): 1,
         ('<s>', 'the', 'cultivation'): 1,
         ('cultivation', 'of', 'energy'): 1,
         ('of', 'energy', 'crops'): 1,
         ('energy', 'crops', 'could'): 1,
         ('crops', 'could', 'create'): 1,
         ('could', 'create', 'an'): 1,
         ('create', 'an', 'additional'): 1,
         ('an', 'additional', 'turnover'): 1,
         ('additional', 'turnover', 'of'): 1,
         ('turnover', 'of', 'ecu'): 1,
         ('of', 'ecu', 'UNK'): 1,
         ('ecu', 'UNK', 'billion'): 1,
         ('UNK', 'billion', 'for'): 1,
         ('billion', 'for', 'the'): 1,
         ('it', 'would', 'also'): 1,
         ('would', 'also', 'offer'): 1,
         ('also', 'offer', 'an'): 1,
         ('offer', 'an', 'incentive'): 1,
         ('an', 'incentive', 'for'): 1,
         ('incentive', 'for', 'cultivating'): 1,
         ('for', 'cultivating', 'setaside'): 1,
         ('cultivating', 'setaside', 'areas'): 1,


         ('because', 'it', 'UNK'): 1,
         ('UNK', 'be', 'effectively'): 1,
         ('be', 'effectively', 'recycled'): 1,
         ('effectively', 'recycled', '<e>'): 1,
         ('recycled', '<e>', '<e>'): 1,
         ('i', 'would', 'recall'): 1,
         ('would', 'recall', 'that'): 1,
         ('case', 'of', 'UNK'): 1,
         ('UNK', 'wastes', 'carbon'): 1,
         ('wastes', 'carbon', 'dioxide'): 1,
         ('carbon', 'dioxide', 'is'): 1,
         ('dioxide', 'is', 'produced'): 1,
         ('is', 'produced', 'even'): 1,
         ('produced', 'even', 'if'): 1,
         ('even', 'if', 'they'): 1,
         ('they', 'are', 'UNK'): 1,
         ('are', 'UNK', 'but'): 1,
         ('UNK', 'but', 'they'): 1,
         ('not', 'produce', 'any'): 1,
         ('produce', 'any', 'useful'): 1,
         ('any', 'useful', 'energy'): 1,
         ('useful', 'energy', 'in'): 1,
         ('this', 'way', '<e>'): 1,
         ('<s>', 'it', 'seems'): 1,
         ('seems', 'to', 'me'): 1,
         

         ('scientific', 'institution', 'on'): 1,
         ('institution', 'on', 'this'): 1,
         ('i', 'think', 'it'): 1,
         ('think', 'it', 'is'): 1,
         ('is', 'simply', 'not'): 1,
         ('simply', 'not', 'enough'): 1,
         ('not', 'enough', 'to'): 1,
         ('enough', 'to', 'promote'): 1,
         ('to', 'promote', '100'): 1,
         ('promote', '100', '000'): 1,
         ('100', '000', 'roofs'): 1,
         ('000', 'roofs', 'programmes'): 1,
         ('roofs', 'programmes', 'here'): 1,
         ('programmes', 'here', '<e>'): 1,
         ('i', 'would', 'wish'): 1,
         ('would', 'wish', 'to'): 1,
         ('to', 'see', 'a'): 1,
         ('see', 'a', 'more'): 1,
         ('a', 'more', 'detailed'): 1,
         ('more', 'detailed', 'analysis'): 1,
         ('detailed', 'analysis', 'of'): 1,
         ('analysis', 'of', 'whether'): 1,
         ('of', 'whether', 'it'): 1,
         ('whether', 'it', 'makes'): 1,
         ('it', 'makes', 'any'): 1,
         ('ma

         ('block', 'to', 'have'): 1,
         ('have', 'a', 'look'): 1,
         ('look', 'at', 'them'): 1,
         ('at', 'them', 'they'): 1,
         ('them', 'they', 'were'): 1,
         ('they', 'were', 'entitled'): 1,
         ('were', 'entitled', 'to'): 1,
         ('entitled', 'to', 'do'): 1,
         ('<s>', 'but', 'the'): 1,
         ('but', 'the', 'block'): 1,
         ('the', 'block', 'vote'): 1,
         ('block', 'vote', 'is'): 1,
         ('vote', 'is', 'quite'): 1,
         ('is', 'quite', 'clearly'): 1,
         ('quite', 'clearly', 'provided'): 1,
         ('clearly', 'provided', 'for'): 1,
         ('provided', 'for', 'within'): 1,
         ('for', 'within', 'our'): 1,
         ('within', 'our', 'rules'): 1,
         ('our', 'rules', 'and'): 1,
         ('rules', 'and', 'we'): 1,
         ('and', 'we', 'follow'): 1,
         ('we', 'follow', 'the'): 1,
         ('follow', 'the', 'rules'): 1,
         ('president', 'there', 'are'): 1,
         ('are', 'still', 'differ

         ('seafood', 'sector', 'and'): 1,
         ('sector', 'and', 'which'): 1,
         ('and', 'which', 'at'): 1,
         ('which', 'at', 'the'): 1,
         ('the', 'moment', 'are'): 1,
         ('moment', 'are', 'not'): 1,
         ('not', 'particularly', 'overfished'): 1,
         ('particularly', 'overfished', '<e>'): 1,
         ('overfished', '<e>', '<e>'): 1,
         ('since', 'the', 'crampton'): 1,
         ('crampton', 'report', 'has'): 1,
         ('report', 'has', 'set'): 1,
         ('has', 'set', 'most'): 1,
         ('set', 'most', 'of'): 1,
         ('most', 'of', 'these'): 1,
         ('of', 'these', 'objectives'): 1,
         ('these', 'objectives', 'our'): 1,
         ('objectives', 'our', 'group'): 1,
         ('our', 'group', 'voted'): 1,
         ('group', 'voted', 'in'): 1,
         ('voted', 'in', 'favour'): 1,
         ('of', 'it', 'despite'): 1,
         ('it', 'despite', 'some'): 1,
         ('despite', 'some', 'of'): 1,
         ('of', 'the', 'lacks'): 

         ('regional', 'cooperation', '<e>'): 1,
         ('<s>', 'we', 'disagree'): 1,
         ('we', 'disagree', 'radically'): 1,
         ('disagree', 'radically', 'with'): 1,
         ('radically', 'with', 'the'): 1,
         ('with', 'the', 'proposal'): 1,
         ('proposal', 'for', 'discriminatory'): 1,
         ('for', 'discriminatory', 'treatment'): 1,
         ('discriminatory', 'treatment', 'which'): 1,
         ('treatment', 'which', 'aims'): 1,
         ('which', 'aims', 'at'): 1,
         ('aims', 'at', 'nothing'): 1,
         ('at', 'nothing', 'else'): 1,
         ('nothing', 'else', 'than'): 1,
         ('else', 'than', 'the'): 1,
         ('than', 'the', 'perpetuation'): 1,
         ('the', 'perpetuation', 'of'): 1,
         ('perpetuation', 'of', 'a'): 1,
         ('of', 'a', 'policy'): 1,
         ('policy', 'of', 'division'): 1,
         ('of', 'division', 'and'): 1,
         ('division', 'and', 'discrimination'): 1,
         ('and', 'discrimination', 'the'): 1,
  

         ('and', 'control', 'every'): 1,
         ('control', 'every', 'aspect'): 1,
         ('every', 'aspect', 'of'): 1,
         ('power', 'industry', '<e>'): 1})
