## Extract key phrases to use as example queries on the website

In [6]:
# Directory that holds the processed corpus files.
# NOTE: hard-coded absolute path; adjust it for your own environment.
PROC_DATA_PREFIX = '/Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data'

In [7]:
# File name of the corpus; it is appended to PROC_DATA_PREFIX when the file is opened.
corpus_file = 'en_ru_heroku_1000'


In [8]:
import string
import nltk
from nltk.corpus import stopwords

def preprocess(text):
    """Return a lowercased, cleaned-up version of ``text``.

    Steps, in order:
      1. lowercase the text and split it with ``nltk.wordpunct_tokenize``;
      2. drop tokens made up only of digits and English stopwords;
      3. delete punctuation and a few extra symbol/control characters;
      4. collapse runs of whitespace into single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    # Build the stopword set once per call rather than calling
    # stopwords.words() for every token; a set also gives O(1) lookups.
    stop_words = set(stopwords.words('english'))

    # word-tokenize & remove numbers if the entire token consists of numbers
    tokens = [
        t for t in nltk.wordpunct_tokenize(text.lower())
        if not t.isdigit() and t not in stop_words
    ]

    # remove punctuation (characters are deleted, not replaced by spaces)
    punct_remove = set(string.punctuation) | {'−', '\t', '\n', '\r', '\x0b', '\x0c', '◦', '°'}
    delete_table = str.maketrans('', '', ''.join(punct_remove))
    text = ' '.join(tokens).translate(delete_table)

    # strip extra whitespaces (tokens that were pure punctuation leave gaps)
    return ' '.join(text.split())

In [9]:
# Load the source side (the text before the tab) of every corpus line and
# clean it with preprocess().
with open(PROC_DATA_PREFIX + '/' + corpus_file, 'r', encoding='utf8') as inF:
    # Each line must split into exactly two tab-separated fields; the second
    # field is ignored here.
    corpus_src = [
        preprocess(src).strip()
        for src, _ in (line.split('\t') for line in inF)
    ]
print(len(corpus_src))

994


In [13]:
def ngrammer(tokens, n=2):
    """Return each run of ``n`` consecutive tokens as a space-joined string.

    The n-grams are listed in left-to-right order. For ``n >= 1`` the result
    is empty when ``tokens`` has fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

In [17]:
from collections import Counter

# Count every bigram across all preprocessed sentences and show the top 30.
word_counter = Counter(
    bigram
    for sentence in corpus_src
    for bigram in ngrammer(sentence.split(), n=2)
)

print(word_counter.most_common(30))

[('united nations', 39), ('secretary general', 39), ('security council', 18), ('paragraph resolution', 14), ('decides extend', 12), ('democratic republic', 11), ('republic congo', 11), ('member states', 11), ('international tribunal', 11), ('special tribunal', 11), ('humanitarian assistance', 10), ('requests secretary', 10), ('imposed paragraph', 9), ('measures imposed', 9), ('human rights', 9), ('general report', 9), ('decides remain', 9), ('seized matter', 9), ('specially designed', 9), ('extend mandate', 8), ('calls upon', 8), ('remain seized', 8), ('united states', 7), ('arms embargo', 7), ('armed groups', 7), ('troop contributing', 6), ('contributing countries', 6), ('displaced persons', 6), ('tribunal rwanda', 6), ('unmovic iaea', 6)]


In [18]:
# Count every trigram across all preprocessed sentences and show the top 30.
word_counter = Counter(
    trigram
    for sentence in corpus_src
    for trigram in ngrammer(sentence.split(), n=3)
)

print(word_counter.most_common(30))

[('democratic republic congo', 11), ('requests secretary general', 10), ('secretary general report', 9), ('decides extend mandate', 8), ('decides remain seized', 8), ('remain seized matter', 8), ('measures imposed paragraph', 7), ('troop contributing countries', 6), ('international tribunal rwanda', 6), ('charter united nations', 6), ('united states america', 5), ('imposed paragraph resolution', 5), ('responsibility identified committee', 5), ('report secretary general', 5), ('mandate united nations', 4), ('syrian arab republic', 4), ('disarmament demobilization reintegration', 4), ('president security council', 4), ('unmanned aerial vehicles', 4), ('condemns violations provisions', 4), ('violations provisions moscow', 4), ('provisions moscow agreement', 4), ('moscow agreement may', 4), ('agreement may ceasefire', 4), ('may ceasefire separation', 4), ('ceasefire separation forces', 4), ('separation forces annex', 4), ('lasting solution question', 4), ('solution question western', 4), (

In [19]:
# Count every 4-gram across all preprocessed sentences and show the top 30.
word_counter = Counter(
    fourgram
    for sentence in corpus_src
    for fourgram in ngrammer(sentence.split(), n=4)
)

print(word_counter.most_common(30))

[('decides remain seized matter', 8), ('condemns violations provisions moscow', 4), ('violations provisions moscow agreement', 4), ('provisions moscow agreement may', 4), ('moscow agreement may ceasefire', 4), ('agreement may ceasefire separation', 4), ('may ceasefire separation forces', 4), ('ceasefire separation forces annex', 4), ('lasting solution question western', 4), ('solution question western sahara', 4), ('technology development design production', 4), ('mindua democratic republic congo', 3), ('gon kwon republic korea', 3), ('measures imposed paragraph resolution', 3), ('requests secretary general report', 3), ('requests secretary general submit', 3), ('secretary general report council', 3), ('non civil certified aircraft', 3), ('unmanned aerial vehicles parts', 3), ('aerial vehicles parts components', 3), ('violations international humanitarian law', 3), ('development design production components', 3), ('design production components equipment', 3), ('antoine kesia mbe mindua

In [20]:
from nltk.collocations import *

In [21]:
# Association-measure objects from nltk.collocations; bigram_measures supplies
# the likelihood_ratio scorer passed to the collocation finder.
bigram_measures = nltk.collocations.BigramAssocMeasures()
# NOTE(review): trigram_measures is not used in the cells visible here.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [27]:
# Flatten the corpus into one long token list (sentence boundaries are not
# kept), then rank bigram collocations by likelihood ratio.
coprus_list = [token for sentence in corpus_src for token in sentence.split()]

finder2 = BigramCollocationFinder.from_words(coprus_list)
finder2.nbest(bigram_measures.likelihood_ratio, 50)

[('united', 'nations'),
 ('secretary', 'general'),
 ('seized', 'matter'),
 ('security', 'council'),
 ('decides', 'extend'),
 ('human', 'rights'),
 ('specially', 'designed'),
 ('republic', 'congo'),
 ('member', 'states'),
 ('remain', 'seized'),
 ('paragraph', 'resolution'),
 ('special', 'tribunal'),
 ('calls', 'upon'),
 ('democratic', 'republic'),
 ('measures', 'imposed'),
 ('owned', 'controlled'),
 ('arms', 'embargo'),
 ('humanitarian', 'assistance'),
 ('western', 'sahara'),
 ('imposed', 'paragraph'),
 ('ballistic', 'missile'),
 ('decides', 'remain'),
 ('troop', 'contributing'),
 ('armed', 'groups'),
 ('requests', 'secretary'),
 ('displaced', 'persons'),
 ('unmovic', 'iaea'),
 ('international', 'tribunal'),
 ('contributing', 'countries'),
 ('extend', 'mandate'),
 ('anti', 'tank'),
 ('côte', 'ivoire'),
 ('privileges', 'immunities'),
 ('russian', 'federation'),
 ('sierra', 'leone'),
 ('disarmament', 'demobilization'),
 ('lasting', 'solution'),
 ('unmanned', 'aerial'),
 ('separation', 'fo

In [26]:
# Peek at the first three entries of coprus_list.
# NOTE(review): the saved output below shows nested token lists, while the
# In [27] cell builds a flat list with extend(); this output appears to come
# from an earlier run (execution count 26) — re-run the cell to refresh it.
coprus_list[:3]

[['antoine', 'kesia', 'mbe', 'mindua', 'democratic', 'republic', 'congo'],
 ['howard', 'morrison', 'united', 'kingdom'],
 ['jean', 'claude', 'antonetti', 'france']]