In [1]:
import re
from math import log

In [2]:
def bigram(x):
    for i in range(len(x)-1):
        yield x[i] + '_' + x[i+1]

## Read

In [3]:
# считываем датасет
en_articles_part = sc.textFile("/data/wiki/en_articles_part")

In [4]:
# делаем из en_articles_part датасет "ключ-значение"
splitted_articles = en_articles_part.map(lambda x: (x.split('\t')[0], x.split('\t')[1]))

In [5]:
# считываем стоп-слова
stop_words = sc.textFile("/data/wiki/stop_words_en-xpo6.txt")

In [6]:
stop_words = stop_words.collect()

In [7]:
stop_words[:5]

[u'a', u'about', u'above', u'across', u'after']

## Задача #1: народные биграммы.

In [18]:
# оставляем в текстах статей только буквы
cleaned_articles = splitted_articles.map(lambda x: re.sub('[^a-z0-9 ]', '', x[1].lower()))

In [20]:
# делаем списки из слов (делим по пробелам)
articles_to_list = cleaned_articles.map(lambda x: [word.strip() for word in x.split(' ')])\
                                   .map(lambda x: [word for word in x if word])

In [21]:
# делаем биграммы
bigrams = articles_to_list.map(lambda x: list(bigram(x)))

In [22]:
bigrams.take(5)

[[u'anarchism_anarchism',
  u'anarchism_is',
  u'is_often',
  u'often_defined',
  u'defined_as',
  u'as_a',
  u'a_political',
  u'political_philosophy',
  u'philosophy_which',
  u'which_holds',
  u'holds_the',
  u'the_state',
  u'state_to',
  u'to_be',
  u'be_undesirable',
  u'undesirable_unnecessary',
  u'unnecessary_or',
  u'or_harmful',
  u'harmful_the',
  u'the_following',
  u'following_sources',
  u'sources_cite',
  u'cite_anarchism',
  u'anarchism_as',
  u'as_a',
  u'a_political',
  u'political_philosophy',
  u'philosophy_slevin',
  u'slevin_carl',
  u'carl_anarchism',
  u'anarchism_the',
  u'the_concise',
  u'concise_oxford',
  u'oxford_dictionary',
  u'dictionary_of',
  u'of_politics',
  u'politics_ed',
  u'ed_iain',
  u'iain_mclean',
  u'mclean_and',
  u'and_alistair',
  u'alistair_mcmillan',
  u'mcmillan_oxford',
  u'oxford_university',
  u'university_press',
  u'press_2003',
  u'2003_however',
  u'however_others',
  u'others_argue',
  u'argue_that',
  u'that_while',
  u'whil

In [23]:
# берем биграммы, начинающиеся со слова narodnaya
filtered_bigrams = bigrams.flatMap(lambda x: [word for word in x if word.startswith('narodnaya')])

In [24]:
filtered_bigrams.take(5)

[u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya']

In [25]:
# подсчитываем число вхождений биграмм 
grouped_bigrams = filtered_bigrams.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).collect()

In [26]:
grouped_bigrams

[(u'narodnaya_gazeta', 1), (u'narodnaya_volya', 9)]

In [27]:
# вывод в лексикографическом порядке
for key, value in sorted(grouped_bigrams, key=lambda x: x[0]):
    print key, '\t', value 

narodnaya_gazeta 	1
narodnaya_volya 	9


# Задача #2: коллокации

### 2.1 Подготовка RDD

In [28]:
# бродкастим словарь со стоп-словами
br_stop_words = sc.broadcast(stop_words)

In [34]:
# удаляем стоп-слова из списков слов, полученных из каждой статьи
filtered_lists = articles_to_list.map(lambda x: [word for word in x if word not in br_stop_words.value])

In [35]:
# получаем биграммы из отфильтрованного списка слов
upd_bigrams = filtered_lists.flatMap(lambda x: list(bigram(x)))

In [36]:
# подсчитываем число вхождений биграмм 
grouped_bigrams = upd_bigrams.map(lambda x: (x, 1))\
                             .reduceByKey(lambda x, y: x + y)

In [37]:
# берем только те биграммы, которые встречались более 500 раз 
frequent_bigrams = grouped_bigrams.filter(lambda x: x[1] >= 500)

In [38]:
frequent_bigrams.take(5)

[(u'soviet_union', 844),
 (u'catholic_church', 575),
 (u'american_baseball', 603),
 (u'united_nations', 695),
 (u'american_actor', 1092)]

### 2.2. NPMI

Общее кол-во слов в тексте

In [39]:
flat_words = filtered_lists.flatMap(lambda x: x)

In [40]:
flat_words.take(5)

[u'anarchism', u'anarchism', u'defined', u'political', u'philosophy']

In [41]:
total_number_of_words = flat_words.count()

In [42]:
total_number_of_words

6848907

In [43]:
total_number_of_words = sc.broadcast(total_number_of_words)

In [44]:
word_count = flat_words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

In [45]:
word_count.take(5)

[(u'biennials', 10),
 (u'tripolitan', 2),
 (u'vexillifera', 1),
 (u'refreshable', 9),
 (u'capitaltheir', 1)]

In [46]:
# вероятность увидеть слово “a” в датасете
p_a = word_count.map(lambda x: (x[0], float(x[1])/total_number_of_words.value)).cache()

In [47]:
p_a.take(5)

[(u'biennials', 1.4600869890626344e-06),
 (u'tripolitan', 2.9201739781252686e-07),
 (u'vexillifera', 1.4600869890626343e-07),
 (u'refreshable', 1.314078290156371e-06),
 (u'nunnery', 7.300434945313172e-07)]

Общее кол-во пар слов в тексте

In [48]:
total_number_of_pairs = upd_bigrams.count()

In [49]:
total_number_of_pairs

6844807

In [50]:
total_number_of_pairs = sc.broadcast(total_number_of_pairs)

In [51]:
# вероятность увидеть пару слов “a” и “b”, идущих подряд
p_ab = frequent_bigrams.map(lambda x: (x[0], float(x[1])/total_number_of_pairs.value)).cache()

In [52]:
p_ab.take(5)

[(u'soviet_union', 0.00012330515674145379),
 (u'catholic_church', 8.40052904340473e-05),
 (u'american_baseball', 8.80959828377922e-05),
 (u'united_nations', 0.00010153682930723978),
 (u'american_actor', 0.00015953700374605158)]

In [53]:
p_ab.count()

38

Расчет по формуле:
$$ PMI(a,b) = ln( \dfrac{P(ab)}{P(a) * P(b)}) $$

$$ NPMI(a,b) = \dfrac{PMI(a,b)}{-ln(P(ab))} $$

In [54]:
part_1 = p_ab.map(lambda x: (x[0].split('_')[0], x))\
             .join(p_a)

In [55]:
part_1.take(5)

[(u'soviet',
  ((u'soviet_union', 0.00012330515674145379), 0.0003442885120209692)),
 (u'civil', ((u'civil_war', 0.00014492738801839117), 0.000366335825555815)),
 (u'references',
  ((u'references_reading', 7.567780946928087e-05), 0.0005938173784517733)),
 (u'references',
  ((u'references_external', 0.0001823280042812018), 0.0005938173784517733)),
 (u'high', ((u'high_school', 9.11640021406009e-05), 0.0008509386972257034))]

In [56]:
part_2 = part_1.map(lambda x: (x[1][0][0].split('_')[1], x[1]))\
             .join(p_a)

In [57]:
part_2.take(5)

[(u'union',
  (((u'soviet_union', 0.00012330515674145379), 0.0003442885120209692),
   0.0005089863243872343)),
 (u'zealand',
  (((u'new_zealand', 0.00011921446433770887), 0.0027582503310382227),
   0.0001242534027692302)),
 (u'africa',
  (((u'south_africa', 8.502796353498353e-05), 0.0008728400020616428),
   0.00029128735431799556)),
 (u'force',
  (((u'air_force', 0.00016581913850894554), 0.0006390800751127151),
   0.0005593593255098952)),
 (u'states',
  (((u'united_states', 0.0009579525032626925), 0.0014228547708415372),
   0.0015018454769498257))]

In [58]:
# x[1][0][0][0] - bigramm
# x[1][0][0][1] - P(ab)
# x[1][0][1] - P(a)
# x[1][1] - P(b)

part_3 = part_2.map(lambda x: (x[1][0][0][0], log(x[1][0][0][1]/(x[1][0][1]*x[1][1]))/(-log(x[1][0][0][1]))))\
               .collect()

In [62]:
part_3[:5]

[(u'soviet_union', 0.7284059680131149),
 (u'new_zealand', 0.6477064337046002),
 (u'south_africa', 0.6201557097970846),
 (u'university_press', 0.688645571160978),
 (u'air_force', 0.7053256101735037)]

In [60]:
TOP = 39

for key, value in sorted(part_3, key=lambda x: x[1], reverse=True)[:TOP]:
    print key, '\t', round(value, 3)

los_angeles 	0.972
external_links 	0.949
prime_minister 	0.884
united_states 	0.878
san_francisco 	0.852
new_york 	0.787
supreme_court 	0.775
19th_century 	0.757
20th_century 	0.751
references_external 	0.732
soviet_union 	0.728
air_force 	0.705
baseball_player 	0.692
university_press 	0.689
united_kingdom 	0.684
roman_catholic 	0.683
references_reading 	0.669
notes_references 	0.661
award_best 	0.661
north_america 	0.653
new_zealand 	0.648
civil_war 	0.64
catholic_church 	0.625
world_war 	0.621
south_africa 	0.62
war_ii 	0.617
took_place 	0.613
roman_empire 	0.61
united_nations 	0.593
american_singersongwriter 	0.565
high_school 	0.564
american_actor 	0.559
american_actress 	0.54
american_baseball 	0.513
york_city 	0.491
american_football 	0.481
years_later 	0.414
north_american 	0.379
