In [9]:
import sys
import numpy as np
import operator
import pandas
import pymorphy2
import math
import re

In [3]:
input_path = 'data/news_src_no_tokenized.txt'

In [146]:
# определение части речи
# return (normal form, POS, proper name?)
class POSTagger:
    __stemmer = pymorphy2.MorphAnalyzer()
    __cache = {}
    def parse(self, token):
        form = ('', '')
        if token not in self.__cache:
            res = self.__stemmer.parse(token)[0]
            dd = str(res.tag).split(',')
            name = dd[min(len(dd) - 1, 3)].split()[0]
            surn = dd[min(len(dd) - 1, 4)].split()[0]
            form = (res.normal_form, res.tag.POS, name, surn)
            self.__cache[token] = form
        else :
            form = self.__cache[token]
        return form
pos_tagger = POSTagger()


wordFrequency = dict()
bigrams = dict()
tgrams = dict()
allbigrams = dict()

# получаем биграммы
def getData(input_file):
    wordFrequency.clear()
    bigrams.clear()
    tgrams.clear()
    allbigrams.clear()
    file = open(input_file)    
    for pos, line in enumerate(file):
        if pos == 50:
            break
        url, data = line.strip().split('\t')    
        data = re.split('[\s.,!?()"]+', data.decode('utf-8')) #data.split("\\p{P}?[ \\t\\n\\r]+")
#         print(data[:10])              
        for i in range(len(data) - 1):              
            w1 = data[i]
            w2 = data[i + 1]            
            if len(w1) < 2 or len(w2) < 2:
                continue
            morph1 = pos_tagger.parse(w1)
            morph2 = pos_tagger.parse(w2)
            wordFrequency[morph1[0]] = wordFrequency.get(morph1[0], 0) + 1            
            bigram = w1 + ' ' + w2
            allbigrams[bigram] = allbigrams.get(bigram, 0) + 1
            if i + 2 < len(data):
                w3 = data[i + 2]
                tgram = bigram + ' ' + w3
                tgrams[tgram] = tgrams.get(tgram, 0) + 1
            if morph1[2] == 'Name' or morph2[2] == 'Name' or morph1[3] == 'Surn' or morph2[3] == 'Surn':                
                bigrams[bigram] = bigrams.get(bigram, 0) + 1
#                 print bigram
        
    print 'bigrams =', len(bigrams)    


In [157]:
def word_count():
    cnt_words = 0
    for w in wordFrequency.keys():
        cnt_words += wordFrequency[w]
    return cnt_words

cnt_words = word_count()

#2.576
def students_test(bigram):
    w1, w2 = bigram.split()[:2]
    m1, m2 = pos_tagger.parse(w1)[0], pos_tagger.parse(w2)[0]
    p1 = float(wordFrequency.get(m1, 1)) / cnt_words
    p2 = float(wordFrequency.get(m2, 1)) / cnt_words    
    m = p1 * p2
    s = m * (1 - m)    
    x = float(bigrams[bigram]) / cnt_words    
    res = (x - m) / (s / cnt_words) ** 0.5
    return res

def count_bigrams():
    cnt_words = word_count()
#     print(cnt_words)
    studs = []
    for bigram in bigrams.keys():
        studs.append((bigram, students_test(bigram)))
    studs.sort(key=lambda x: x[1])
    return studs


In [158]:
def get_pandas(names, indexs, values):
    return pandas.DataFrame(data=values, columns=names, index=indexs)
# get_pandas(['a', 'b'], ['c', 'd', 'x'], [[11, 12], [22, 23], [33, 34]])

In [159]:
def get_3grams(s):
    res = []
    words = s.split()
    for i in range(len(words) - 2):
        b = ' '.join(words[i:i+2])
        allbigrams[b] = allbigrams.get(b, 0) + 1
        t = ' '.join(words[i:i+3])
        tgrams[t] = tgrams.get(t, 0) + 1
        res.append(t)
    return res

def value_3gram(s):
    w1, w2, w3 = s.split()
    cnt3 = tgrams[s]
    cnt2 = allbigrams[w1 + ' ' + w2] 
    return float(cnt3 + 1) / (cnt2 + len(allbigrams))

In [160]:
def test_1(input_file):
    getData(input_file)
    res = count_bigrams()
    for x in res[:10]:
        print x[0], x[1]
    
test_1(input_path)

def test_2(input_file):
    getData('data/news_src_tokenized.txt')
    s = 'Президент США Барак Обама решил сняться в телепередаче с Беар Гриллсом'
    tgrs = get_3grams(s)
    idx, p = [], []
    for t in tgrs:
        idx.append(tgrams[t])  
        p.append(value_3gram(t))
    return get_pandas(tgrs, ['idx', 'p'], [idx, p])

test_2(input_path)

bigrams = 598
на своей должности -0.606202370901
на Ерофее 2.68150285723
Дмитриев был 3.11409773864
на Тарановского 3.19212933612
он Александра 3.30526251542
развития Сергей 3.5165321044
в том числе 3.52464635481
Дмитриевым он 4.07560219698
Дмитриев он 4.07560219698
что любой 4.81021927868


Unnamed: 0,Президент США Барак,США Барак Обама,Барак Обама решил,Обама решил сняться,решил сняться в,сняться в телепередаче,в телепередаче с,телепередаче с Беар,с Беар Гриллсом
idx,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
p,0.000123,0.000123,0.000123,0.000123,0.000123,0.000123,0.000123,0.000123,0.000123
