# NLP HSE School

In [1]:
from pymystem3 import Mystem
import pandas as pd
import re
# Single shared Mystem analyser; the cells below call m.lemmatize(...) on it.
m = Mystem()

# Widen the console rendering of frames and keep wide frames from wrapping.
pd.set_option(
    'display.width', 1000,
    'display.expand_frame_repr', False,
)

In [2]:
# Load the RuSentiLex lexicon: the first 20 lines of the file are skipped, columns are
# named explicitly and the third column ('initial') becomes the index.
sentilex = pd.read_csv(
    'RuSentiLex2017_revised_2utf.txt',
    skiprows=20,
    skipinitialspace=True,
    names=['word', 'part', 'initial', 'sentiment', 'source', 'amb1', 'amb2'],
    index_col=2,
)

# Replace the textual labels by numeric scores; a label absent from the mapping becomes NaN.
sentilex['sentiment'] = sentilex['sentiment'].map(
    {'positive': 1, 'negative': -1, 'neutral': 0, 'positive/negative': 0}
)

In [3]:
# Lexicon as {lemma: {column: value}}.
# DataFrame.to_dict(orient='index') raises ValueError when the index is not unique, so the
# mapping is built from row records instead: a lemma that occurs several times keeps its
# last row, which is what older pandas versions produced silently.
vocab = dict(zip(sentilex.index, sentilex.to_dict(orient='records')))

In [4]:

def read_file(n):
    """Load the text lines, entity annotations and syntax parse of article ``n``.

    Files read:
      * ``Texts/art{n}.txt``     - one row per line, stored in column ``lines``;
      * ``Texts/art{n}.ann``     - tab-separated columns ``ind``, ``descr``, ``ne``;
      * ``Texts/art{n}syn.json`` - must provide the columns ``syntax`` and ``text``.

    Returns
    -------
    tuple ``(lines_df, entities)``
        ``lines_df`` is the text frame merged with the syntax frame on the shared
        ``index`` column (the ``text`` column is dropped); its ``syntax`` cells are
        lists of token rows built by ``get_struct_data``.
        ``entities`` maps each entity string ``ne`` to a dict of its annotation
        columns plus ``initial`` (the lower-cased, lemmatised entity).
    """
    # NOTE(review): recent pandas releases appear to reject '\n' as a separator; the call
    # is left unchanged because the row alignment with the syntax file depends on it.
    file = pd.read_csv('Texts/art{}.txt'.format(n), sep='\n', names=['lines'])
    file.lines = file.lines.str.replace("{Author, Unknown} ", "")

    ent_df = pd.read_csv('Texts/art{}.ann'.format(n), sep='\t', names=['ind', 'descr', 'ne'], header=None)
    ent_df['initial'] = ent_df['ne'].apply(lambda x: ("".join(m.lemmatize(x.lower()))).replace('\n', ""))
    # Build {ne: row} by hand: an entity annotated several times would make
    # to_dict(orient='index') raise on the duplicated index. The last row wins.
    entities = dict(zip(ent_df['ne'], ent_df.to_dict(orient='records')))

    syn_df = pd.read_json('Texts/art{}syn.json'.format(n))
    syn_df = syn_df.reset_index()

    # Positions of the space-separated fields kept from every parsed token line.
    kept_fields = (0, 1, 4, 6, 8, 10, 12)

    def get_struct_data(x):
        """Flatten the parsed sentences of one row into a list of token rows.

        Each token row holds the kept fields followed by the lemma of the third
        kept field (the token), as returned first by ``m.lemmatize``.
        """
        pars = []
        for text in x:
            for sent in text:
                fields = sent.split(' ')
                # Positions beyond the end of a short line are skipped silently.
                to_app = [fields[i] for i in kept_fields if i < len(fields)]
                to_app.append(m.lemmatize(to_app[2])[0])
                pars.append(to_app)
        return pars

    syn_df['syntax'] = syn_df['syntax'].apply(get_struct_data)

    # Shift the syntax rows by one so they line up with the 'index' column of the text frame.
    syn_df['index'] = syn_df['index'] + 1
    file = file.reset_index().merge(syn_df)
    # axis is passed by keyword: the positional form drop('text', 1) is not accepted by pandas 2.
    return (file.drop('text', axis=1), entities)


In [5]:
import pandas as pd
from sentiframes_proc import find_polar
# Sentiment-frame table loaded from disk (not referenced again in the cells shown here).
table = pd.read_csv('sentiframes_df.csv')

# Three hand-written eight-field token rows used to try find_polar out.
lst = [
    ['31', '42', 'бомбит', 'VERB', 'Aspect=Imperf|VerbForm=Inf|fPOS=VERB++', '2', 'nsubj', 'бомбить'],
    ['78', '83', 'Сирии', 'NOUN', 'Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing|fPOS=NOUN++', '10', 'dobj', 'сирия'],
    ['107', '115', 'Румийлан', 'NOUN', 'Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing|fPOS=NOUN++', '15', 'appos', 'румийлан'],
]

# The three rows are passed as the three positional arguments.
find_polar(*lst)


[('румийлан', 'сирия', -1), ('сирия', 'румийлан', -1)]

In [6]:
# Load article 1: the per-line frame and the entity dictionary returned by read_file.
text_lines, ent_dict = read_file(1)

In [7]:
# Extracting data from the syntax parse
def get_mutual_node(syntax, ent1, ent1pos, ent2, ent2pos):
    """Find the closest common ancestor of two entities in a dependency parse.

    Parameters
    ----------
    syntax : list of token rows
        Only four fields of a row are read here: ``row[0]`` (compared to tell
        tokens apart), ``row[1]`` (an offset, converted to int), ``row[2]``
        (the surface token) and ``row[5]`` (position of the head row inside
        ``syntax``; ``-1`` marks a root).
    ent1, ent2 : str
        Entity strings; only the part before the first space and the first
        hyphen is matched against ``row[2]``.
    ent1pos, ent2pos : int
        Offsets used to choose between several rows with the same token: the
        row whose ``row[1]`` is numerically closest wins (the later row on ties).

    Returns
    -------
    list
        ``[common, node1, node2]`` where ``node1``/``node2`` are the rows matched
        for the entities and ``common`` is the deepest row present in both
        ancestor chains. If an entity is itself a root, that entity's row is
        returned as ``common``.

    Raises
    ------
    ValueError
        If an entity token is absent from ``syntax``, if the head links form a
        cycle, or if the two tokens have no ancestor in common.
    """

    def get_node(e, p):
        """Return the position of the row matching entity ``e`` closest to offset ``p``."""
        head_word = e.split(" ")[0].split("-")[0]
        best_index = None
        best_distance = None
        for i, row in enumerate(syntax):
            if row[2] != head_word:
                continue
            distance = abs(int(row[1]) - p)
            # '<=' keeps the later row when two candidates are equally close.
            if best_distance is None or distance <= best_distance:
                best_index, best_distance = i, distance
        if best_index is None:
            raise ValueError("token {!r} not found in the syntax rows".format(head_word))
        return best_index

    def get_route(i):
        """Return the ancestors of row ``i``, nearest first, ending with the root."""
        route = []
        # A well-formed tree needs at most len(syntax) steps to reach a root.
        for _ in range(len(syntax)):
            h = int(syntax[i][5])
            if h == -1:
                return route
            route.append(syntax[h])
            i = h
        raise ValueError("head links never reach a root (cycle in the syntax rows)")

    ent_node1 = get_node(ent1, ent1pos)
    ent_node2 = get_node(ent2, ent2pos)
    node1, node2 = syntax[ent_node1], syntax[ent_node2]

    # Reverse the chains so that both start at the root.
    route1 = get_route(ent_node1)[::-1]
    route2 = get_route(ent_node2)[::-1]
    if len(route1) == 0:
        return [node1, node1, node2]
    if len(route2) == 0:
        return [node2, node1, node2]

    # Walk down from the root while both chains agree; the last agreeing row is the answer.
    common = None
    for step1, step2 in zip(route1, route2):
        if step1[0] != step2[0]:
            break
        common = step1
    if common is None:
        raise ValueError("the two tokens have no common ancestor")

    return [common, node1, node2]

In [8]:
import re
import nltk
from nltk import ngrams as ng
def get_sent_list(lemmas):
    """Collect lexicon sentiment scores for the n-grams found in ``lemmas``.

    ``lemmas`` is a list of strings (the caller passes the output of
    ``m.lemmatize``). N-gram sizes are tried from the largest allowed size
    (at most 10, and below ``len(lemmas)``) down to 1. Whenever the
    concatenation of an n-gram is a key of the global ``vocab``, its
    ``'sentiment'`` value is recorded and the lemma list used for the
    following (smaller) sizes is rebuilt from the original text with that
    match removed.

    Returns the list of recorded sentiment values, in the order found.
    """
    scores = []
    original_text = "".join(lemmas)
    upper = min(11, len(lemmas))
    for size in range(upper - 1, 0, -1):
        # The n-gram iterator is created from the lemma list current at this point;
        # rebinding ``lemmas`` below only affects the next sizes.
        for gram in ng(lemmas, size):
            candidate = "".join(list(gram))
            if candidate not in vocab:
                continue
            scores.append(vocab[candidate]['sentiment'])
            lemmas = m.lemmatize(original_text.replace(candidate, ""))
    return scores

def sentiment_extraction(x, ent_dict):
    """Annotate one row with entity pairs and the sentiment evidence between them.

    ``x`` is a row providing ``'lines'`` (the text) and ``'syntax'`` (its token
    rows); ``ent_dict`` maps entity strings to dicts that contain ``'initial'``.

    The row is modified and returned with five extra fields:
      * ``mutuals``       - results of ``get_mutual_node`` for every entity pair;
      * ``polarities``    - non-empty ``find_polar`` results, or, failing that, a
                            one-element list built from the lexicon score of the
                            last field of the common node when that is in ``vocab``;
      * ``ne_pairs``      - all ordered pairs of distinct (entity, offset) matches;
      * ``ne_substrings`` - (entity1, entity2, text between them) for pairs whose
                            second offset is more than 10 past the first;
      * ``sent_list``     - (entity1, entity2, lexicon scores of that text).
    """
    line, syntax = x['lines'], x['syntax']

    # Every whole-word occurrence of every known entity, as (entity, start offset).
    ne_list = []
    for ne in ent_dict.keys():
        pattern = re.compile(r"\b{}\b".format(re.escape(ne)), flags=re.UNICODE)
        ne_list.extend([(ne, found.start(0)) for found in pattern.finditer(line)])
    ne_pairs = [(first, second) for first in ne_list for second in ne_list if first != second]

    mutuals = []
    polarities = []
    for (ne1, ne1_pos), (ne2, ne2_pos) in ne_pairs:
        mut = get_mutual_node(syntax, ne1, ne1_pos, ne2, ne2_pos)
        if len(mut) > 0:
            mutuals.append(mut)

        polar = find_polar(mut[0], mut[1], mut[2])
        if polar != None and len(polar) > 0:
            polarities.append(polar)
            continue

        # Fallback: look the last field of the common node up in the lexicon.
        common_key = mut[0][-1]
        if common_key in vocab:
            polarities.append([(ent_dict[ne1]['initial'],
                                ent_dict[ne2]['initial'],
                                vocab[common_key]['sentiment'])])

    x['mutuals'] = mutuals
    x['polarities'] = polarities
    x['ne_pairs'] = ne_pairs

    # Text strictly between two mentions, only when the second starts well after the first.
    between = [
        (ne1, ne2, line[pos1 + len(ne1):pos2])
        for (ne1, pos1), (ne2, pos2) in ne_pairs
        if pos2 > pos1 + 10
    ]
    x['ne_substrings'] = between

    x['sent_list'] = [
        (ne1, ne2, get_sent_list(m.lemmatize(subs)))
        for ne1, ne2, subs in between
    ]

    return x

In [9]:
# Reading the article and extracting sentiment
text_lines, ent_dict = read_file(1)

# Run sentiment_extraction on every row; ent_dict is forwarded as its second argument.
text_lines = text_lines.apply(sentiment_extraction, axis=1, args=(ent_dict,))
text_lines


Unnamed: 0,index,lines,syntax,mutuals,polarities,ne_pairs,ne_substrings,sent_list
0,1,СМИ Ирана: Финляндия не хочет вступать в НАТО ...,"[[0, 3, СМИ, NOUN, Animacy=Inan|Case=Nom|Gende...","[([0, 3, СМИ, NOUN, Animacy=Inan|Case=Nom|Gend...",[],"[((СМИ, 0), (Ирана, 4)), ((СМИ, 0), (Финляндия...","[(СМИ, Финляндия, Ирана: ), (СМИ, НАТО, Иран...","[(СМИ, Финляндия, []), (СМИ, НАТО, [-1]), (СМИ..."
1,2,inosmi.ru,"[[0, 6, inosmi, ADJ, Animacy=Inan|Case=Nom|Gen...",[],[],[],[],[]
2,3,"5-го, 2016","[[0, 1, 5, NUM, fPOS=NUM++, -1, ROOT, 5], [1, ...",[],[],[],[],[]
3,4,Зачем американцам понадобилось перебросить 250...,"[[0, 5, Зачем, ADV, Degree=Pos|fPOS=ADV++, 2, ...","[[[31, 42, перебросить, VERB, Aspect=Perf|Verb...",[],"[((Сирии, 78), (Румийлан, 107)), ((Сирии, 78),...","[(Сирии, Румийлан, , к окрестностям города ), ...","[(Сирии, Румийлан, []), (Сирии, Khorasan, []),..."
4,5,30.04) Алиреза Резахах (Alireza Rezakhah).,"[[0, 5, 30.04, NUM, fPOS=NUM++, 2, nummod, 30....","[([15, 22, Резахах, NOUN, Animacy=Inan|Case=Lo...",[],"[((Алиреза Резахах (Alireza Rezakhah, 7), (Рез...",[],[]
5,6,"Согласно заявлениям из Пентагона, цель США — с...","[[0, 8, Согласно, ADP, fPOS=ADP++, 1, case, со...","[[[34, 38, цель, NOUN, Animacy=Inan|Case=Nom|G...",[],"[((Пентагона, 23), (США, 39)), ((Пентагона, 23...","[(Пентагона, США, , цель ), (Пентагона, ИГИЛ, ...","[(Пентагона, США, []), (Пентагона, ИГИЛ, [1]),..."
6,7,Однако в Дамаске действия Вашингтона резко осу...,"[[0, 6, Однако, ADV, Degree=Pos|fPOS=ADV++, 6,...","[[[43, 50, осудили, VERB, Aspect=Perf|Mood=Ind...","[[(дамаск, дамаск, -1)], [(сирия, вашингтон, -...","[((Сирии, 103), (Дамаске, 9)), ((Сирии, 103), ...","[(Дамаске, Сирии, действия Вашингтона резко о...","[(Дамаске, Сирии, [-1, -1]), (Дамаске, Вашингт..."
7,8,Примерно в аналогичном ключе высказалась и Мос...,"[[0, 8, Примерно, ADV, Degree=Pos|fPOS=ADV++, ...","[[[29, 40, высказалась, VERB, Aspect=Perf|Gend...",[],"[((Москва, 43), (Дамаском, 74)), ((Москва, 43)...","[(Москва, Дамаском, , где «несогласованные с )...","[(Москва, Дамаском, []), (Москва, Белого дома,..."
8,9,"Почему же и в Москве и Дамаске, которые также ...","[[0, 6, Почему, ADV, Degree=Pos|fPOS=ADV++, 19...","[[[98, 105, вызвали, VERB, Aspect=Perf|Mood=In...",[],"[((Сирии, 71), (США, 157)), ((Сирии, 71), (Дам...","[(Сирии, США, , действия Вашингтона вызвали ст...","[(Сирии, США, [0, 0]), (Сирии, Вашингтона, [])..."
9,10,"Следует напомнить, указывает он, что Сирия ока...","[[0, 7, Следует, VERB, Aspect=Imp|Mood=Ind|Num...","[[[43, 52, оказалась, VERB, Aspect=Perf|Gender...",[],"[((ИГ, 224), (Сирия, 37)), ((ИГ, 224), (Обамы,...","[(Сирия, ИГ, оказалась одним из самых проблем...","[(Сирия, ИГ, [-1, 0, -1, -1, -1, -1, -1]), (Си..."


In [10]:
def get_summed_over_sent(sent_list, entities=None):
    """Sum lexicon scores per lemmatised entity pair.

    Parameters
    ----------
    sent_list : iterable of ``(ne1, ne2, scores)``
        ``ne1``/``ne2`` are entity strings, ``scores`` an iterable of numbers.
    entities : dict, optional
        Maps an entity string to a dict holding its lemma under ``'initial'``.
        Defaults to the notebook-level ``ent_dict``.

    Returns
    -------
    dict
        ``{(lemma1, lemma2): total}``. Pairs whose entities share the same
        lemmas are accumulated into one entry.
    """
    if entities is None:
        # Backward-compatible default: the dictionary produced by read_file in the notebook.
        entities = ent_dict
    sdict = dict()
    for ne1, ne2, slist in sent_list:
        # The lookup key must be the same lemma pair that is stored; testing the raw
        # (ne1, ne2) pair instead made repeated pairs overwrite each other.
        key = (entities[ne1]['initial'], entities[ne2]['initial'])
        sdict[key] = sdict.get(key, 0) + sum(slist)
    return sdict
# Per-row dictionaries of lexicon totals keyed by lemmatised entity pair, then display them.
text_lines['summed'] =  text_lines['sent_list'].apply(get_summed_over_sent)
text_lines['summed']

0     {('сми', 'финляндия'): 0, ('сми', 'нато'): -1,...
1                                                    {}
2                                                    {}
3     {('сирия', 'румийлан'): 0, ('сирия', 'khorasan...
4                                                    {}
5     {('пентагон', 'сша'): 0, ('пентагон', 'игила')...
6     {('дамаск', 'сирия'): -2, ('дамаск', 'вашингто...
7     {('москва', 'дамаск'): 0, ('москва', 'белый до...
8     {('сирия', 'сша'): 0, ('сирия', 'вашингтон'): ...
9     {('сирия', 'иго'): -6, ('сирия', 'обама'): -1,...
10                     {('обама', 'ближний восток'): 2}
11    {('ракка', 'иго'): 0, ('вкс', 'ракка'): 0, ('в...
12                                                   {}
13                                                   {}
14                            {('дамаск', 'резах'): -1}
15    {('белый дом', 'москва'): -2, ('белый дом', 'д...
16    {('сми', 'сирия'): -1, ('сми', 'москва'): -1, ...
17          {('сша', 'дамаск'): 0, ('рф', 'дамас

In [11]:
def get_summed_over_polar(polar):
    """Add up polarity scores per entity pair.

    ``polar`` is a list of groups, each group being an iterable of
    ``(ne1, ne2, score)`` triples. Returns ``{(ne1, ne2): total score}``,
    with keys in the order in which the pairs first appear.
    """
    totals = {}
    for ne1, ne2, score in (triple for group in polar for triple in group):
        pair = (ne1, ne2)
        if pair not in totals:
            totals[pair] = score
        else:
            totals[pair] += score
    return totals
# Per-row dictionaries of summed polarity scores per entity pair, then display them.
text_lines['sum_por'] =  text_lines['polarities'].apply(get_summed_over_polar)
text_lines['sum_por']

0                                                    {}
1                                                    {}
2                                                    {}
3                                                    {}
4                                                    {}
5                                                    {}
6     {('дамаск', 'дамаск'): -2, ('сирия', 'вашингто...
7                                                    {}
8                                                    {}
9                                                    {}
10    {('обама', 'ближний восток'): 0, ('ближний вос...
11                                                   {}
12                                                   {}
13                                                   {}
14                                                   {}
15    {('белый дом', 'москва'): -1, ('белый дом', 'д...
16                                                   {}
17                                              

In [12]:
# Running totals per key for the article being processed; filled by get_summed.
opinions = {}


def get_summed(sent_list):
    """Add every value of the dict ``sent_list`` into the global ``opinions``.

    A key not seen before is inserted with its value; a known key is
    incremented. Nothing is returned - the function works through its side
    effect on ``opinions``.
    """
    for pair, score in sent_list.items():
        if pair not in opinions:
            opinions[pair] = score
        else:
            opinions[pair] += score
            
# Fold both per-row dictionaries into ``opinions``; the apply results are discarded.
_ = text_lines['summed'].apply(get_summed)
_ = text_lines['sum_por'].apply(get_summed)

# One row per entity pair with its accumulated score.
df = pd.DataFrame(
    [(ne1, ne2, total) for (ne1, ne2), total in opinions.items()],
    columns=['ne1', 'ne2', 'sent'],
)
df

Unnamed: 0,ne1,ne2,sent
0,сми,финляндия,0
1,сми,нато,-1
2,сми,россия,-2
3,иран,нато,-1
4,иран,россия,-2
5,финляндия,нато,0
6,финляндия,россия,-2
7,нато,россия,-2
8,сирия,румийлан,0
9,сирия,khorasan,0


In [13]:
# Turn the numeric totals into labels: 'negative' below zero, 'positive' above zero, None for zero.
# The column is overwritten in place, so re-running this cell would compare the labels with 0.
df['sent'] = df['sent'].apply(lambda x: 'negative' if x<0 else ('positive' if x>0 else None))


In [14]:
# Write only the rows that received a label, without index and without a header row.
df[~df['sent'].isnull()].to_csv('Texts/1.csv',index=False,header=False)

In [15]:
# Reading every article and extracting opinions
from tqdm import tqdm
for i in tqdm(range(1, 46)):
    # These article numbers are skipped (the notebook records no reason for it).
    if i in (9, 18, 22, 25, 26, 35):
        continue
    print(i)

    text_lines, ent_dict = read_file(i)
    text_lines = text_lines.apply(sentiment_extraction, axis=1, args=(ent_dict,))

    # Fresh accumulator for this article; get_summed updates the global of this name.
    opinions = {}
    text_lines['summed'] = text_lines['sent_list'].apply(get_summed_over_sent)
    text_lines['sum_por'] = text_lines['polarities'].apply(get_summed_over_polar)
    _ = text_lines['summed'].apply(get_summed)
    _ = text_lines['sum_por'].apply(get_summed)

    df = pd.DataFrame(
        [(ne1, ne2, total) for (ne1, ne2), total in opinions.items()],
        columns=['ne1', 'ne2', 'sent'],
    )
    df['sent'] = df['sent'].apply(
        lambda total: 'negative' if total < 0 else ('positive' if total > 0 else None)
    )
    # Drop pairs of an entity with itself, then export only the labelled rows.
    df = df[df['ne1'] != df['ne2']]
    df[~df['sent'].isnull()].to_csv('Texts/{}.csv'.format(i), index=False, header=False)

  0%|          | 0/45 [00:00<?, ?it/s]

1


  2%|▏         | 1/45 [00:02<01:34,  2.14s/it]

2


  4%|▍         | 2/45 [00:02<01:14,  1.74s/it]

3


  7%|▋         | 3/45 [00:03<01:02,  1.49s/it]

4


  9%|▉         | 4/45 [00:05<00:59,  1.44s/it]

5


 11%|█         | 5/45 [00:06<00:51,  1.28s/it]

6


 13%|█▎        | 6/45 [00:06<00:42,  1.08s/it]

7


 16%|█▌        | 7/45 [00:06<00:31,  1.21it/s]

8


 18%|█▊        | 8/45 [00:07<00:30,  1.23it/s]

10


 22%|██▏       | 10/45 [00:09<00:27,  1.28it/s]

11


 24%|██▍       | 11/45 [00:09<00:25,  1.32it/s]

12


 27%|██▋       | 12/45 [00:10<00:24,  1.33it/s]

13


 29%|██▉       | 13/45 [00:11<00:22,  1.40it/s]

14


 31%|███       | 14/45 [00:14<00:48,  1.56s/it]

15


 33%|███▎      | 15/45 [00:15<00:38,  1.28s/it]

16


 36%|███▌      | 16/45 [00:15<00:28,  1.03it/s]

17


 38%|███▊      | 17/45 [00:16<00:29,  1.05s/it]

19


 42%|████▏     | 19/45 [00:17<00:21,  1.24it/s]

20


 44%|████▍     | 20/45 [00:17<00:16,  1.48it/s]

21


 47%|████▋     | 21/45 [00:18<00:18,  1.31it/s]

23


 51%|█████     | 23/45 [00:19<00:15,  1.47it/s]

24


 53%|█████▎    | 24/45 [00:20<00:15,  1.36it/s]

27


 60%|██████    | 27/45 [00:22<00:12,  1.42it/s]

28


 62%|██████▏   | 28/45 [00:23<00:11,  1.44it/s]

29


 64%|██████▍   | 29/45 [00:23<00:09,  1.62it/s]

30


 67%|██████▋   | 30/45 [00:24<00:09,  1.63it/s]

31


 69%|██████▉   | 31/45 [00:24<00:07,  1.97it/s]

32


 71%|███████   | 32/45 [00:25<00:09,  1.32it/s]

33


 73%|███████▎  | 33/45 [00:26<00:08,  1.35it/s]

34


 76%|███████▌  | 34/45 [00:27<00:08,  1.32it/s]

36


 80%|████████  | 36/45 [00:28<00:06,  1.48it/s]

37


 82%|████████▏ | 37/45 [00:28<00:05,  1.41it/s]

38


 84%|████████▍ | 38/45 [00:29<00:04,  1.62it/s]

39


 87%|████████▋ | 39/45 [00:30<00:04,  1.46it/s]

40


 89%|████████▉ | 40/45 [00:31<00:03,  1.38it/s]

41


 91%|█████████ | 41/45 [00:32<00:03,  1.09it/s]

42


 93%|█████████▎| 42/45 [00:33<00:02,  1.14it/s]

43


 96%|█████████▌| 43/45 [00:33<00:01,  1.22it/s]

44


 98%|█████████▊| 44/45 [00:35<00:01,  1.20s/it]

45


100%|██████████| 45/45 [00:37<00:00,  1.15s/it]


In [None]:
# Display the per-line frame currently bound to text_lines (the last article processed).
text_lines

In [16]:
# Display the entity-pair frame currently bound to df; pairs without a label show an empty 'sent'.
df

Unnamed: 0,ne1,ne2,sent
0,франк-вальтер штайнмайер (frank-walter steinmeier,нато,
1,франк-вальтер штайнмайер (frank-walter steinmeier,штайнмайер,
2,штайнмайер,нато,
3,франк-вальтер штайнмайер,нато,
4,франк-вальтер штайнмайер,штайнмайер,positive
5,нато,альянс,
6,нато,польша,
7,нато,статья 5 североатлантический договор,
8,польша,альянс,
9,польша,статья 5 североатлантический договор,
