# NLP HSE School

In [1]:
from pymystem3 import Mystem
import pandas as pd
import re
m=Mystem()
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [2]:
sentilex = pd.read_csv('RuSentiLex2017_revised_2utf.txt',
                       names=['word','part','initial','sentiment','source','amb1','amb2'],
                       skiprows=20,skipinitialspace=True,index_col=2)
sentilex['sentiment'] = sentilex['sentiment'].map({'negative':-1,'positive':1,'neutral':0,'positive/negative':0})

In [3]:
vocab = sentilex.to_dict(orient='index')

In [4]:

def read_file(n):
    file = pd.read_csv('Texts/art{}.txt'.format(n),sep='\n',names=['lines'])
    file.lines = file.lines.str.replace("{Author, Unknown} ","")
    
    ent_df = pd.read_csv('Texts/art{}.ann'.format(n),sep='\t',names=['ind','descr','ne'],header=None)
    ent_df['initial'] = ent_df['ne'].apply(lambda x: ("".join(m.lemmatize(x.lower()))).replace('\n',""))
    ent_df = ent_df.set_index(ent_df['ne'])
    syn_df = pd.read_json('Texts/art{}syn.json'.format(n))

    syn_df = syn_df.reset_index()
    def get_struct_data(x):
        sentl =[]
        pars=[]
        for text in x:
            for sent in text:
                sentl = sent.split(' ')
                to_app = [sentl[i] for i in range(len(sentl)) if i in([0,1,4,6,8,10,12])]
                to_app.append(m.lemmatize(to_app[2])[0])
                pars.append(to_app)
        return(pars)

    syn_df['syntax'] = syn_df['syntax'].apply(get_struct_data)
    
    syn_df['index']=syn_df['index'] +1
    file = file.reset_index().merge(syn_df)
    return(file.drop('text',1), ent_df.to_dict(orient='index'))


In [5]:
text_lines, ent_dict = read_file(1)

In [6]:
#получение данных из синтаксиса
def get_mutual_node(syntax,ent1,ent1pos,ent2,ent2pos):
    
    def get_node(e,p):
        
        e = e.split(" ")[0].split("-")[0]
        ent_dict = dict()
        for i in range(len(syntax)):
            if(syntax[i][2]==e):
                
                ent_dict[abs(int(syntax[i][1])-p)]=i
        
        return ent_dict[sorted(ent_dict)[0]]
    def get_route(i):
        route = list()
        for _ in range(len(syntax)):
            h=int(syntax[i][5])
            if(h ==-1): return route
            route.append(syntax[h])
            i=h
    ent_node1 = get_node(ent1,ent1pos)
    ent_node2 = get_node(ent2,ent2pos)
    
    route1= get_route(ent_node1)[::-1]
    route2= get_route(ent_node2)[::-1]
    if(len(route1)==0): return syntax[ent_node1],syntax[ent_node1],syntax[ent_node2]
    if(len(route2)==0): return syntax[ent_node2],syntax[ent_node1],syntax[ent_node2]

    for i in range(min(len(route1),len(route2))):
        
        if(route1[i][0]==route2[i][0]): obj=route1[i]
        else: break
    
    
    return obj,syntax[ent_node1],syntax[ent_node2]

In [11]:
import re
import nltk
from nltk import ngrams as ng
def get_sent_list(lemmas):
    sent_list = []
    string = "".join(lemmas)
    for i in reversed(range(1,min(11,len(lemmas)))):
        ngrams = ng(lemmas,i)
        for ngram in ngrams:
            subs = "".join(list(ngram))
            if(subs in vocab): 
                sent_list.append(vocab[subs]['sentiment'])
                lemmas = m.lemmatize(string.replace(subs,""))
    return sent_list

def sentiment_extraction(x,ent_dict):
    line = x['lines']
    syntax = x['syntax']
    ne_list = []
    for ne in ent_dict.keys():
        regex= re.compile(r"\b{}\b".format(re.escape(ne)),flags=re.UNICODE)
        for pos in ([m.start(0) for m in re.finditer(regex, line)]):
            ne_list.append((ne,pos))
    ne_pairs  = [(ne1,ne2) for ne1 in ne_list for ne2 in ne_list if ne1 != ne2]
    
    for ((ne1, ne1_pos), (ne2,ne2_pos)) in ne_pairs:
        x['mutual'] = get_mutual_node(syntax,ne1, ne1_pos, ne2, ne2_pos)
                   
    x['ne_pairs'] = ne_pairs
    
    x['ne_substrings'] = [(ne1,ne2,line[pos1+len(ne1):pos2]) for (ne1,pos1),(ne2,pos2) in ne_pairs if pos2>pos1+10]
    
    sent_list = list()
    for ne1,ne2,subs in x['ne_substrings']:
        sent_list.append((ne1,ne2,get_sent_list(m.lemmatize(subs))))
    x['sent_list'] = sent_list
    
    return(x)

In [12]:
#считывание и извлечение
text_lines, ent_dict = read_file(1)

text_lines = text_lines.apply(lambda x:sentiment_extraction(x,ent_dict),axis=1)[['ne_pairs','ne_substrings','sent_list']]

In [13]:
def get_summed_over_sent(sent_list):
    sdict = dict()
    for ne1,ne2,slist in sent_list:
        if (ne1,ne2) in sdict:sdict[(ent_dict[ne1]['initial'],ent_dict[ne2]['initial'])]+=sum(slist)
        else: sdict[(ent_dict[ne1]['initial'],ent_dict[ne2]['initial'])]=sum(slist) 
    return sdict
text_lines['summed'] =  text_lines['sent_list'].apply(get_summed_over_sent)
text_lines['summed']

0     {('сми', 'финляндия'): 0, ('сми', 'нато'): -1,...
1                                                    {}
2                                                    {}
3     {('сирия', 'румийлан'): 0, ('сирия', 'khorasan...
4                                                    {}
5     {('пентагон', 'сша'): 0, ('пентагон', 'игила')...
6     {('дамаск', 'сирия'): -2, ('дамаск', 'вашингто...
7     {('москва', 'дамаск'): 0, ('москва', 'белый до...
8     {('сирия', 'сша'): 0, ('сирия', 'вашингтон'): ...
9     {('сирия', 'иго'): -6, ('сирия', 'обама'): -1,...
10                     {('обама', 'ближний восток'): 2}
11    {('ракка', 'иго'): 0, ('вкс', 'ракка'): 0, ('в...
12                                                   {}
13                                                   {}
14                            {('дамаск', 'резах'): -1}
15    {('белый дом', 'москва'): -2, ('белый дом', 'д...
16    {('сми', 'сирия'): -1, ('сми', 'москва'): -1, ...
17          {('сша', 'дамаск'): 0, ('рф', 'дамас

In [14]:
opinions = dict()
def get_summed(sent_list):
        for k in sent_list.keys():
            if k in opinions : opinions[k]+=sent_list[k]
            else: opinions[k]=sent_list[k]
            
_ = text_lines['summed'].apply(get_summed)
df = pd.DataFrame([(ne1,ne2,opinions[(ne1,ne2)]) for (ne1,ne2) in opinions.keys()],columns=['ne1','ne2','sent'])
df

Unnamed: 0,ne1,ne2,sent
0,сми,финляндия,0
1,сми,нато,-1
2,сми,россия,-2
3,иран,нато,-1
4,иран,россия,-2
5,финляндия,нато,0
6,финляндия,россия,-2
7,нато,россия,-2
8,сирия,румийлан,0
9,сирия,khorasan,0


In [15]:
df['sent'] = df['sent'].apply(lambda x: 'negative' if x<0 else ('positive' if x>0 else None))


In [16]:
df[~df['sent'].isnull()].to_csv('Texts/1.csv',index=False,header=False)

In [28]:
#считывание и извлечение
from tqdm import tqdm
for i in tqdm(range(1,46)):
    if(i in [9,22,26,18,25]): continue
    print (i)
    text_lines, ent_dict = read_file(i)

    text_lines = text_lines.apply(lambda x:sentiment_extraction(x,ent_dict),axis=1)[['ne_pairs','ne_substrings','sent_list']]
    opinions = dict()
    text_lines['summed'] =  text_lines['sent_list'].apply(get_summed_over_sent)
    _ = text_lines['summed'].apply(get_summed)
    df = pd.DataFrame([(ne1,ne2,opinions[(ne1,ne2)]) for (ne1,ne2) in opinions.keys()],columns=['ne1','ne2','sent'])
    df['sent'] = df['sent'].apply(lambda x: 'negative' if x<0 else ('positive' if x>0 else None))
    df[~df['sent'].isnull()].to_csv('Texts/{}.csv'.format(i),index=False,header=False)


  0%|          | 0/45 [00:00<?, ?it/s][A

1



  2%|▏         | 1/45 [00:00<00:36,  1.20it/s]

2


  4%|▍         | 2/45 [00:01<00:29,  1.44it/s]

3


  7%|▋         | 3/45 [00:01<00:25,  1.63it/s]

4


  9%|▉         | 4/45 [00:02<00:24,  1.68it/s]

5


 11%|█         | 5/45 [00:02<00:22,  1.76it/s]

6


 16%|█▌        | 7/45 [00:03<00:14,  2.60it/s]

7
8


 18%|█▊        | 8/45 [00:03<00:13,  2.68it/s]

10


 22%|██▏       | 10/45 [00:04<00:15,  2.28it/s]

11


 24%|██▍       | 11/45 [00:05<00:14,  2.39it/s]

12


 27%|██▋       | 12/45 [00:05<00:13,  2.44it/s]

13


 29%|██▉       | 13/45 [00:05<00:13,  2.46it/s]

14


 31%|███       | 14/45 [00:07<00:23,  1.30it/s]

15


 33%|███▎      | 15/45 [00:07<00:20,  1.43it/s]

16


 36%|███▌      | 16/45 [00:08<00:16,  1.81it/s]

17


 38%|███▊      | 17/45 [00:08<00:14,  1.94it/s]

19


 42%|████▏     | 19/45 [00:09<00:11,  2.34it/s]

20


 44%|████▍     | 20/45 [00:09<00:09,  2.61it/s]

21


 47%|████▋     | 21/45 [00:09<00:11,  2.13it/s]

23


 51%|█████     | 23/45 [00:10<00:08,  2.56it/s]

24


 53%|█████▎    | 24/45 [00:10<00:09,  2.25it/s]

27


 60%|██████    | 27/45 [00:11<00:06,  2.70it/s]

28


 62%|██████▏   | 28/45 [00:12<00:07,  2.37it/s]

29


 64%|██████▍   | 29/45 [00:12<00:05,  2.70it/s]

30


 69%|██████▉   | 31/45 [00:12<00:04,  3.29it/s]

31
32


 71%|███████   | 32/45 [00:13<00:06,  2.06it/s]

33


 73%|███████▎  | 33/45 [00:14<00:05,  2.18it/s]

34


 76%|███████▌  | 34/45 [00:14<00:04,  2.25it/s]

35


IndexError: ('list index out of range', 'occurred at index 29')