# NLP HSE School

In [23]:
from pymystem3 import Mystem
import pandas as pd
import re
# Shared Mystem instance; the cells below call m.lemmatize() on it.
m=Mystem()
# Widen pandas' text output so wide frames are not wrapped across several lines.
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [24]:
# Load the sentiment lexicon: the first 20 lines of the file are skipped and
# the frame is indexed by the column at position 2 of `names` ('initial').
sentilex = pd.read_csv(
    'RuSentiLex2017_revised_2utf.txt',
    names=['word', 'part', 'initial', 'sentiment', 'source', 'amb1', 'amb2'],
    skiprows=20,
    skipinitialspace=True,
    index_col=2,
).assign(
    # Recode the textual labels as numbers; labels absent from the mapping
    # become NaN.
    sentiment=lambda lexicon: lexicon['sentiment'].map(
        {'negative': -1, 'positive': 1, 'neutral': 0, 'positive/negative': 0}
    )
)

In [25]:
# Lexicon as a nested dict: {index label: {column name: value}}; looked up by
# phrase in get_sent_list.
# NOTE(review): to_dict(orient='index') requires unique index labels in recent
# pandas versions — confirm the lexicon index has no duplicates.
vocab = sentilex.to_dict(orient='index')

In [26]:

def read_file(n):
    """Load article ``n``: its text lines, entity annotations and syntax parse.

    Three files are read from ``Texts/``: ``art{n}.txt`` (one row per text
    line), ``art{n}.ann`` (tab-separated columns ``ind``, ``descr``, ``ne``)
    and ``art{n}syn.json`` (must provide ``syntax`` and ``text`` columns).

    Parameters
    ----------
    n : int or str
        Article number substituted into the file names.

    Returns
    -------
    tuple
        ``(lines_df, entities)``. ``lines_df`` is the text-line frame merged
        with the syntax frame on their shared columns, with the ``text``
        column dropped. ``entities`` maps every annotated entity string
        (``ne``) to a dict of its annotation columns plus ``'initial'``, the
        lower-cased, lemmatized entity string.
    """
    # Positions of the space-separated parser fields kept for every token.
    # presumably: start offset, end offset, token, POS, features, head index,
    # dependency relation — TODO confirm against the parser output format.
    kept_fields = (0, 1, 4, 6, 8, 10, 12)

    def get_struct_data(x):
        """Flatten nested token lines into rows of kept fields plus a lemma."""
        rows = []
        for text in x:
            for token_line in text:
                fields = token_line.split(' ')
                row = [fields[i] for i in kept_fields if i < len(fields)]
                # Third kept field is the token; store its first Mystem lemma last.
                row.append(m.lemmatize(row[2])[0])
                rows.append(row)
        return rows

    # NOTE(review): recent pandas releases appear to reject sep='\n' in
    # read_csv — confirm against the installed version before upgrading.
    lines_df = pd.read_csv('Texts/art{}.txt'.format(n), sep='\n', names=['lines'])
    # Literal (non-regex) removal, made explicit because the default of
    # `regex` differs between pandas versions.
    lines_df['lines'] = lines_df['lines'].str.replace("{Author, Unknown} ", "", regex=False)

    ent_df = pd.read_csv('Texts/art{}.ann'.format(n), sep='\t',
                         names=['ind', 'descr', 'ne'], header=None)
    ent_df['initial'] = ent_df['ne'].apply(
        lambda name: "".join(m.lemmatize(name.lower())).replace('\n', ""))
    # Index by the entity string while keeping 'ne' as a regular column too.
    ent_df = ent_df.set_index(ent_df['ne'])

    syn_df = pd.read_json('Texts/art{}syn.json'.format(n)).reset_index()
    syn_df['syntax'] = syn_df['syntax'].apply(get_struct_data)
    # Shift the syntax index by one so it lines up with the text-line index.
    syn_df['index'] = syn_df['index'] + 1

    merged = lines_df.reset_index().merge(syn_df)
    # `columns=` replaces the positional axis argument, which newer pandas
    # versions no longer accept.
    return (merged.drop(columns='text'), ent_df.to_dict(orient='index'))


In [27]:
# Obtaining polarities
from sentiframes_proc import find_polar
# `table` is not referenced again in the visible cells of this notebook.
table = pd.read_csv('sentiframes_df.csv')

# Three hand-written 8-field syntax rows used to try out find_polar; they
# appear to follow the row layout produced by read_file (7 parser fields + lemma).
lst = [['31', '42', 'бомбит', 'VERB', 'Aspect=Imperf|VerbForm=Inf|fPOS=VERB++', '2', 'nsubj', 'бомбить'],       
       ['78', '83', 'Сирии', 'NOUN', 'Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing|fPOS=NOUN++', '10', 'dobj', 'сирия'],
       ['107', '115', 'Румийлан', 'NOUN', 'Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing|fPOS=NOUN++', '15', 'appos', 'румийлан']]


find_polar(lst[0], lst[1], lst[2])


[('румийлан', 'сирия', 'neg'), ('сирия', 'румийлан', 'neg')]

In [28]:
# Load article 1: the per-line text/syntax frame and the entity dictionary.
text_lines, ent_dict = read_file(1)

In [29]:
#получение данных из синтаксиса
def get_mutual_node(syntax,ent1,ent1pos,ent2,ent2pos):
    
    def get_node(e,p):
        
        e = e.split(" ")[0].split("-")[0]
        ent_dict = dict()
        for i in range(len(syntax)):
            if(syntax[i][2]==e):
                
                ent_dict[abs(int(syntax[i][1])-p)]=i
        
        return ent_dict[sorted(ent_dict)[0]]
    def get_route(i):
        route = list()
        for _ in range(len(syntax)):
            h=int(syntax[i][5])
            if(h ==-1): return route
            route.append(syntax[h])
            i=h
    ent_node1 = get_node(ent1,ent1pos)
    ent_node2 = get_node(ent2,ent2pos)
    
    route1= get_route(ent_node1)[::-1]
    route2= get_route(ent_node2)[::-1]
    if(len(route1)==0): return syntax[ent_node1],syntax[ent_node1],syntax[ent_node2]
    if(len(route2)==0): return syntax[ent_node2],syntax[ent_node1],syntax[ent_node2]

    for i in range(min(len(route1),len(route2))):
        
        if(route1[i][0]==route2[i][0]): obj=route1[i]
        else: break
    
    
    return [obj,syntax[ent_node1],syntax[ent_node2]]

In [32]:
import re
import nltk
from nltk import ngrams as ng
def get_sent_list(lemmas):
    """Collect sentiment scores of lexicon phrases found in a token sequence.

    Candidate phrases are runs of consecutive tokens joined without a
    separator, tried from the longest (at most 10 tokens) down to single
    tokens. A run that is a key of the global ``vocab`` contributes
    ``vocab[run]['sentiment']`` and its tokens are consumed, so shorter
    phrases inside an already matched phrase are not counted again.

    Parameters
    ----------
    lemmas : list of str
        Tokens in text order. presumably Mystem output, where separators are
        tokens of their own — TODO confirm. The list is not modified.

    Returns
    -------
    list
        Sentiment values, longest phrases first and left to right within one
        phrase length.
    """
    # Work on a copy: matched spans are deleted as they are found.
    tokens = list(lemmas)
    sent_list = []
    longest = min(10, len(tokens))
    for size in range(longest, 0, -1):
        start = 0
        while start + size <= len(tokens):
            phrase = "".join(tokens[start:start + size])
            if phrase in vocab:
                sent_list.append(vocab[phrase]['sentiment'])
                # Consume the match; the next candidate starts at the same index.
                del tokens[start:start + size]
            else:
                start += 1
    return sent_list

def sentiment_extraction(x,ent_dict):
    """Annotate one text-line row with entity pairs and their sentiment data.

    Parameters
    ----------
    x : mapping (DataFrame row)
        Must provide ``'lines'`` (the text) and ``'syntax'`` (the token rows
        passed on to ``get_mutual_node``).
    ent_dict : dict
        Its keys are the entity strings searched for in the line.

    Returns
    -------
    The same ``x`` with these entries set:

    * ``mutuals`` – one ``get_mutual_node`` result per entity pair;
    * ``polarities`` – one ``find_polar`` result per entity pair, parallel
      to ``mutuals``;
    * ``ne_pairs`` – all ordered pairs of distinct ``(entity, position)``
      occurrences;
    * ``ne_substrings`` – ``(ne1, ne2, text between them)`` for pairs where
      the second entity starts more than 10 characters after the first;
    * ``sent_list`` – ``(ne1, ne2, sentiment values)`` for each substring.
    """
    line = x['lines']
    syntax = x['syntax']

    # Every whole-word occurrence of every known entity, with its start offset.
    ne_list = []
    for ne in ent_dict.keys():
        regex = re.compile(r"\b{}\b".format(re.escape(ne)),flags=re.UNICODE)
        # `match`, not `m`: avoids shadowing the notebook's Mystem instance.
        ne_list.extend((ne, match.start(0)) for match in regex.finditer(line))
    ne_pairs = [(ne1,ne2) for ne1 in ne_list for ne2 in ne_list if ne1 != ne2]

    mutuals = list()
    polarities = list()
    for (ne1, ne1_pos), (ne2, ne2_pos) in ne_pairs:
        mut = get_mutual_node(syntax,ne1, ne1_pos, ne2, ne2_pos)
        mutuals.append(mut)
        # Store the result: it used to be bound to a misspelled name and lost.
        polarities.append(find_polar(mut[0], mut[1], mut[2]))

    x['mutuals'] = mutuals
    x['polarities'] = polarities
    x['ne_pairs'] = ne_pairs

    # Text between the end of the first entity and the start of the second.
    x['ne_substrings'] = [(ne1,ne2,line[pos1+len(ne1):pos2])
                          for (ne1,pos1),(ne2,pos2) in ne_pairs if pos2>pos1+10]

    x['sent_list'] = [(ne1,ne2,get_sent_list(m.lemmatize(subs)))
                      for ne1,ne2,subs in x['ne_substrings']]

    return(x)

In [33]:
# Reading and extraction
text_lines, ent_dict = read_file(1)


# Row by row: run sentiment_extraction on every text line of the article.
text_lines = text_lines.apply(lambda x:sentiment_extraction(x,ent_dict),axis=1)
text_lines


AttributeError: ("'list' object has no attribute 'split'", 'occurred at index 0')

In [None]:
def get_summed_over_sent(sent_list):
    """Sum sentiment values per lemmatized entity pair for one text line.

    Parameters
    ----------
    sent_list : iterable of (ne1, ne2, values)
        ``ne1`` and ``ne2`` are keys of the global ``ent_dict``; ``values``
        is a list of numbers.

    Returns
    -------
    dict
        ``{(initial1, initial2): total}`` where ``initialN`` is
        ``ent_dict[neN]['initial']``. Pairs that map to the same key are
        accumulated.
    """
    sdict = dict()
    for ne1, ne2, slist in sent_list:
        # Use the same key for the lookup and the store; testing the raw
        # (ne1, ne2) pair made repeated pairs overwrite each other.
        key = (ent_dict[ne1]['initial'], ent_dict[ne2]['initial'])
        sdict[key] = sdict.get(key, 0) + sum(slist)
    return sdict
# Per-line totals: turn each row's sent_list into a {(lemma1, lemma2): sum} dict.
text_lines['summed'] =  text_lines['sent_list'].apply(get_summed_over_sent)
text_lines['summed']

In [None]:
# Running totals per entity pair, filled in by get_summed.
opinions = {}


def get_summed(sent_list):
    """Fold one mapping of pair -> score into the global ``opinions`` totals.

    Pairs already present are incremented; new pairs are inserted.
    Returns None.
    """
    for pair, score in sent_list.items():
        if pair not in opinions:
            opinions[pair] = score
            continue
        opinions[pair] += score
            
# Fold every line's totals into `opinions`; apply is used only for its side effect.
_ = text_lines['summed'].apply(get_summed)
# One row per ordered entity pair with its accumulated score.
df = pd.DataFrame(
    [(first, second, total) for (first, second), total in opinions.items()],
    columns=['ne1', 'ne2', 'sent'],
)
df

In [None]:
# Replace numeric totals by labels: 'negative' below zero, 'positive' above
# zero, otherwise None. Overwrites the column, so re-running this cell would
# compare the string labels with 0.
df['sent'] = df['sent'].apply(lambda x: 'negative' if x<0 else ('positive' if x>0 else None))


In [None]:
# Write the labelled pairs (rows with a null label dropped) without index or header.
df[~df['sent'].isnull()].to_csv('Texts/1.csv',index=False,header=False)

In [None]:
# Reading and extraction for every article
from tqdm import tqdm
for i in tqdm(range(1, 46)):
    # These article numbers are skipped.
    if i in (9, 18, 22, 25, 26, 35):
        continue
    print(i)
    text_lines, ent_dict = read_file(i)

    text_lines = text_lines.apply(lambda row: sentiment_extraction(row, ent_dict), axis=1)
    text_lines = text_lines[['ne_pairs', 'ne_substrings', 'sent_list']]
    # Fresh totals for this article; get_summed writes into this global.
    opinions = {}
    text_lines['summed'] = text_lines['sent_list'].apply(get_summed_over_sent)
    _ = text_lines['summed'].apply(get_summed)
    df = pd.DataFrame(
        [(first, second, total) for (first, second), total in opinions.items()],
        columns=['ne1', 'ne2', 'sent'],
    )
    # Totals become labels; a zero total gives None and is left out of the CSV.
    df['sent'] = df['sent'].apply(
        lambda total: 'negative' if total < 0 else 'positive' if total > 0 else None
    )
    df[df['sent'].notnull()].to_csv('Texts/{}.csv'.format(i), index=False, header=False)

In [11]:
# Display the current text_lines frame.
# NOTE(review): the saved output below has no 'summed' column, so it presumably
# comes from an earlier run — re-execute to refresh.
text_lines

Unnamed: 0,ne_pairs,ne_substrings,sent_list
0,"[((СМИ, 0), (Ирана, 4)), ((СМИ, 0), (Финляндия...","[(СМИ, Финляндия, Ирана: ), (СМИ, НАТО, Иран...","[(СМИ, Финляндия, []), (СМИ, НАТО, [-1]), (СМИ..."
1,[],[],[]
2,[],[],[]
3,"[((Сирии, 78), (Румийлан, 107)), ((Сирии, 78),...","[(Сирии, Румийлан, , к окрестностям города ), ...","[(Сирии, Румийлан, []), (Сирии, Khorasan, []),..."
4,"[((Алиреза Резахах (Alireza Rezakhah, 7), (Рез...",[],[]
5,"[((Пентагона, 23), (США, 39)), ((Пентагона, 23...","[(Пентагона, США, , цель ), (Пентагона, ИГИЛ, ...","[(Пентагона, США, []), (Пентагона, ИГИЛ, [1]),..."
6,"[((Сирии, 103), (Дамаске, 9)), ((Сирии, 103), ...","[(Дамаске, Сирии, действия Вашингтона резко о...","[(Дамаске, Сирии, [-1, -1]), (Дамаске, Вашингт..."
7,"[((Москва, 43), (Дамаском, 74)), ((Москва, 43)...","[(Москва, Дамаском, , где «несогласованные с )...","[(Москва, Дамаском, []), (Москва, Белого дома,..."
8,"[((Сирии, 71), (США, 157)), ((Сирии, 71), (Дам...","[(Сирии, США, , действия Вашингтона вызвали ст...","[(Сирии, США, [0, 0]), (Сирии, Вашингтона, [])..."
9,"[((ИГ, 224), (Сирия, 37)), ((ИГ, 224), (Обамы,...","[(Сирия, ИГ, оказалась одним из самых проблем...","[(Сирия, ИГ, [-1, 0, -1, -1, -1, -1, -1]), (Си..."
