<a href="https://colab.research.google.com/github/DmitryKutsev/NIS_SentiFrame/blob/master/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!pip install pymorphy2[fast]
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()

In [0]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_distances

Check out [this issue](https://github.com/hanxiao/bert-as-service/issues/380) and "make sure Colab is using Tensorflow 1.x, because bert-serving-start doesn't currently work with TF 2.1 and nohup hides the output of the command failing"

In [0]:
# import tensorflow as tf
# print (tf.__version__)

In [0]:
%%capture
!pip install -U bert-serving-server[http]
!pip install bert-serving-client  # client, independent of `bert-serving-server`

In [0]:
%%capture
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip /content/multi_cased_L-12_H-768_A-12.zip

In [0]:
!nohup bert-serving-start -model_dir=./multi_cased_L-12_H-768_A-12 > out.file 2>&1 &

In [6]:
from bert_serving.client import BertClient
bc = BertClient()
encoded_test = bc.encode(['First do it', 'then do it right', 'then do it better'])
encoded_test

array([[ 0.49155593,  0.08795979,  0.08263359, ...,  1.0980439 ,
         0.41126642, -0.25396958],
       [-0.07413451, -0.2278353 , -0.08978202, ...,  1.5093102 ,
         1.3512001 , -0.03158369],
       [-0.2657526 ,  0.1913553 , -0.3561356 , ...,  1.3962169 ,
         1.4187483 ,  0.12651931]], dtype=float32)

In [0]:
encoded_test = bc.encode(['Вася любит Машу'])
encoded_test

In [0]:
# arg0, arg1, verb = 'маша петя любить'.split
def a0_a1_clause_maker(verb, arg0, arg1, arg0_case='nomn', arg1_case='accs', cap=True):

  '''Генерирует простые предложения вида 'arg0 любит arg1'.
  Параметры:
  verb              глагол в любой форме;
  arg0_case='nomn'  тег падежа подлежащего;
  arg1_case='accs'  тег падежа дополнения;
  arg0='Маша'       подлежащее;
  arg1='Петя'       дополнение
  cap=True        капитализация аргументов-имен собственных.
  Допустимые теги падежей лежат тут: http://opencorpora.org/dict.php?act=gram.'''

  inflected_arg0 = morph.parse(arg0)[0].inflect({'sing', 'nomn'}).word
  inflected_verb = morph.parse(verb)[0].inflect({'sing', '3per', 'pres', 'indc'}).word
  inflected_arg1 = morph.parse(arg1)[0].inflect({'sing', 'accs'}).word

  if (cap == True) and ('Name' in morph.parse(arg1)[0].tag):
    inflected_arg1 = inflected_arg1.capitalize()
  if (cap == True) and ('Name' in morph.parse(arg0)[0].tag):
    inflected_arg0 = inflected_arg0.capitalize()

  sentence = '{} {} {}'.format(inflected_arg0, inflected_verb, inflected_arg1)
  return sentence

# a0_a1_clause_maker('любить', 'Маша', 'Петя')
a0_a1_clause_maker('любить', 'Петя', 'Маша')


  

'Петя любит Машу'

In [0]:
# Пете нравится, что
def clause_wrapper(verb, arg0, arg1, 
                   main_subject=1, main_verb='нравиться',
                   arg0_case='nomn', arg1_case='accs', 
                   cap=True):

  '''Генерирует сложноподчинённые предложения вида 'arg нравится, что arg0 любит arg1'.
  Параметры:
  verb                    глагол-предикат подчинённой клаузы;
  arg0_case='nomn'        тег падежа подлежащего подчинённой клаузы;
  arg1_case='accs'        тег падежа дополнения подчинённой клаузы;
  main_subject            int, номер аргумента-подлежащего главной клаузы (по умолчанию дополнение подчинённой клаузы); 
  main_verb='нравиться',  глагол-предикат главной клаузы;
  arg0='Маша'             подлежащее подчинённой клаузы;
  arg1='Петя'             дополнение подчинённой клаузы;
  cap=True                капитализация аргументов-имен собственных.
  Допустимые теги падежей лежат тут: http://opencorpora.org/dict.php?act=gram.'''

  if main_subject:
    inflected_main_subject = morph.parse(arg1)[0].inflect({'sing', 'datv'}).word.capitalize()
  else:
    inflected_main_subject = morph.parse(arg0)[0].inflect({'sing', 'datv'}).word
  
  if (cap == True) and ('Name' in morph.parse(inflected_main_subject.)[0].tag):
    inflected_main_subject = inflected_main_subject.capitalize()


  inflected_main_verb = morph.parse(main_verb)[0].inflect({'sing', '3per', 'pres', 'indc'}).word
  subordinate_clause = a0_a1_clause_maker(verb, arg0, arg1, arg0_case, arg1_case, cap)
  main_clause = '{} {}'.format (inflected_main_subject, inflected_main_verb)
  sentence = '{}, что {}'.format(main_clause, subordinate_clause)

  return sentence

clause_wrapper('любить', 'Валя', 'Маша', 0)


'вале нравится, что Валя любит Машу'

In [0]:
# morph.parse('Пете')[0]
'Name' in morph.parse('Вале')[0].tag

False

In [0]:
bc.server_status

# Определение удалённых сидов

In [0]:
def make_seed_embedding(seed_dict, model):
  '''
  Принимает на вход список, на выходе дает средний вектор всех слов из этого списка
  '''
  summ = 0
  if model == skipgram_model:
    for verb in seed_dict:
      summ = summ + model[verb + '_V']
    vector = summ/len(seed_dict)
  elif model == bc:
    for verb in seed_dict:
      summ = summ + bc.encode([verb])
    vector = summ/len(seed_dict)
  else:
    for verb in seed_dict:
      summ = summ + model.get_vector(verb)
    vector = summ/len(seed_dict)
  return vector

# seed_one_skipgram = make_seed_embedding(seed_one_dict, skipgram_model)
# seed_one_fasttext = make_seed_embedding(seed_one_dict, fasttext_model)

In [17]:
seed1pos = ['одобрять', 'хвалить', 'поощрять', 
            'обожать', 'восхищаться', 'восторгаться', 
            'нравиться', 'ценить', 'гордиться',
            'хвалить', 'нахваливать', 'превозносить']
seed1neg = ['порицать', 'осуждать', 'негодовать',
            'обвинять', 'наказывать', 
            'ненавидеть', 'убивать', 'разрушать']

avg1pos = make_seed_embedding(seed1pos, bc)
avg1neg = make_seed_embedding(seed1neg, bc)

cosine_distances(np.atleast_2d(avg1pos), np.atleast_2d(avg1pos))
# array([[0.49508154]], dtype=float32)
# cosine_distances(np.atleast_2d(avg2pos), np.atleast_2d(avg1neg))




array([[0.]], dtype=float32)

In [0]:
from itertools import chain
from itertools import combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

In [0]:
neg_seeds_list = []
for i in powerset(seed1neg):
  if i:
    neg_seeds_list.append(list(i))
# neg_seeds_list

pos_seeds_list = []
for i in powerset(seed1pos):
  if i:
    pos_seeds_list.append(list(i))
# pos_seeds_list

In [0]:
# pseeds_nseeds = np.zeros(shape=(len(pos_seeds_list),len(neg_seeds_list)))

In [0]:
pseeds_nseeds.shape

(4095, 255)

In [0]:
for i, ps in enumerate (pos_seeds_list):
  # print (i, ps)
  for j, ns in enumerate (neg_seeds_list):
    if pseeds_nseeds[i, j] == 0:
      seed1pos = pos_seeds_list[i]
      seed1neg = neg_seeds_list[j]
      avg1pos = make_seed_embedding(seed1pos, bc)
      avg1neg = make_seed_embedding(seed1neg, bc)
      distance = cosine_distances(np.atleast_2d(avg1pos), np.atleast_2d(avg1neg))
      pseeds_nseeds[i, j] = distance
      if (i%100 == 0) and (j==1):
        print(i)
%time
# np.unravel_index(np.argmin(pseeds_nseeds, axis=None), pseeds_nseeds.shape)

In [0]:
# np.save('pseeds_nseeds_matrix', pseeds_nseeds)
pseeds_nseeds = np.load('pseeds_nseeds_matrix.npy')

In [11]:
np.unravel_index(np.argmin(pseeds_nseeds, axis=None), pseeds_nseeds.shape)


(1209, 182)

In [12]:
np.unravel_index(np.argmax(pseeds_nseeds, axis=None), pseeds_nseeds.shape)

(10, 1)

## Проверка расстояния до сидов

In [0]:
%%capture
# at least two mutual negative attitudes
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/neg_a0_a1_mutual.json
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/neg_a0_a2_mutual.json
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/neg_a1_a2_mutual.json

# at least two mutual positive attitudes
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/pos_a0_a1_mutual.json
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/pos_a0_a2_mutual.json
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/pos_a1_a2_mutual.json

# at least two opposite attitudes (only a0/a1 show opposite attitudes toward each other)
!wget https://raw.githubusercontent.com/DmitryKutsev/NIS_SentiFrame/master/grouped_verb_lists/opp_a0_a1_candidates_l.json

In [0]:
def verb_distance_df_bert(verb_list, model,
                  seed1=['ценить'],
                  seed2=['убивать']):
  df = pd.DataFrame({'variant':verb_list})
  
  distance_to_seed1 = df['variant'].apply(
      lambda x: cosine_distances(
          np.atleast_2d(bc.encode([x])), 
          np.atleast_2d(make_seed_embedding(seed1, bc))).item())
  distance_to_seed2 = df['variant'].apply(
      lambda x: cosine_distances(
          np.atleast_2d(bc.encode([x])), 
          np.atleast_2d(make_seed_embedding(seed2, bc))).item())

  df['distance_to_seed1_bert'] = distance_to_seed1
  df['distance_to_seed2_bert'] = distance_to_seed2

  df['seed1'] = ''.join(seed1)
  df['seed2'] = ''.join(seed2)

  df['attributed_polarity'] = np.where(
      df['distance_to_seed1_bert'] <= df['distance_to_seed2_bert'], 
      df['seed1'], df['seed2'])
  
  return df


In [0]:
with open("neg_a0_a1_mutual.json", "r", encoding="utf-8") as f:
  list2check = json.load(f, encoding="utf-8")

  # list2check = ['порицать', 'осуждать', 'негодовать',
  #           'обвинять', 'наказывать', 
  #           'ненавидеть', 'убивать', 'разрушать', 'одобрять', 'хвалить', 'поощрять', 
  #           'обожать', 'восхищаться', 'восторгаться', 
  #           'нравиться', 'ценить', 'гордиться',
  #           'хвалить', 'нахваливать', 'превозносить']

df = verb_distance_df_bert(list2check, bc, seed1=['поощрять'], seed2=['наказывать'])



In [89]:
len(df[df['attributed_polarity']=='наказывать'])

803

In [91]:
len(df[df['attributed_polarity']=='поощрять'])

219

In [93]:
with open("pos_a0_a1_mutual.json", "r", encoding="utf-8") as f:
  list2check = json.load(f, encoding="utf-8")

with open("pos_a0_a2_mutual.json", "r", encoding="utf-8") as f:
  list2check = list2check + json.load(f, encoding="utf-8")

with open("pos_a1_a2_mutual.json", "r", encoding="utf-8") as f:
  list2check = list2check + json.load(f, encoding="utf-8")

df = verb_distance_df_bert(list2check, bc, seed1=['поощрять'], seed2=['наказывать'])
print (len(df[df['attributed_polarity']=='наказывать']))
print(len(df[df['attributed_polarity']=='поощрять']))

300
171
