<a href="https://colab.research.google.com/github/DmitryKutsev/NIS_SentiFrame/blob/master/virt_udpipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Install the spaCy wrapper around UDPipe; %%capture suppresses pip's output for this cell.
!pip install spacy_udpipe

In [2]:
%%capture
# Install pymorphy2 with its optional "fast" extra (output suppressed by %%capture).
!pip install pymorphy2[fast]
from pymorphy2 import MorphAnalyzer
# Single shared analyzer instance.
# NOTE(review): `morph` is not referenced in the cells visible here — confirm it is still needed.
morph = MorphAnalyzer()

In [3]:
import os
import unicodedata
import json
import spacy_udpipe
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-level experiment

For further exploration of the semantic axis method we make use of BERT embeddings.

Seed verbs remain the same as in the word-level experiments, but may be subject to change later.

1.   Seed sentences are constructed with regard to the arguments. For example, to test a sentence like 'Силовики вломились к журналисту', we compute the semantic axis as follows:

*   Replace the target verb with a seed verb.
*   Adjust the grammatical case of the arguments if the seed verb requires it.
*   Repeat for each seed verb to construct seed sentences.

2.   **TODO:** describe the next step of the procedure.



# BERT-as-service

Check out [this issue](https://github.com/hanxiao/bert-as-service/issues/380) and "make sure Colab is using Tensorflow 1.x, because bert-serving-start doesn't currently work with TF 2.1 and nohup hides the output of the command failing".

Also make sure you're using a GPU accelerator.

In [4]:
# Colab magic: select TensorFlow 1.x, which the notes above say bert-serving-start requires.
%tensorflow_version 1.x
# Uncomment to check which TensorFlow version is active:
# import tensorflow as tf
# print (tf.__version__)

TensorFlow 1.x selected.


In [5]:
%%capture
# Install (or upgrade) the BERT server with its "http" extra; pip output is suppressed by %%capture.
!pip install -U bert-serving-server[http]
!pip install bert-serving-client  # client, independent of `bert-serving-server`

In [6]:
%%capture
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip /content/multi_cased_L-12_H-768_A-12.zip

In [7]:
# Launch the BERT server in the background (nohup ... &). Both stdout and stderr are
# redirected to out.file, so inspect that file if the server does not come up.
!nohup bert-serving-start -model_dir=./multi_cased_L-12_H-768_A-12 > out.file 2>&1 &

In [8]:
from bert_serving.client import BertClient
# Client handle used by later cells to request sentence encodings from the running server.
bc = BertClient()

In [12]:
# Smoke test: encode three sentences and show the length of the first returned vector.
encoded_test = bc.encode([
    'First do it',
    'then do it right',
    'then do it better',
])
len(encoded_test[0])

768

# UDPipe

In [None]:
# Russian UDPipe pipeline; text2ud() below reads this module-level name.
syntagrus = spacy_udpipe.load("ru")

# Optional inputs, left commented out.
# NOTE(review): text2ud() also reads a module-level `verbs`, whose only visible
# definition is the commented-out line below — define it before calling text2ud().
# with open('16.txt') as f:
#     text = f.read()

# verbs = list(pd.read_csv('cross_seminar_task.csv', sep='\t')['verb'])


def text2ud(text, window=10, out_path='result.json'):
    """Collect the syntactic dependents of the verbs of interest and dump them to JSON.

    ``text`` is parsed with the module-level ``syntagrus`` pipeline. For every
    token whose lemma is in the module-level ``verbs`` collection, the tokens
    at most ``window`` positions away on either side whose head is that very
    token are recorded as ``[text, lemma, pos, dep]``: left-hand neighbours
    first (nearest first), then right-hand neighbours in text order.

    Args:
        text: raw text to parse.
        window: number of tokens inspected on each side of the verb.
        out_path: path of the UTF-8 JSON file the result is written to.

    The file receives a list with one ``{verb_lemma: [dependents...]}`` dict per
    matched token.
    """
    doc = syntagrus(text)
    doc_len = len(doc)
    udpiped = []

    for i, token in enumerate(doc):
        if token.lemma_ not in verbs:
            continue

        # Clamp the window to the document instead of skipping verbs that sit
        # near the start or end of the text.
        first = max(0, i - window)
        last = min(doc_len, i + window + 1)
        neighbours = list(range(i - 1, first - 1, -1)) + list(range(i + 1, last))

        dependents = []
        for j in neighbours:
            t = doc[j]
            # Compare the head by position, not by lemma: another verb with the
            # same lemma inside the window must not lend its dependents to this one.
            # The verb itself is never in `neighbours`, so a root verb (which spaCy
            # marks as its own head) does not list itself.
            if t.head.i == token.i:
                dependents.append([t.text, t.lemma_, t.pos_, t.dep_])
        udpiped.append({token.lemma_: dependents})

    # ensure_ascii=False writes non-ASCII text verbatim, so pin the file encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(udpiped, f, ensure_ascii=False, indent=4)

# text2ud(text[:1000])

In [53]:
class SemanticAxis():
    """A semantic axis spanned by two seed sets, against which targets are scored.

    The axis is the difference between the mean encoding of ``seed1`` and the
    mean encoding of ``seed0``. Each target text is scored by the cosine
    similarity between its encoding and that axis.

    Attributes (all plain instance attributes):
        seed0, seed1: sets of seed texts for the two poles of the axis.
        targets: set of texts to score.
        bert_client: client used by the last ``compute_bert_axis`` call.
        axis_vector: ``mean(seed1) - mean(seed0)``, or None before computing.
        axis_similarities: ``{target: cosine similarity}``, or None before computing.
    """

    def __init__(self):
        self.seed0 = set()
        self.seed1 = set()
        self.targets = set()
        self.bert_client = None
        self.axis_vector = None
        self.axis_similarities = None

    @staticmethod
    def _as_set(items):
        """Return ``items`` as a set; a bare string counts as one item, not as characters."""
        if isinstance(items, str):
            return {items}
        return set(items)

    def add_seed(self, seed, seed_id: int):
        """Replace a seed set: ``seed1`` if ``seed_id`` is truthy, otherwise ``seed0``."""
        if seed_id:
            self.seed1 = self._as_set(seed)
        else:
            self.seed0 = self._as_set(seed)

    def add2seed(self, seed, seed_id: int):
        """Add items to a seed set: ``seed1`` if ``seed_id`` is truthy, otherwise ``seed0``."""
        if seed_id:
            self.seed1.update(self._as_set(seed))
        else:
            self.seed0.update(self._as_set(seed))

    def flush_seed(self, seed_id=None, flush_both_seeds=True):
        """Empty one seed set, or both when ``seed_id`` is None.

        ``flush_both_seeds`` is accepted for backward compatibility and ignored.
        """
        if seed_id is None:
            self.seed0, self.seed1 = set(), set()
        elif seed_id:
            self.seed1 = set()
        else:
            self.seed0 = set()

    def add_targets(self, target):
        """Replace the targets with the items of ``target``."""
        # Always store a set so that add2targets() works whatever iterable was passed.
        self.targets = self._as_set(target)

    def add2targets(self, target):
        """Add the items of ``target`` to the current targets."""
        self.targets = set(self.targets) | self._as_set(target)

    def flush_targets(self):
        """Remove all targets."""
        self.targets = set()

    def compute_bert_axis(self, bert_client):
        """Compute ``axis_vector`` and ``axis_similarities`` using ``bert_client.encode``.

        Args:
            bert_client: object whose ``encode(list_of_texts)`` returns one
                vector per text (rows of a 2-D array).

        Raises:
            AssertionError: if either seed set is empty.

        With no targets, ``axis_similarities`` is set to an empty dict.
        """
        assert len(self.seed0) > 0, 'Seed0 set is empty.'
        assert len(self.seed1) > 0, 'Seed1 set is empty.'
        self.bert_client = bert_client

        seed_means = [
            np.asarray(self.bert_client.encode(list(seed))).mean(axis=0)
            for seed in (self.seed0, self.seed1)
        ]
        self.axis_vector = seed_means[1] - seed_means[0]

        # Freeze the targets into a list once: a set cannot be indexed, and the
        # same ordering must be used for encoding and for labelling the scores.
        targets = list(self.targets)
        if not targets:
            self.axis_similarities = {}
            return

        target_vectors = self.bert_client.encode(targets)
        # One call scores every target row against the single axis row.
        similarities = cosine_similarity(
            np.atleast_2d(target_vectors),
            np.atleast_2d(self.axis_vector)
        ).ravel()
        self.axis_similarities = {
            target: float(score) for target, score in zip(targets, similarities)
        }

In [44]:
# Fresh axis object: no seeds, no targets, nothing computed yet.
sa = SemanticAxis()

In [None]:
# NOTE(review): `list2check` is not defined in any cell visible here; it must be
# defined (presumably as a collection of target sentences — confirm) before this cell runs.
sa.add_targets(list2check)
# Seed 0 and seed 1 are the two poles; the axis is computed as seed1 minus seed0.
sa.add_seed(['разрушать'], 0)
sa.add_seed(['ценить'], 1)

In [None]:
# Score every target against the axis, then list the targets from lowest to highest similarity.
sa.compute_bert_axis(bc)
df = pd.DataFrame(
    list(sa.axis_similarities.items()),
    columns=['target', 'similarity'],
)
df.sort_values(by='similarity')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# NOTE(review): the archive argument is an empty string, so unrar has nothing to
# extract — fill in the path to the .rar archive before running this line.
!unrar x ""

In [91]:
def ovd_loader(path='OVD-Info/2019'):
    """Yield ``(filepath, text)`` for every file found under ``path/<month>/``.

    Args:
        path: corpus root whose sub-directories (one per month) hold the text
            files. Defaults to the original hard-coded location.

    Yields:
        Tuples of the '/'-joined file path and the file's UTF-8 decoded content,
        in sorted month and filename order so repeated runs agree.

    Entries of ``path`` that are not directories, and entries of a month
    directory that are not regular files, are skipped.
    """
    for month in sorted(os.listdir(path)):
        month_dir = '{}/{}'.format(path, month)
        if not os.path.isdir(month_dir):
            continue
        for filename in sorted(os.listdir(month_dir)):
            filepath = '{}/{}'.format(month_dir, filename)
            if not os.path.isfile(filepath):
                continue
            # Read and close the file before yielding, so no handle stays open
            # while the consumer is working (or if it abandons the generator).
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            yield filepath, text

In [95]:
# Normalize with NFKC rather than NFKD: both fold compatibility characters
# (e.g. non-breaking spaces), but NFKD also splits Cyrillic letters such as
# 'й' and 'ё' into a base letter plus a combining mark, which breaks lemma and
# dictionary lookups downstream. NFKC keeps them as single composed characters.
d = {filepath: unicodedata.normalize("NFKC", text)
     for filepath, text in ovd_loader()}
# One row per document; the 'url' column holds the file path yielded by ovd_loader().
texts = pd.DataFrame({'url': list(d.keys()),
                      'text': list(d.values())})