In [37]:
import os
import numpy as np
from tqdm import tqdm
import sklearn.preprocessing
import pandas as pd
import warnings

In [5]:
# Ids of every semantic class that occurs in the hierarchy file,
# either as a class of its own or as the parent of another class.
SC_ids = set()
path = "SampleCorpus_new/SemanticHierarchy_ID.txt"
with open(path) as f:
    for row in f:
        sc_id, parrent_id, name = row.split()
        if name == "Name":
            # Header row: nothing to record.
            continue
        SC_ids.add(int(sc_id))
        if parrent_id == "NULL":
            # Class without a parent.
            continue
        SC_ids.add(int(parrent_id))

In [7]:
# Semantic hierarchy: whitespace-separated rows "<class id> <parent id> <name>".
SC_parrents = dict()    # semantic class id -> parent semantic class id
SC_name_to_id = dict()  # semantic class name -> semantic class id
path = "SampleCorpus_new/SemanticHierarchy_ID.txt"
with open(path) as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        sc_id, parrent_sc, sc_name = line.split()
        if sc_name == "Name":
            # Header row.
            continue
        SC_name_to_id[sc_name] = int(sc_id)
        if parrent_sc != "NULL":
            SC_parrents[int(sc_id)] = int(parrent_sc)

# Lexical class -> semantic class: tab-separated rows
# "<lexical id> <lexical name> <semantic id> <semantic name>".
LC_to_SC = dict()  # lexical class id -> semantic class id
path = "SampleCorpus_new/SemanticHierarchy_lexical_vs_semantic.txt"
with open(path, encoding="utf8") as f:
    next(f, None)  # the first row is skipped (treated as the header)
    for line in f:
        if not line.strip():
            continue
        lc_id, _lc_name, sc_id, _sc_name = line.rstrip("\n").split("\t")
        LC_to_SC[int(lc_id)] = int(sc_id)

Функция для загрузки предобученных эмбеддингов семантических классов.

In [8]:
def load_embeddings(filename):
    """Load embeddings from a UTF-8 text file into a dictionary.

    The first line of the file is read and discarded. Every following
    non-empty line is split on whitespace: the first field is the integer id,
    the remaining fields are the float components of the vector.

    The file is read to the end instead of for a hard-coded number of rows,
    so files of any length are supported.

    Parameters
    ----------
    filename : str
        Path to the embeddings file.

    Returns
    -------
    dict
        Maps ``int`` id -> 1-D ``np.ndarray`` of floats.

    Raises
    ------
    ValueError
        If a vector component is not a float, or the id is not an integer
        even after dropping its first character.
    """
    emb = dict()
    with open(filename, encoding="utf-8") as f:
        f.readline()  # skip the first line
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            # Parse the vector outside the id-recovery ``try`` so that a bad
            # float is reported as such and not mistaken for a bad id.
            vector = np.array(list(map(float, fields[1:])))
            try:
                key = int(fields[0])
            except ValueError:
                # Some ids carry one stray leading character (the original
                # notebook hit this on one line); drop it and retry.
                key = int(fields[0][1:])
            emb[key] = vector
    return emb

In [9]:
# Embeddings keyed by integer id, as returned by load_embeddings.
sem_class = load_embeddings("SampleCorpus_new/embeddings.txt")

In [27]:
def get_sc_emb(sc_id, sc_embeddings, lc_to_sc=None, sc_parents=None):
    """Return the embedding of a class, walking up the hierarchy if needed.

    If ``sc_id`` has no embedding, the id is replaced first through
    ``lc_to_sc`` and otherwise through ``sc_parents``, until an id with an
    embedding is found. If the walk reaches an id with no further mapping, or
    revisits an id (a cycle in the maps), a zero vector is returned.

    Parameters
    ----------
    sc_id : int
        Id to look up.
    sc_embeddings : dict
        Maps id -> embedding vector.
    lc_to_sc : dict, optional
        Lexical class id -> semantic class id. Defaults to the module-level
        ``LC_to_SC``.
    sc_parents : dict, optional
        Semantic class id -> parent id. Defaults to the module-level
        ``SC_parrents``.

    Returns
    -------
    np.ndarray
        The embedding found, or zeros with the same length as the vectors in
        ``sc_embeddings`` (200 when ``sc_embeddings`` is empty).
    """
    if lc_to_sc is None:
        lc_to_sc = LC_to_SC
    if sc_parents is None:
        sc_parents = SC_parrents

    visited = set()  # ids already tried; protects against cycles in the maps
    while sc_id not in sc_embeddings:
        if sc_id in visited:
            break
        visited.add(sc_id)
        if sc_id in lc_to_sc:
            sc_id = lc_to_sc[sc_id]
        elif sc_id in sc_parents:
            sc_id = sc_parents[sc_id]
        else:
            break
    else:
        # The loop ended because sc_id has an embedding.
        return sc_embeddings[sc_id]

    dim = len(next(iter(sc_embeddings.values()))) if sc_embeddings else 200
    return np.zeros(dim)

In [19]:
# Corpus sub-directories, one file per book in each.
path_mentions = os.sep.join(["SampleCorpus_new", "Mentions"])
path_chains = os.sep.join(["SampleCorpus_new", "Chains"])
path_morph = os.sep.join(["SampleCorpus_new", "Morph"])
path_sem_synt = os.sep.join(["SampleCorpus_new", "Sem_synt"])

# Part-of-speech tag -> index of that tag in the POS feature vector.
post_ = {
    tag: index
    for index, tag in enumerate((
        "NOUN",  # noun
        "ADJF",  # adjective (full form)
        "ADJS",  # adjective (short form)
        "COMP",  # comparative
        "VERB",  # verb (finite form)
        "INFN",  # verb (infinitive)
        "PRTF",  # participle (full form)
        "PRTS",  # participle (short form)
        "GRND",  # gerund (adverbial participle)
        "NUMR",  # numeral
        "ADVB",  # adverb
        "NPRO",  # pronoun (noun-like)
        "PRED",  # predicative
        "PREP",  # preposition
        "CONJ",  # conjunction
        "PRCL",  # particle
        "INTJ",  # interjection
        "PNCT",  # punctuation mark
        "ROMN",  # Roman numeral
    ))
}

# Animacy tag -> code (anim = animate, inan = inanimate).
anim_ = dict(anim=1, inan=0)

# Gender tag -> code (masculine, feminine, neuter, common gender).
gend_ = dict(zip(("masc", "femn", "neut", "ms-f"), range(4)))

# Number tag -> code (sing = singular, plur = plural).
num_ = dict(sing=1, plur=0)

def get_one_hot_encoded_vector(i, N):
    """Return a length-``N`` vector counting how often each label occurs.

    Despite the name, the result is the sum of the one-hot rows of all labels
    in ``i``: position ``k`` holds the number of occurrences of label ``k``.

    Implemented with numpy only, so the result always has length ``N``
    (a two-class ``LabelBinarizer`` yields a single column) and no encoder is
    refitted on every call.

    Parameters
    ----------
    i : sequence of int
        Labels; values outside ``range(N)`` are ignored.
    N : int
        Number of distinct labels, i.e. the length of the result.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(N,)``; all zeros when ``i`` is empty.
    """
    labels = np.asarray(i, dtype=int).ravel()
    # Labels outside range(N) contribute nothing to the counts.
    known = labels[(labels >= 0) & (labels < N)]
    return np.bincount(known, minlength=N).astype(int)

In [96]:
def _load_table(path, dtype, **loadtxt_kwargs):
    """Read a tab-separated file with ``np.loadtxt`` and return it as rows.

    ``np.loadtxt`` collapses a file with a single row into a 1-D array; such
    a result is reshaped into one 2-D row so callers can always iterate over
    rows. An empty result is returned unchanged.
    NOTE(review): a 1-D result is assumed to be one multi-column row, which
    holds for the multi-column corpus files read here.
    """
    table = np.loadtxt(path, dtype=dtype, delimiter="\t", **loadtxt_kwargs)
    if table.ndim == 1 and table.size > 0:
        table = table.reshape(1, -1)
    return table


def _parse_offset(raw):
    """Parse an offset field, dropping one stray leading character if needed."""
    try:
        return int(raw)
    except ValueError:
        return int(raw[1:])


def merge_all_information(book_id):
    """Build one feature row per mention of a book.

    Reads the four per-book files (mentions, chains, morphology,
    semantics/syntax) named ``book_id`` from the module-level ``path_*``
    directories and merges them.

    Parameters
    ----------
    book_id : str
        File name of the book, e.g. ``"book_2373.txt"``.

    Returns
    -------
    dict
        Maps mention id -> list with, in order:
        offset, length, label, text, number of tokens,
        200 values of the mean embedding, 200 values of the element-wise
        minimum embedding, "contains a lemma seen for the first time" flag,
        "contains an animate token" flag, and ``len(post_)`` part-of-speech
        counts. The dict is empty if any of the four files is empty.

        The label is 1 for the first mention encountered in each chain and
        for mentions that belong to no chain, and 0 for later mentions of a
        chain. NOTE(review): the notebook calls this flag "isAnaph"; confirm
        that this polarity is the intended one.

    Raises
    ------
    ValueError
        If a mention starts at an offset that is not the offset of any token
        in the morphology file.
    """
    emb_dim = 200  # length of the zero vectors used when no embedding is found

    # Locate the four files that describe this book.
    mentions_file = path_mentions + os.sep + book_id
    chains_file = path_chains + os.sep + book_id
    morph_file = path_morph + os.sep + book_id
    sem_file = path_sem_synt + os.sep + book_id

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Mention Id -> Mention Offset -> Mention Length
        dataset_mentions = _load_table(mentions_file, int)
        # Mention Id -> Mention Offset -> Mention Length -> Chain Id
        dataset_chains = _load_table(chains_file, int)
        # Token Id -> Offset -> Length -> Token -> Lemma -> Morphology Tags
        dataset_morph = _load_table(morph_file, str, encoding="utf-8")
        # Offset -> Token -> Parent Offset -> Semantic Class Id -> SyntParadigm Id
        dataset_sem_synt = _load_table(sem_file, str, encoding="utf-8")

    if len(dataset_mentions) == 0 or len(dataset_chains) == 0 \
        or len(dataset_morph) == 0 or len(dataset_sem_synt) == 0:
        return dict()

    dataset = dict()

    # Mention position: [offset, length].
    for ment in dataset_mentions:
        id_ment = ment[0]
        dataset[id_ment] = [ment[1], ment[2]]
        assert len(dataset[id_ment]) == 2, dataset[id_ment]

    # Chain label: 1 for the first mention seen in a chain, 0 for later ones.
    seen_chains = set()
    ment_in_chains = set()
    for ment in dataset_chains:
        id_ment = int(ment[0])
        chain = int(ment[3])
        assert id_ment not in ment_in_chains, ment
        ment_in_chains.add(id_ment)
        first_in_chain = chain not in seen_chains
        seen_chains.add(chain)
        dataset[id_ment].append(int(first_in_chain))
        assert len(dataset[id_ment]) == 3, dataset[id_ment]

    # Mentions that belong to no chain get label 1 as well.
    for ment in dataset.keys():
        if len(dataset[ment]) == 2:
            dataset[ment].append(1)
        assert len(dataset[ment]) == 3, dataset[ment]

    # Token offset -> [token length, parent offset, semantic class id, synt paradigm id].
    sem_synt_info = dict()
    for token in dataset_sem_synt:
        offset = _parse_offset(token[0])
        length = len(token[1])
        lc = int(token[3])
        synt_par = int(token[4])
        parent_offset = -1 if token[2] == "NA" else int(token[2])
        # Map a lexical class to its semantic class; ids without a mapping
        # are kept as they are.
        sem_synt_info[offset] = [length, parent_offset, LC_to_SC.get(lc, lc), synt_par]

    offsets = []  # offsets of all tokens, in file order

    # Token offset -> [lemma seen for the first time (0/1), set of tags, token text].
    seen_lemmas = set()
    morph_info = dict()
    for token_info in dataset_morph:
        offset = _parse_offset(token_info[1])
        offsets.append(offset)
        token = token_info[3]
        lemma = token_info[4]
        lemma_is_new = lemma not in seen_lemmas
        seen_lemmas.add(lemma)
        # The tag field holds one or two space-separated groups of
        # comma-separated tags; both groups are merged into one set.
        tag_groups = token_info[5].split()
        set_tags = set(tag_groups[0].split(","))
        if len(tag_groups) > 1:
            set_tags |= set(tag_groups[1].split(","))
        morph_info[offset] = [int(lemma_is_new), set_tags, token]

    # Offset -> index of the first token with that offset, built once so that
    # locating a mention does not rescan the whole token list.
    first_token_index = dict()
    for index, token_offset in enumerate(offsets):
        first_token_index.setdefault(token_offset, index)

    for id_ment in dataset:
        offset = dataset[id_ment][0]
        length = dataset[id_ment][1]
        if offset not in first_token_index:
            raise ValueError(
                "mention %s starts at offset %s, which is not a token offset"
                % (id_ment, offset)
            )
        token_ind = first_token_index[offset]
        tokens_in_ment = 0
        embeddings = []  # one embedding per token that has semantic info
        pos_tags = []
        anim = False
        is_first = False
        text = ""
        cur_offset = offset
        # Walk over the tokens whose offsets fall inside the mention span.
        while cur_offset < offset + length:
            tokens_in_ment += 1
            token_ind += 1

            # Semantic features.
            if cur_offset in sem_synt_info:
                sem_class_id = sem_synt_info[cur_offset][2]
                embeddings.append(get_sc_emb(sem_class_id, sem_class))

            # Morphological features.
            if cur_offset in morph_info:
                lemma_is_new, tag_set, token_text = morph_info[cur_offset]
                # True as soon as one token's lemma is new to the text.
                is_first = is_first or bool(lemma_is_new)
                pos_tags.extend(list(tag_set & post_.keys()))
                if "anim" in tag_set:
                    anim = True
                text += token_text + " "

            if token_ind < len(offsets):
                cur_offset = offsets[token_ind]
            else:
                break

        row = dataset[id_ment]
        row.append(text)
        row.append(tokens_in_ment)  # number of tokens in the mention
        if len(embeddings) > 0:
            stacked = np.array(embeddings)
            row.extend(np.mean(stacked, axis=0))  # mean embedding
            row.extend(np.min(stacked, axis=0))   # element-wise minimum embedding
        else:
            # No token of the mention has semantic info: report it and use zeros.
            print(id_ment, row)
            row.extend(np.zeros(emb_dim))
            row.extend(np.zeros(emb_dim))

        # At least one token's lemma occurs for the first time in the text.
        row.append(int(is_first))

        # The mention counts as animate if at least one token is animate.
        row.append(int(anim))

        # Part-of-speech composition of the mention as a count vector.
        row.extend(get_one_hot_encoded_vector([post_[tag] for tag in pos_tags], len(post_)))

    return dataset
            

In [97]:
# Feature rows (mention id -> list of values) for a single book.
book_2 = merge_all_information("book_2373.txt")

In [98]:
# Row layout produced by merge_all_information, in order:
# offset, length, isAnaph, text, tokens number, mean emb, min emb, is first, anim, post
# The dict is keyed by mention id, so transpose to get one row per mention.
set_2 = pd.DataFrame(data=book_2).T

In [99]:
# Display the single-book feature table.
set_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
1,0,59,1,"Святая Агнесса Чешская , Агнесса Пражская , Аг...",9,0.0399121,0.0786944,-0.00748343,0.0120404,0.00790357,...,0,0,0,0,0,0,0,0,2,0
2,66,18,1,Svatá Anežka Česká,3,0.029213,0.104865,-0.048407,0.105814,-0.055348,...,0,0,0,0,0,0,0,0,0,0
3,91,16,1,Agnes de Bohemia,3,0.029213,0.104865,-0.048407,0.105814,-0.055348,...,0,0,0,0,0,0,0,0,0,0
4,129,5,1,Прага,1,-0.07827,-0.025541,0.059696,0.143379,-0.056491,...,0,0,0,0,0,0,0,0,0,0
5,157,5,0,Прага,1,-0.07827,-0.025541,0.059696,0.143379,-0.056491,...,0,0,0,0,0,0,0,0,0,0
6,165,19,0,католическая святая,2,0.095834,0.0690175,0.0360595,0.150562,-0.0364355,...,0,0,0,0,0,0,0,0,0,0
7,186,20,0,монахиня - клариссинка,3,-0.006896,0.047552,0.0209755,0.0588765,-0.124202,...,0,0,0,0,0,0,0,0,1,0
8,208,38,0,дочь короля Чехии Пржемысла Оттокара I,6,0.0222158,0.0478268,0.0165817,0.0842833,-0.0567735,...,0,0,0,0,0,0,0,0,0,0
9,213,33,1,короля Чехии Пржемысла Оттокара I,5,0.018342,0.0545056,0.0113042,0.0901198,-0.0563928,...,0,0,0,0,0,0,0,0,0,0
10,220,5,1,Чехии,1,0.022239,0.073074,0.102743,0.11009,-0.028183,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Build the feature table for the whole corpus.
# Row layout, in order: offset, length, isAnaph, text, tokensNum, meanEmb,
# minEmb, isFirst, anim, post
files = os.listdir(path_mentions)
# Collect one frame per book and concatenate once: growing a DataFrame inside
# the loop copies it on every step, and DataFrame.append no longer exists in
# pandas 2.x.
frames = []
for item in tqdm(files):
    data = pd.DataFrame(merge_all_information(item)).T
    if not data.empty:
        frames.append(data)
# An empty corpus directory yields an empty table instead of an error.
dataset = pd.concat(frames) if frames else pd.DataFrame()


100%|██████████| 59/59 [00:03<00:00, 18.82it/s]


In [102]:
# Display the merged feature table for all books.
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
1,12,35,1,Самовольное занятие водного объекта,4,-0.0209123,0.0264983,0.0227517,-0.0139713,-0.0111455,...,0,0,0,0,0,0,0,0,0,0
2,32,15,1,водного объекта,2,0.0378345,0.026108,-0.0126995,-0.018707,0.029443,...,0,0,0,0,0,0,0,0,0,0
3,52,40,1,пользование им без разрешения ( лицензии ),7,0.0075812,0.0468998,-0.0143162,-0.0109116,0.002083,...,0,0,0,0,1,0,0,0,2,0
4,64,2,0,им,1,0.039599,0.027456,-0.005485,0.065839,0.072102,...,0,0,0,0,0,0,0,0,0,0
5,71,21,1,разрешения ( лицензии ),4,-0.019755,0.072928,-0.040782,0.026008,0.024289,...,0,0,0,0,0,0,0,0,2,0
6,71,10,1,разрешения,1,-0.019755,0.072928,-0.040782,0.026008,0.024289,...,0,0,0,0,0,0,0,0,0,0
7,83,8,1,лицензии,1,-0.019755,0.072928,-0.040782,0.026008,0.024289,...,0,0,0,0,0,0,0,0,0,0
8,94,49,0,Самовольное занятие водного объекта или его ча...,7,0.0181669,0.0204901,0.0111289,-0.0196514,-0.0449509,...,0,0,1,0,0,1,0,0,0,0
9,114,15,0,водного объекта,2,0.0378345,0.026108,-0.0126995,-0.018707,0.029443,...,0,0,0,0,0,0,0,0,0,0
10,134,3,0,его,1,0.139701,0.096367,0.054372,-0.140474,-0.072975,...,0,0,1,0,0,0,0,0,0,0


## Baseline

1) Будем считать упоминание анафорическим, если его лемма ранее не встречалась в тексте, и неанафорическим — в противном случае.

In [83]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

def first_baseline(dataset, label_col=None, pred_col=None):
    """Print recall, precision and F1 of the "new lemma" baseline.

    The baseline predicts the label directly from the "lemma seen for the
    first time" flag.

    Parameters
    ----------
    dataset : pd.DataFrame
        Feature table with one row per mention.
    label_col : column label, optional
        Column holding the true label. Defaults to ``"isAnaph"`` when such a
        column exists, otherwise to the positional column ``2``.
    pred_col : column label, optional
        Column holding the flag used as prediction. Defaults to ``"isFirst"``
        when such a column exists, otherwise to the positional column ``405``.

    Returns
    -------
    None
        The three scores are printed.
    """
    # Frames built from merge_all_information rows have integer column labels:
    # [offset, length, label, text, tokens, 200 mean, 200 min, is_first, ...],
    # which puts the label at 2 and the is_first flag at 5 + 2 * 200 = 405.
    if label_col is None:
        label_col = "isAnaph" if "isAnaph" in dataset.columns else 2
    if pred_col is None:
        pred_col = "isFirst" if "isFirst" in dataset.columns else 405

    y = list(dataset.loc[:, label_col])
    x = list(dataset.loc[:, pred_col])
    print("Recall", recall_score(y, x))
    print("Precision", precision_score(y, x))
    print("F1", f1_score(y, x))

In [84]:
# Score the baseline on the merged feature table.
first_baseline(dataset)

Recall 0.8654088050314466
Precision 0.7265047518479408
F1 0.7898966704936855


2) попробуем линейный классификатор 

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Column 2 holds the label; column 3 holds the mention text.
# Both are removed from the feature matrix.
y = list(dataset.loc[:, 2])
x = dataset.drop(columns=[2, 3])

In [105]:
# Display the feature matrix.
x

Unnamed: 0,0,1,4,5,6,7,8,9,10,11,...,416,417,418,419,420,421,422,423,424,425
1,12,35,4,-0.0209123,0.0264983,0.0227517,-0.0139713,-0.0111455,0.00555075,0.033358,...,0,0,0,0,0,0,0,0,0,0
2,32,15,2,0.0378345,0.026108,-0.0126995,-0.018707,0.029443,0.0573515,0.055976,...,0,0,0,0,0,0,0,0,0,0
3,52,40,7,0.0075812,0.0468998,-0.0143162,-0.0109116,0.002083,0.0297028,-0.0498822,...,0,0,0,0,1,0,0,0,2,0
4,64,2,1,0.039599,0.027456,-0.005485,0.065839,0.072102,0.087099,-0.020252,...,0,0,0,0,0,0,0,0,0,0
5,71,21,4,-0.019755,0.072928,-0.040782,0.026008,0.024289,-0.007385,-0.103148,...,0,0,0,0,0,0,0,0,2,0
6,71,10,1,-0.019755,0.072928,-0.040782,0.026008,0.024289,-0.007385,-0.103148,...,0,0,0,0,0,0,0,0,0,0
7,83,8,1,-0.019755,0.072928,-0.040782,0.026008,0.024289,-0.007385,-0.103148,...,0,0,0,0,0,0,0,0,0,0
8,94,49,7,0.0181669,0.0204901,0.0111289,-0.0196514,-0.0449509,0.0205824,0.0285623,...,0,0,1,0,0,1,0,0,0,0
9,114,15,2,0.0378345,0.026108,-0.0126995,-0.018707,0.029443,0.0573515,0.055976,...,0,0,0,0,0,0,0,0,0,0
10,134,3,1,0.139701,0.096367,0.054372,-0.140474,-0.072975,-0.025294,0.013431,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# Hold out part of the data for evaluation; the seed is fixed so that the
# split is reproducible. The features are cast to float because the merged
# table stores its columns with a generic object dtype.
X_train, X_test, y_train, y_test = train_test_split(
    x.astype(float), y, random_state=42
)
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Class probabilities for the held-out mentions, one row per mention.
pred = lr.predict_proba(X_test)