Тестируем полученные модели

In [None]:
import pandas as pd
import json
from valentine import valentine_match, valentine_metrics
from valentine.algorithms import Coma, Cupid, DistributionBased
import numpy as np
import os
import pickle

from time import strftime, localtime
import random

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
from torchview import draw_graph

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight

from data_utils_2 import build_tokenizer, build_embedding_matrix, Dataset

from models import AOA, AOA_2, AOA_3, AOA_4 , AOA_5, AOA_6

In [None]:
from create_datasets import get_data, get_data_f,get_data_2, get_data_and_ground

# Load the pickled `names` object; the evaluation loops in this notebook skip
# every folder contained in names[:i+1].
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('names.p', 'rb') as file:
    names = pickle.load(file)

def _test_model(data_loader, m, inputs_cols, data_test):
    with torch.no_grad():
        for t_batch, t_sample_batched in enumerate(data_loader):
            t_inputs = [t_sample_batched[col] for col in inputs_cols]
            t_targets = t_sample_batched['class_n']
            t_outputs = m(t_inputs)
            res = []
            for i in range(len(t_targets)):
                if torch.argmax(t_outputs[i], -1)==1:
                    res.append((data_test[i][2:4],t_outputs[i]))
                    

            n_correct = (torch.argmax(t_outputs, -1) == t_targets).sum().item()
            n_total = len(t_outputs)

    acc = n_correct / n_total
    return acc, res

def _test_on_ground_truth(ground_tr, predicted):
    answer = {}
    
    predicted_unique = {i[0][1]:0 for i in predicted}
    
    for truth in ground_tr:
        chosen_pr=''
        for pr in predicted:
            if truth[0]==pr[0][0] and  predicted_unique[pr[0][1]]<1:
                if type(answer.get(truth[0], 0)) ==int: 
                    answer[truth[0]] = (pr[0][1], pr[1])
                    chosen_pr=pr[0][1]
                else:
                    prob_1 = nn.functional.softmax(answer[truth[0]][1], dim=0)
                    prob_2 = nn.functional.softmax(pr[1], dim=0)
                    if prob_1[1]<prob_2[1]:
                        answer[truth[0]] = (pr[0][1], pr[1])
                        chosen_pr=pr[0][1]
        predicted_unique[chosen_pr]=predicted_unique.get(chosen_pr,0)+1
    total_tr = 0
    for i in ground_tr:
        try:
            if i == (i[0], answer[i[0]][0]):
                total_tr+=1
        except:
            pass
    
    return total_tr/len(ground_tr), answer
                    

def test_dl_models(df_and_ground, model):
    """Score *model* on one dataset pair against its ground truth.

    *df_and_ground* is indexed as (name1, name2, df1, df2, ground truth).
    The candidate rows produced by ``get_data`` are pickled to
    ``test_<name1>_<name2>.p``, wrapped in a ``Dataset`` (using the
    module-level ``tokenizer`` and ``opt``), served as one batch and then
    evaluated with ``_test_model`` and ``_test_on_ground_truth``.

    Returns whatever ``_test_on_ground_truth`` returns.
    """
    left_name, right_name, left_df, right_df, truth = (
        df_and_ground[k] for k in range(5)
    )
    pair_name = left_name + '_' + right_name

    rows = get_data(left_df, right_df, truth,
                    data_name1=left_name, data_name2=right_name)
    frame = pd.DataFrame(
        rows,
        columns=['dataset1_name', 'dataset2_name', 'attr1_name',
                 'attr2_name', 'attribute_match', 'constraints'],
    )
    pickle_path = 'test_{}.p'.format(pair_name)
    frame.to_pickle(pickle_path)

    samples = Dataset(pickle_path, tokenizer,
                      dat_fname='{0}_12_test.dat'.format(opt.dataset))
    # One batch that holds the whole test set, in row order.
    loader = DataLoader(dataset=samples, batch_size=len(samples), shuffle=False)

    feature_cols = ['text_raw_indices1', 'aspect_indices1',
                    'text_raw_indices2', 'aspect_indices2', 'constraints']
    outcome = _test_model(loader, model, feature_cols, rows)

    return _test_on_ground_truth(truth, outcome[1])

def _test_on_ground_truth_valentine(ground_tr, predicted):
    i=0
    total_tr = 0
    for it in predicted.items():
        for gr in ground_tr:
            if gr == (it[0][0][1], it[0][1][1]):
                total_tr+=1
        i+=1
    return total_tr/len(ground_tr)

def test_valentine_models(df_and_ground, matcher):
    """Match one dataset pair with a Valentine *matcher* and score the result.

    *df_and_ground* is indexed as (name1, name2, df1, df2, ground truth); the
    two names are read but not used here.

    Returns ``(score, matches)`` where ``matches`` is the output of
    ``valentine_match`` and ``score`` comes from
    ``_test_on_ground_truth_valentine``.
    """
    _left_name, _right_name, left_df, right_df, truth = (
        df_and_ground[k] for k in range(5)
    )
    matches = valentine_match(left_df, right_df, matcher)
    score = _test_on_ground_truth_valentine(truth, matches)
    return score, matches



In [None]:
# Folder whose entries are the evaluation datasets; each entry name is later
# handed to get_data_and_ground.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)

# Settings object shared (as the global `opt`) by the tokenizer/embedding
# builders, the model constructor and the test_dl_models* helpers.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_3'      # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 28        # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()

# Checkpoint locations; pathes[i-1] is passed to torch.load by the evaluation loop.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('pathes.p', 'rb') as file:
    pathes = pickle.load(file)
# Bare expression: shows the value as the notebook cell output.
pathes

In [None]:
# Compare every AOA_4 checkpoint with the COMA baseline.
# For each index i the tokenizer and embedding matrix of training split i are
# built, checkpoint pathes[i-1] is loaded, and both matchers are scored on all
# folders of `dirs` that are not contained in names[:i+1].
all_acc = []
for i in range(1, 17):
    number = str(i)
    fnames = ['./datasets/omap/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_4(embedding_matrix, opt)
    model_1.load_state_dict(state_dict=torch.load(pathes[i-1]))
    model_1.eval()

    acc_models = {'AOA_4': [], 'COMA': []}
    # The slice does not change inside the inner loop, so take it once.
    excluded = names[:i+1]

    for dir_ in dirs:
        if dir_ in excluded:
            continue
        # A fresh matcher per evaluated folder; none is built for skipped ones.
        matcher_1 = Coma(strategy="COMA_OPT")
        data = get_data_and_ground(dir_)

        acc1 = test_dl_models(data, model_1)
        acc2 = test_valentine_models(data, matcher_1)

        # Element 0 of each result is the accuracy value.
        acc_models['AOA_4'].append(acc1[0])
        acc_models['COMA'].append(acc2[0])

    all_acc.append(acc_models)

In [None]:
# Print the mean AOA_4 and COMA accuracy of every all_acc entry.
# NOTE(review): this rebinds the global `i` to the last entry of all_acc.
for i in all_acc:
    print(np.mean(i['AOA_4']), np.mean(i['COMA']))

In [None]:
# Persist all_acc to accuracy.p.
with open('accuracy.p', 'wb') as file:
    pickle.dump(all_acc, file)

In [None]:
import matplotlib.pyplot as plt
# Mean accuracy per all_acc entry for the DL model and for COMA.
y1 = [np.mean(i['AOA_4']) for i in all_acc]
y2 = [np.mean(i['COMA']) for i in all_acc]
x = list(range(2, 12))

# Create the figure and axes explicitly instead of fetching the axes with
# plt.subplot() after plotting, which depends on implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(x, y1[:10])
ax.plot(x, y2[:10])
ax.legend(['DL model', 'COMA'])
ax.set_xlabel('количество обучающих данных')
ax.set_ylabel('точность моделей (accuracy)')
plt.show()

С 1 ограничением

In [None]:
def test_dl_models_c1(df_and_ground, model):
    """Score *model* on one dataset pair, building the rows with ``get_data_2``.

    *df_and_ground* is indexed as (name1, name2, df1, df2, ground truth).
    The candidate rows are pickled to ``test_<name1>_<name2>.p``, wrapped in a
    ``Dataset`` (using the module-level ``tokenizer`` and ``opt``), served as
    one batch and evaluated with ``_test_model`` and ``_test_on_ground_truth``.

    Returns whatever ``_test_on_ground_truth`` returns.
    """
    left_name, right_name, left_df, right_df, truth = (
        df_and_ground[k] for k in range(5)
    )
    pair_name = left_name + '_' + right_name

    rows = get_data_2(left_df, right_df, truth,
                      data_name1=left_name, data_name2=right_name)
    frame = pd.DataFrame(
        rows,
        columns=['dataset1_name', 'dataset2_name', 'attr1_name',
                 'attr2_name', 'attribute_match', 'constraints'],
    )
    pickle_path = 'test_{}.p'.format(pair_name)
    frame.to_pickle(pickle_path)

    samples = Dataset(pickle_path, tokenizer,
                      dat_fname='{0}_12_test.dat'.format(opt.dataset))
    # One batch that holds the whole test set, in row order.
    loader = DataLoader(dataset=samples, batch_size=len(samples), shuffle=False)

    feature_cols = ['text_raw_indices1', 'aspect_indices1',
                    'text_raw_indices2', 'aspect_indices2', 'constraints']
    outcome = _test_model(loader, model, feature_cols, rows)

    return _test_on_ground_truth(truth, outcome[1])

In [None]:
# Checkpoint locations; pathes_c1[i-1] is passed to torch.load by the
# evaluation loop.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('pathes_with_1_constraint.p', 'rb') as file:
    pathes_c1 = pickle.load(file)

In [None]:
# Folder whose entries are the evaluation datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)
# Settings object for the 'tpc_c1' run, shared as the global `opt`.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_c1'     # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 2         # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()

In [None]:
# Evaluate every AOA_6 checkpoint listed in pathes_c1.
# For each index i the tokenizer and embedding matrix of training split i are
# built, checkpoint pathes_c1[i-1] is loaded, and the model is scored on all
# folders of `dirs` that are not contained in names[:i+1].
all_acc = []
for i in range(1, 17):
    number = str(i)
    fnames = ['./datasets/with_1_constraint/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models_c1 reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_6(embedding_matrix, opt)
    model_1.load_state_dict(state_dict=torch.load(pathes_c1[i-1]))
    model_1.eval()

    acc_models = {'AOA_6': []}
    # The slice does not change inside the inner loop, so take it once.
    excluded = names[:i+1]

    # The COMA matcher that used to be built here was never used in this cell.
    for dir_ in dirs:
        if dir_ in excluded:
            continue
        data = get_data_and_ground(dir_)

        acc1 = test_dl_models_c1(data, model_1)

        # Element 0 of the result is the accuracy value.
        acc_models['AOA_6'].append(acc1[0])

    all_acc.append(acc_models)

In [None]:
# Persist all_acc to accuracy_c1.p.
with open('accuracy_c1.p', 'wb') as file:
    pickle.dump(all_acc, file)

instance-based+schema-based+теги

In [None]:
def _test_on_ground_truth_f(ground_tr, predicted):
    predicted = [([pr[0][0].split('#')[0],pr[0][1].split('#')[0]],pr[1]) for pr in predicted]
    answer = {}
    predicted_unique = {i[0][1]:0 for i in predicted}
    
    for truth in ground_tr:
        chosen_pr=''
        for pr in predicted:
            if truth[0]==pr[0][0] and  predicted_unique[pr[0][1]]<1:
                if type(answer.get(truth[0], 0)) ==int: 
                    answer[truth[0]] = (pr[0][1], pr[1])
                    chosen_pr=pr[0][1]
                else:
                    prob_1 = nn.functional.softmax(answer[truth[0]][1], dim=0)
                    prob_2 = nn.functional.softmax(pr[1], dim=0)
                    if prob_1[1]<prob_2[1]:
                        answer[truth[0]] = (pr[0][1], pr[1])
                        chosen_pr=pr[0][1]
        predicted_unique[chosen_pr]=predicted_unique.get(chosen_pr,0)+1
    total_tr = 0
    for i in ground_tr:
        try:
            if i == (i[0], answer[i[0]][0]):
                total_tr+=1
        except:
            pass
    
    return total_tr/len(ground_tr), answer
                    

def test_dl_models_f(df_and_ground, model,hxl_tags):
    """Score *model* on one dataset pair, building the rows with ``get_data_f``.

    *df_and_ground* is indexed as (name1, name2, df1, df2, ground truth);
    *hxl_tags* is forwarded unchanged to ``get_data_f``. The candidate rows
    are pickled to ``test_<name1>_<name2>.p``, wrapped in a ``Dataset`` (using
    the module-level ``tokenizer`` and ``opt``), served as one batch and
    evaluated with ``_test_model`` and ``_test_on_ground_truth_f``.

    Returns whatever ``_test_on_ground_truth_f`` returns.
    """
    left_name, right_name, left_df, right_df, truth = (
        df_and_ground[k] for k in range(5)
    )
    pair_name = left_name + '_' + right_name

    rows = get_data_f(left_df, right_df, truth, hxl_tags,
                      data_name1=left_name, data_name2=right_name)
    frame = pd.DataFrame(
        rows,
        columns=['dataset1_name', 'dataset2_name', 'attr1_name',
                 'attr2_name', 'attribute_match', 'constraints'],
    )
    pickle_path = 'test_{}.p'.format(pair_name)
    frame.to_pickle(pickle_path)

    samples = Dataset(pickle_path, tokenizer,
                      dat_fname='{0}_1_test.dat'.format(opt.dataset))
    # One batch that holds the whole test set, in row order.
    loader = DataLoader(dataset=samples, batch_size=len(samples), shuffle=False)

    feature_cols = ['text_raw_indices1', 'aspect_indices1',
                    'text_raw_indices2', 'aspect_indices2', 'constraints']
    outcome = _test_model(loader, model, feature_cols, rows)

    return _test_on_ground_truth_f(truth, outcome[1])

In [None]:
# Checkpoint locations; pathes_f[i-1] is passed to torch.load by the
# evaluation loop.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('pathes_with_feature_extraction.p', 'rb') as file:
    pathes_f = pickle.load(file)

In [None]:
# Folder whose entries are the evaluation datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)
# Settings object for the 'tpc_f' run, shared as the global `opt`.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_f'      # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 30        # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()


In [None]:
from hxl_tag import HXLTagger
# Compute the HXL tags of both data frames of every evaluated folder and
# collect them as [tags_of_df1, tags_of_df2] pairs, in `dirs` order.
hxl_tagger = HXLTagger()
hxl_tags = []
# NOTE(review): `i` is not assigned in this cell - the filter below uses
# whatever value an earlier cell left behind. The evaluation loops index
# hxl_tags by position while filtering with a changing `i`, so confirm that
# the folders selected here line up with the ones they iterate over.
for dir_1 in dirs:
    if dir_1 not in names[:i+1]:
        dfs = get_data_and_ground(dir_1)
        # Elements 2 and 3 are the two data frames of the pair.
        df1 = dfs[2]
        df2 = dfs[3]
        t1 = hxl_tagger.get_hxl_tags(df1)
        t2 = hxl_tagger.get_hxl_tags(df2)
        hxl_tags.append([t1,t2])

In [None]:
# Persist hxl_tags to hxl_tags_test.p.
with open('hxl_tags_test.p', 'wb') as file:
    pickle.dump(hxl_tags, file)

In [None]:
# Reload hxl_tags from hxl_tags_test.p.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('hxl_tags_test.p', 'rb') as file:
    hxl_tags = pickle.load(file)

In [None]:
# Evaluate every AOA_6 checkpoint listed in pathes_f.
# For each index i the tokenizer and embedding matrix of training split i are
# built, checkpoint pathes_f[i-1] is loaded, and the model is scored on all
# folders of `dirs` that are not contained in names[:i+1].
all_acc = []
for i in range(1, 17):
    number = str(i)
    fnames = ['./datasets/with_feature_extraction/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models_f reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_6(embedding_matrix, opt)
    model_1.load_state_dict(state_dict=torch.load(pathes_f[i-1]))
    model_1.eval()

    acc_models = {'AOA_6_f': []}

    # The slice does not change inside the inner loop, so take it once.
    excluded = names[:i+1]
    test_dirs = [dir_ for dir_ in dirs if dir_ not in excluded]

    # hxl_tags is consumed by position: the j-th evaluated folder uses hxl_tags[j].
    # NOTE(review): this presumes hxl_tags was built for exactly these folders
    # in this order - confirm against the cell that creates it.
    for j, dir_ in enumerate(test_dirs):
        data = get_data_and_ground(dir_)

        acc1 = test_dl_models_f(data, model_1, hxl_tags[j])

        # Element 0 of the result is the accuracy value.
        acc_models['AOA_6_f'].append(acc1[0])

    all_acc.append(acc_models)

In [None]:
# Persist all_acc to accuracy_f.p.
with open('accuracy_f.p', 'wb') as file:
    pickle.dump(all_acc, file)

Федеративное обучение с 1 ограничением

In [None]:
from collections import OrderedDict

In [None]:
def set_parameters(net, state_dict):
    """Load the tensors of *state_dict* into *net*, pairing entries by position.

    The keys of *state_dict* are ignored: its values are zipped, in order,
    with the keys of ``net.state_dict()`` and the resulting mapping is passed
    to ``net.load_state_dict``.

    Args:
        net: module exposing ``state_dict()`` and ``load_state_dict()``.
        state_dict: ordered mapping of tensors, one per entry of *net*'s own
            state dict.

    Each value is copied as is (detached, on CPU). Rebuilding it with
    ``torch.Tensor(...)`` from a numpy array would force the default floating
    dtype onto every entry.
    """
    remapped = OrderedDict(
        (key, val.detach().cpu().clone())
        for key, val in zip(net.state_dict().keys(), state_dict.values())
    )
    net.load_state_dict(remapped)

In [None]:
# Folder whose entries are the evaluation datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)
# Settings object for the 'tpc_c1' run, shared as the global `opt`.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_c1'     # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 2         # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()

In [None]:
# Evaluate the federated AOA_6 checkpoints of the 'tpc_c1' run.
# For each index i (1..16) and round j (1..10) the checkpoint
# state_dict_2/tpc_c1/model_{i}_round_{j}.pth is loaded and scored on all
# folders of `dirs` that are not contained in names[:i+1]. all_acc receives
# one {'AOA_6_<j>': [...]} dict per (i, j), rounds varying fastest.
all_acc = []
for i in range(1, 17):
    print("Iteration: ", i)
    number = str(i)
    fnames = ['./datasets/with_1_constraint/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models_c1 reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_6(embedding_matrix, opt)
    # The slice does not change inside the inner loops, so take it once.
    excluded = names[:i+1]

    for j in range(1, 11):
        model_1.load_state_dict(state_dict=torch.load(f"state_dict_2/tpc_c1/model_{i}_round_{j}.pth"))
        model_1.eval()

        round_key = f'AOA_6_{j}'
        acc_models_fed = {round_key: []}

        # The COMA matcher that used to be built here was never used in this cell.
        for dir_ in dirs:
            if dir_ in excluded:
                continue
            data = get_data_and_ground(dir_)

            acc1 = test_dl_models_c1(data, model_1)

            # Element 0 of the result is the accuracy value.
            acc_models_fed[round_key].append(acc1[0])

        all_acc.append(acc_models_fed)
        

In [None]:
# Persist all_acc to accuracy_fed_c1.p.
with open('accuracy_fed_c1.p', 'wb') as file:
    pickle.dump(all_acc, file)

In [None]:
# Reload all_acc from accuracy_fed_c1.p.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('accuracy_fed_c1.p', 'rb') as file:
    all_acc = pickle.load(file)

Федеративное обучение instance-based+schema-based+теги

In [None]:
# Reload hxl_tags from hxl_tags_test.p.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('hxl_tags_test.p', 'rb') as file:
    hxl_tags = pickle.load(file)

In [None]:
# Folder whose entries are the evaluation datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)
# Settings object for the 'tpc_f' run, shared as the global `opt`.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_f'      # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 30        # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()

In [None]:
# Evaluate the federated AOA_6 checkpoints of the 'tpc_f' run.
# For each index i (1..16) and round j (1..10) the checkpoint
# state_dict_2/tpc_f/model_{i}_round_{j}.pth is loaded and scored on all
# folders of `dirs` that are not contained in names[:i+1]. all_acc receives
# one {'AOA_6_<j>': [...]} dict per (i, j), rounds varying fastest.
all_acc = []
for i in range(1, 17):
    print("Iteration: ", i)
    number = str(i)
    fnames = ['./datasets/with_feature_extraction/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models_f reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_6(embedding_matrix, opt)
    # The folder selection does not depend on the round, so compute it once.
    excluded = names[:i+1]
    test_dirs = [dir_ for dir_ in dirs if dir_ not in excluded]

    for j in range(1, 11):
        model_1.load_state_dict(state_dict=torch.load(f"state_dict_2/tpc_f/model_{i}_round_{j}.pth"))
        model_1.eval()

        round_key = f'AOA_6_{j}'
        acc_models_fed = {round_key: []}

        # hxl_tags is consumed by position: the ind-th evaluated folder uses hxl_tags[ind].
        # NOTE(review): this presumes hxl_tags was built for exactly these
        # folders in this order - confirm against the cell that creates it.
        for ind, dir_ in enumerate(test_dirs):
            data = get_data_and_ground(dir_)

            acc1 = test_dl_models_f(data, model_1, hxl_tags[ind])

            # Element 0 of the result is the accuracy value.
            acc_models_fed[round_key].append(acc1[0])

        all_acc.append(acc_models_fed)



In [None]:
# Persist all_acc to accuracy_fed_f.p.
with open('accuracy_fed_f.p', 'wb') as file:
    pickle.dump(all_acc, file)

Федеративное обучение instance-based+schema-based

In [None]:
# Folder whose entries are the evaluation datasets.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pth = r'C:\Users\shepe\Downloads\Valentine-datasets\prospect\Unionable'
dirs = os.listdir(pth)
# Settings object for the 'tpc_3' run, shared as the global `opt`.
class opt(object):
    def __init__(self):
        self.max_seq_len = 240      # passed to build_tokenizer
        self.dataset = 'tpc_3'      # prefix of the cached *.dat file names
        self.embed_dim = 300        # passed to build_embedding_matrix
        self.hidden_dim = 300       # presumably read by the model - confirm in models.py
        self.class_dim = 2          # presumably the number of output classes - confirm
        self.constr_dim = 28        # presumably the width of the 'constraints' input - confirm
# The class name is rebound to its only instance from here on.
opt = opt()

In [None]:
# Evaluate the federated AOA_5 checkpoints.
# For each index i (1..16) and round j (1..10) the checkpoint
# state_dict_2/tpc_2/model_{i}_round_{j}.pth is loaded and scored on all
# folders of `dirs` that are not contained in names[:i+1]. all_acc receives
# one {'AOA_5_<j>': [...]} dict per (i, j), rounds varying fastest.
# NOTE(review): checkpoints come from a 'tpc_2' folder while the cached
# tokenizer/embedding files are named after opt.dataset - confirm they belong
# to the same run.
all_acc = []
for i in range(1, 17):
    print("Iteration: ", i)
    number = str(i)
    fnames = ['./datasets/omap/train_tpc{}.p'.format(number)]

    # `tokenizer` must stay a module-level name: test_dl_models reads it as a global.
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_{1}_tokenizer.dat'.format(opt.dataset, number))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_{2}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset, number))

    model_1 = AOA_5(embedding_matrix, opt)
    # The slice does not change inside the inner loops, so take it once.
    excluded = names[:i+1]

    for j in range(1, 11):
        model_1.load_state_dict(state_dict=torch.load(f"state_dict_2/tpc_2/model_{i}_round_{j}.pth"))
        model_1.eval()

        round_key = f'AOA_5_{j}'
        acc_models_fed = {round_key: []}

        # The COMA matcher that used to be built here was never used in this cell.
        for dir_ in dirs:
            if dir_ in excluded:
                continue
            data = get_data_and_ground(dir_)

            acc1 = test_dl_models(data, model_1)

            # Element 0 of the result is the accuracy value.
            acc_models_fed[round_key].append(acc1[0])

        all_acc.append(acc_models_fed)

In [None]:
# Persist all_acc to accuracy_fed.p.
with open('accuracy_fed.p', 'wb') as file:
    pickle.dump(all_acc, file)

In [None]:
# Load the federated results into fed_acc.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('accuracy_fed.p', 'rb') as file:
    fed_acc = pickle.load(file)

In [None]:
# Load the results stored in accuracy.p into all_acc_1; the plot below reads
# the 'AOA_4' and 'COMA' entries of each element.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('accuracy.p', 'rb') as file:
    all_acc_1 = pickle.load(file)

In [None]:
# fed_acc holds one single-key dict per entry; the key of entry number i is
# 'AOA_5_<i % 10 + 1>', i.e. the keys cycle through rounds 1..10.
# mean_acc: mean accuracy of every entry, in order.
mean_acc = [
    np.mean(acc[f'AOA_5_{i % 10 + 1}'])
    for i, acc in enumerate(fed_acc)
]
print(len(mean_acc))
# max_acc: best mean accuracy within each complete group of 10 consecutive entries.
max_acc = [
    max(mean_acc[j:j+10])
    for j in range(0, len(mean_acc)-9, 10)
]
# Bare expression: shows the value as the notebook cell output.
max_acc

In [None]:
import matplotlib.pyplot as plt
# Mean accuracy per all_acc_1 entry for the DL model and COMA, plus the best
# federated result per group (max_acc).
y1 = [np.mean(i['AOA_4']) for i in all_acc_1]
y2 = [np.mean(i['COMA']) for i in all_acc_1]
y3 = max_acc
x = list(range(2, 17))

# Create the figure and axes explicitly instead of fetching the axes with
# plt.subplot() after plotting, which depends on implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(x, y1[:15])
ax.plot(x, y2[:15])
ax.plot(x, y3[:15])
ax.legend(['DL model', 'COMA', 'FED'])
ax.set_xlabel('количество обучающих данных')
ax.set_ylabel('точность моделей (accuracy)')
plt.show()