# Data functions

In [2]:
import os
import copy
import numpy as np
import torch
from torch.utils.data import Dataset
# !pip install pytorch_transformers
from pytorch_transformers import BertTokenizer
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
#is designed to adjust the length of a sequence (such as a list or array) to a specified maximum length
#by either padding or truncating the sequence as necessary
#padding='post' add words in the end of the sentence if necessary 
#trancating='post' trancate sentence in the end if the length of a sentence is longer than maxlen
    
    x = (np.ones(maxlen) * value).astype(dtype)
    
    if truncating == 'pre':
        trunc = sequence[-maxlen:] 
    else:
        trunc = sequence[:maxlen]
    
    trunc = np.asarray(trunc, dtype=dtype)

    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x

In [4]:
class Tokenizer4Bert:
#is designed to handle the tokenization of text for use with a BERT model
    def __init__(self, max_seq_len, pretrained_bert_name):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 
        #tokenize the text and thenmap it with the corresponding ids
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

    def id_to_sequence(self, sequence, reverse=False, padding='post', truncating='post'):
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

In [5]:
class DepInstanceParser():
    def __init__(self, basicDependencies, tokens):
        self.basicDependencies = basicDependencies
        self.tokens = tokens
        self.words = []
        self.dep_governed_info = []
        self.dep_parsing()


    def dep_parsing(self):
        if len(self.tokens) > 0:
            words = []
            for token in self.tokens:
                token['word'] = token
                words.append(self.change_word(token['word'])) 
            dep_governed_info = [
                {"word": word}
                for i,word in enumerate(words)
            ]
            self.words = words
        else:
            dep_governed_info = [{}] * len(self.basicDependencies)
        for dep in self.basicDependencies:
            dependent_index = dep['dependent'] - 1
            governed_index = dep['governor'] - 1
            dep_governed_info[dependent_index] = {
                "governor": governed_index,
                "dep": dep['dep']
            }
        self.dep_governed_info = dep_governed_info #contains detailed information about the dependencies among these tokens.
    def change_word(self, word):
    #designed to handle specific formatting issues within the text data it processes, particularly dealing 
    #with tokens representing left and right parentheses.
        
        if "-RRB-" in word:
        #The method first checks if the string "-RRB-" is present in the word. This string is often used in 
        #linguistic data to represent a right parenthesis ) to prevent misinterpretation during parsing processes. 
        #If "-RRB-" is found, it is replaced with ")".
            return word.replace("-RRB-", ")")
        
        if "-LRB-" in word:
        #Next, the method checks for the presence of "-LRB-" in the word. Similarly, this string represents a left 
        #parenthesis ( and is replaced by "(".
            return word.replace("-LRB-", "(")
        return word

    def get_first_order(self, direct=False):
        #designed to generate matrices representing the adjacency and types of dependency relationships between 
        #tokens in a sentence based on their parsed dependencies.
        
        #indicate whether there is a direct dependency link between the tokens
        dep_adj_matrix  = [[0] * len(self.dep_governed_info) for _ in range(len(self.dep_governed_info))]
        
        #indicate the type of dependency (like "subj", "obj") between tokens instead of binary indicators as in dep_adj_matrix
        dep_type_matrix = [["none"] * len(self.dep_governed_info) for _ in range(len(self.dep_governed_info))]
        
        
        for i, dep_info in enumerate(self.dep_governed_info):
            governor = dep_info["governor"] #the index of the token that governs the current token
            dep_type = dep_info["dep"] #the type of the dependency
            
            #indicate the existance of the dependency between tokens
            dep_adj_matrix[i][governor] = 1
            dep_adj_matrix[governor][i] = 1
            
            #If direct is False, both [i][governor] and [governor][i] are set to the dependency type.
            #If direct is True, the entries are suffixed to indicate the direction (_in for incoming, _out for outgoing 
            #dependencies relative to each token).
            
            dep_type_matrix[i][governor] = dep_type if direct is False else "{}_in".format(dep_type)
            dep_type_matrix[governor][i] = dep_type if direct is False else "{}_out".format(dep_type)
        
        return dep_adj_matrix, dep_type_matrix

    def get_next_order(self, dep_adj_matrix, dep_type_matrix):
        new_dep_adj_matrix = copy.deepcopy(dep_adj_matrix)
        new_dep_type_matrix = copy.deepcopy(dep_type_matrix)
        for target_index in range(len(dep_adj_matrix)):
            for first_order_index in range(len(dep_adj_matrix[target_index])):
                if dep_adj_matrix[target_index][first_order_index] == 0:
                    continue
                for second_order_index in range(len(dep_adj_matrix[first_order_index])):
                    if dep_adj_matrix[first_order_index][second_order_index] == 0:
                        continue
                    if second_order_index == target_index:
                        continue
                    if new_dep_adj_matrix[target_index][second_order_index] == 1:
                        continue
                    new_dep_adj_matrix[target_index][second_order_index] = 1
                    new_dep_type_matrix[target_index][second_order_index] = dep_type_matrix[first_order_index][second_order_index]
        return new_dep_adj_matrix, new_dep_type_matrix

    def get_second_order(self, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_first_order(direct=direct)
        return self.get_next_order(dep_adj_matrix, dep_type_matrix)

    def get_third_order(self, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_second_order(direct=direct)
        return self.get_next_order(dep_adj_matrix, dep_type_matrix)

    def search_dep_path(self, start_idx, end_idx, adj_max, dep_path_arr):
        for next_id in range(len(adj_max[start_idx])):
            if next_id in dep_path_arr or adj_max[start_idx][next_id] in ["none"]:
                continue
            if next_id == end_idx:
                return 1, dep_path_arr + [next_id]
            stat, dep_arr = self.search_dep_path(next_id, end_idx, adj_max, dep_path_arr + [next_id])
            if stat == 1:
                return stat, dep_arr
        return 0, []

    def get_dep_path(self, start_index, end_index, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_first_order(direct=direct)
        _, dep_path = self.search_dep_path(start_index, end_index, dep_type_matrix, [start_index])
        return dep_path

In [6]:
class DefaultConfig:
    def __init__(self):
        self.print_sent = False
        self.max_seq_len = 256 

def get_default_config():
    return DefaultConfig()

In [7]:
class ABSADataset(Dataset):
    def __init__(self, datafile, tokenizer, opt, deptype2id=None, dep_order="first"):
        self.datafile = datafile
        self.depfile = "{}.dep".format(datafile)
        self.tokenizer = tokenizer
        self.opt = opt 
        self.deptype2id = deptype2id
        self.dep_order = dep_order
        self.textdata = ABSADataset.load_datafile(self.datafile)
        self.depinfo = ABSADataset.load_depfile(self.depfile)
        self.polarity2id = self.get_polarity2id()
        self.feature = []
        self.use_knogcn = opt.modules['knogcn']
        for sentence,depinfo in zip(self.textdata, self.depinfo):
            self.feature.append(self.create_feature(sentence, depinfo, opt.print_sent))
            
        #print(self.feature[:1])

    def __getitem__(self, index):
        return self.feature[index]

    def __len__(self):
        return len(self.feature)

    def ws(self, text):
        tokens = []
        valid_ids = []
        for i, word in enumerate(text):
            if len(text) <= 0:
                continue
            token = self.tokenizer.tokenizer.tokenize(word)
            tokens.extend(token)
            for m in range(len(token)):
                if m == 0:
                    valid_ids.append(1)
                else:
                    valid_ids.append(0)
        token_ids = self.tokenizer.tokenizer.convert_tokens_to_ids(tokens)
        return tokens, token_ids, valid_ids

    def create_feature(self, sentence, depinfo, print_sent = False):
        
        text_left, text_right, aspect, polarity = sentence
        
        cls_id = self.tokenizer.tokenizer.vocab["[CLS]"]

        sep_id = self.tokenizer.tokenizer.vocab["[SEP]"]


        doc = text_left + " " + aspect + " " + text_right
        
        left_tokens, left_token_ids, left_valid_ids = self.ws(text_left.split(" "))
        
        right_tokens, right_token_ids, right_valid_ids = self.ws(text_right.split(" "))
        
        aspect_tokens, aspect_token_ids, aspect_valid_ids = self.ws(aspect.split(" "))
        
        tokens = left_tokens + aspect_tokens + right_tokens
        
        input_ids = [cls_id] + left_token_ids + aspect_token_ids + right_token_ids + [sep_id] + aspect_token_ids + [sep_id]
        valid_ids = [1] + left_valid_ids + aspect_valid_ids + right_valid_ids + [1] + aspect_valid_ids + [1]
        mem_valid_ids = [0] + [0] * len(left_tokens) + [1] * len(aspect_tokens) + [0] * len(right_tokens) # aspect terms mask
        
        segment_ids = [0] * (len(tokens) + 2) + [1] * (len(aspect_tokens)+1)
        
        
        dep_instance_parser = DepInstanceParser(basicDependencies=depinfo, tokens=[])
        if self.dep_order == "first":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_first_order()
        elif self.dep_order == "second":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_second_order()
        elif self.dep_order == "third":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_third_order()
        else:
            raise ValueError()
        
        
        token_head_list = []
        
        for input_id, valid_id in zip(input_ids, valid_ids):
            if input_id == cls_id:
                continue
            if input_id == sep_id:
                break
            if valid_id == 1:
                token_head_list.append(input_id)
                
        dep_adj_matrix_knogcn=copy.deepcopy(dep_adj_matrix)
        dep_type_matrix_knogcn=copy.deepcopy(dep_type_matrix)  
        
        if self.use_knogcn:
            self.onto_words=onto_words
            
            for i in range(len(token_head_list)):
                check=token_head_list[i] in self.onto_words
                if not check:
                    for j in range(len(dep_adj_matrix[i])):
                        dep_adj_matrix_knogcn[i][j]=0
                        dep_type_matrix_knogcn[i][j]='none'
        

        input_ids = self.tokenizer.id_to_sequence(input_ids)
        valid_ids = self.tokenizer.id_to_sequence(valid_ids)
        segment_ids = self.tokenizer.id_to_sequence(segment_ids)
        mem_valid_ids = self.tokenizer.id_to_sequence(mem_valid_ids)
       
        size = input_ids.shape[0]
        
        if print_sent:
            print(doc)
            print(len(dep_adj_matrix[0]))


        final_dep_adj_matrix = [[0] * size for _ in range(size)]
        final_dep_value_matrix = [[0] * size for _ in range(size)]
        

        
        for i in range(len(token_head_list)):
            for j in range(len(dep_adj_matrix[i])):
                if j >= size:
                    break
                final_dep_adj_matrix[i+1][j] = dep_adj_matrix[i][j]
                final_dep_value_matrix[i+1][j] = self.deptype2id[dep_type_matrix[i][j]]
        
        
        final_dep_adj_matrix_knogcn = [[0] * size for _ in range(size)]
        final_dep_value_matrix_knogcn = [[0] * size for _ in range(size)]
        
        
        for i in range(len(token_head_list)):
            for j in range(len(dep_adj_matrix_knogcn[i])):
                if j >= size:
                    break
                final_dep_adj_matrix_knogcn[i+1][j] = dep_adj_matrix_knogcn[i][j]
                final_dep_value_matrix_knogcn[i+1][j] = self.deptype2id[dep_type_matrix_knogcn[i][j]]
       
        
        return {
            "input_ids":torch.tensor(input_ids),
            "valid_ids":torch.tensor(valid_ids),
            "segment_ids":torch.tensor(segment_ids),
            "mem_valid_ids":torch.tensor(mem_valid_ids),
            "dep_adj_matrix":torch.tensor(final_dep_adj_matrix),
            "dep_value_matrix":torch.tensor(final_dep_value_matrix),
            "dep_adj_matrix_knogcn":torch.tensor(final_dep_adj_matrix_knogcn),
            "dep_value_matrix_knogcn":torch.tensor(final_dep_value_matrix_knogcn),
            "polarity": self.polarity2id[polarity],
            "raw_text": doc,
            "aspect": aspect
        }


    @staticmethod
    def load_depfile(filename):
        data = []
        with open(filename, 'r') as f:
            dep_info = []
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    items = line.split("\t")
                    dep_info.append({
                        "governor": int(items[0]),
                        "dependent": int(items[1]),
                        "dep": items[2],
                    })
                else:
                    if len(dep_info) > 0:
                        data.append(dep_info)
                        dep_info = []
            if len(dep_info) > 0:
                data.append(dep_info)
                dep_info = []
        return data

    @staticmethod
    def load_datafile(filename):
        data = []
        with open(filename, 'r') as f:
            lines = f.readlines()
            for i in range(0, len(lines), 3):
                text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
                aspect = lines[i + 1].lower().strip()
                text_right = text_right.replace("$T$", aspect)
                polarity = lines[i + 2].strip()
                data.append([text_left, text_right, aspect, polarity])

        return data

    @staticmethod
    def load_deptype_map(opt):
        deptype_set = set()
        for filename in [opt.train_file, opt.test_file, opt.val_file]:
            filename = "{}.dep".format(filename)
            if os.path.exists(filename) is False:
                continue
            data = ABSADataset.load_depfile(filename)
            for dep_info in data:
                for item in dep_info:
                    deptype_set.add(item['dep'])
        deptype_map = {"none": 0}
        for deptype in sorted(deptype_set, key=lambda x:x):
            deptype_map[deptype] = len(deptype_map)
        return deptype_map

    @staticmethod
    def get_polarity2id():
        polarity_label = ["-1","0","1"]
        return dict([(label, idx) for idx,label in enumerate(polarity_label)])


# TGCN Model

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from pytorch_transformers import BertPreTrainedModel,BertModel

In [9]:
class GraphConvolution(nn.Module):
    """
    Simple GCN layer
    """
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)

    def forward(self, text, adj):
        hidden = torch.matmul(text, self.weight)
        denom = torch.sum(adj, dim=2, keepdim=True) + 1
        output = torch.matmul(adj, hidden) / denom
        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [10]:
class TypeGraphConvolution(nn.Module):
    """
    TGCN Layer
    """
    def __init__(self, in_features, out_features, embedding_dim, bias=True):
        super(TypeGraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        self.dense = nn.Linear(embedding_dim, in_features, bias=False) 
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)

    def forward(self, text, adj, dep_embed):
        batch_size, max_len, feat_dim = text.shape 
        val_us = text.unsqueeze(dim=2) 
        val_us = val_us.repeat(1, 1, max_len, 1) 
        val_sum = val_us + self.dense(dep_embed) 
        adj_us = adj.unsqueeze(dim=-1) 
        adj_us = adj_us.repeat(1, 1, 1, feat_dim) 
        hidden = torch.matmul(val_sum, self.weight) 
        output = hidden.transpose(1,2) * adj_us 

        output = torch.sum(output, dim=2) 

        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [11]:
class SemGraphConvolution(nn.Module):
    """
    Semantic GCN layer with attention adjacency matrix 
    """
    def __init__(self, in_features, out_features, attention_heads = 1, bias=True):
        super(SemGraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)

    def forward(self, text, adj):
        hidden = torch.matmul(text, self.weight)
        denom = torch.sum(adj, dim=2, keepdim=True) + 1
        output = torch.matmul(adj, hidden) / denom
        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [12]:
class SelfAttention(nn.Module):
    """
    it could be the functino fro the sintectic module?????
    """
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)
        
    def forward(self, x):
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x) 
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5) 
        attention = self.softmax(scores)
        return attention

In [13]:
class MultiHeadAttention(nn.Module):

    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0 # --- devides and return the value of ther reminder --
        #we should have a size of d_odel and h equal each other

        self.d_k = d_model // h # devide with integral result -- rounds the devision
        self.h = h
        self.linears = self.clones(nn.Linear(d_model, d_model), 2)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, mask=None):
        if mask is not None:
            mask = mask[:, :, :query.size(1)]
            mask = mask.unsqueeze(1)
            
        nbatches = query.size(0)
        query, key = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linears, (query, key))]
        
        attn = self.attention(query, key, mask=mask, dropout=self.dropout)

        return attn
    

    def attention(self, query, key, mask=None, dropout=None):
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) 
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)
        if dropout is not None:
            p_attn = dropout(p_attn)

        return p_attn
    
    def clones(self, module, N):
        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [14]:
class AsaTgcnSem(BertPreTrainedModel):
    def __init__(self, config, modules, tokenizer, opt):
#     use_ensemble = True, fusion_type = 'concat', dropout = 0.2, concat_dropout = 0.5,
#                  cooc_path = 'cooc_matrix_ids.csv', cooc = None):
        """
        modules: dictionary of form {'tgcn': bool, 'semgcn': bool, 'lexgcn': bool} # place for the new module!!!!!
        cooc: cooc matrix as dataframe preloaded into memory. if not passed as argument,
        the matrix will be loaded from the specified path.
        """
        
        super(AsaTgcnSem, self).__init__(config)
        self.opt = opt
        self.modules = opt.modules
        
        
        self.use_tgcn, self.use_semgcn, self.use_lexgcn, self.use_knogcn = opt.modules['tgcn'], opt.modules['semgcn'], opt.modules['lexgcn'], opt.modules['knogcn']
        self.num_modules = sum((self.use_tgcn, self.use_semgcn, self.use_lexgcn, self.use_knogcn)) 
        self.use_ensemble = opt.use_ensemble
        self.layer_number_tgcn = opt.num_layers['tgcn'] 
        self.layer_number_sem = opt.num_layers['semgcn'] 
        self.layer_number_lex = opt.num_layers['lexgcn'] 
        self.layer_number_kno = opt.num_layers['knogcn'] 
        

        
        assert self.use_tgcn or self.use_semgcn or self.use_lexgcn or self.use_knogcn 
        assert opt.fusion_type == 'concat' or opt.fusion_type == 'gate' 
        self.fusion_type = opt.fusion_type
        
        self.num_labels = config.num_labels 
        self.num_types = config.num_types
        
        
        self.bert = BertModel(config)
        

        if self.use_tgcn:
            self.TGCNLayers = nn.ModuleList(([TypeGraphConvolution(config.hidden_size, config.hidden_size, config.hidden_size)
                                             for _ in range(self.layer_number_tgcn)]))
        if self.use_semgcn:
            self.SemGCNLayers = nn.ModuleList(([GraphConvolution(config.hidden_size, config.hidden_size)
                                            for _ in range(self.layer_number_sem)]))

            
        if self.use_lexgcn:
            self.LexGCNLayers = nn.ModuleList(([GraphConvolution(config.hidden_size, config.hidden_size)
                                           for _ in range(self.layer_number_lex)]))

        if self.use_knogcn:
            self.KnoGCNLayers = nn.ModuleList(([TypeGraphConvolution(config.hidden_size, config.hidden_size, config.hidden_size)
                                           for _ in range(self.layer_number_kno)]))


       
        
        if self.use_lexgcn:
            
            if opt.cooc is not None: #WHERE THE PATH IS SPECIFIED FOR PRELOADED DATA
                self.cooc = opt.cooc
                
            else:
                self.cooc = pd.read_csv(opt.cooc_path, index_col=0) #WHERE THE PATH IS SPECIFIED FOR PRELOADED DATA
                self.cooc.index = self.cooc.index.astype(int)
                self.cooc.columns = self.cooc.columns.astype(int)
        
        if self.use_knogcn:
            if opt.onto_words is not None:
                self.onto_words = opt.onto_words
            else:
                self.onto_words = pd.read_csv(opt.onto_words_path) 

        # multiplied by two if concat
        if self.fusion_type == 'concat':
            self.fc_single = nn.Linear(config.hidden_size*self.num_modules, self.num_labels)
        elif self.fusion_type == 'gate':
            self.fc_single = nn.Linear(config.hidden_size, self.num_labels)
        
        #WHY MULTIPLIED BY 2
        self.gate_weight = nn.Parameter(torch.FloatTensor(config.hidden_size, config.hidden_size * 2))
        self.gate_bias = nn.Parameter(torch.FloatTensor(config.hidden_size))
    
        self.dropout = nn.Dropout(opt.dropout)
        self.concat_dropout = nn.Dropout(opt.concat_dropout)
        self.ensemble_linear_tgcn = nn.Linear(1, self.layer_number_tgcn)
        self.ensemble_linear_semgcn = nn.Linear(1, self.layer_number_sem)
        self.ensemble_linear_lexgcn = nn.Linear(1, self.layer_number_lex)
        self.ensemble_linear_knogcn = nn.Linear(1, self.layer_number_kno)

        
        self.ensemble = nn.Parameter(torch.FloatTensor(3, 1))
        self.dep_embedding = nn.Embedding(self.num_types, config.hidden_size, padding_idx=0)

    def get_attention(self, val_out, dep_embed, adj):
        batch_size, max_len, feat_dim = val_out.shape
        val_us = val_out.unsqueeze(dim=2)
        val_us = val_us.repeat(1,1,max_len,1)
        val_cat = torch.cat((val_us, dep_embed), -1).float()
        atten_expand = (val_cat * val_cat.transpose(1,2))

        attention_score = torch.sum(atten_expand, dim=-1)
        attention_score = attention_score / np.power(feat_dim, 0.5)
        exp_attention_score = torch.exp(attention_score)
        exp_attention_score = torch.mul(exp_attention_score, adj.float()) # mask
        sum_attention_score = torch.sum(exp_attention_score, dim=-1).unsqueeze(dim=-1).repeat(1,1,max_len)

        attention_score = torch.div(exp_attention_score, sum_attention_score + 1e-10)
        if 'HalfTensor' in val_out.type():
            attention_score = attention_score.half()

        return attention_score
    
    def get_lex_adj(self, input_ids, batch_size, max_len):
        # Initialize an empty adjacency tensor
        adj_tensor = torch.zeros((batch_size, max_len, max_len))
        
        
        # number of non-zero input_ids for each sentence
        num_words = []
        
        # i refers to the sentence number 
        for i, id_sequence in enumerate(input_ids):
            num_words.append(int(torch.sum(id_sequence != 0)))
            
            
            for j in range(num_words[i]):
                for k in range(num_words[i]):
                    if j != k:
                        id_j, id_k = id_sequence[j].item(), id_sequence[k].item()
                        
                        if id_j in self.cooc and id_k in self.cooc:
                            adj_tensor[i, j, k] = self.cooc[id_j][id_k]
                        else:
                            adj_tensor[i, j, k] = 0
            
            
        # Calculate the sums of rows for each matrix
        row_sums = adj_tensor.sum(dim=2, keepdim=True).repeat(1, 1, max_len)

        # Calculate the sums of columns for each matrix
        column_sums = adj_tensor.sum(dim=1, keepdim=True).repeat(1, max_len, 1)

        # Create a diagonal mask for each matrix
        diagonal_mask = torch.eye(adj_tensor.size(-1)).bool().unsqueeze(0).repeat(batch_size, 1, 1)

        total_sum = row_sums + column_sums

        # Set the diagonal entries to the sum of all the row and column entries (will be averaged later)
        res = torch.where(diagonal_mask, total_sum, adj_tensor)
        
        adj_tensor = adj_tensor + res
        
        # Average 
        
        for i, num in enumerate(num_words):
            # Divide diagonal elements by 2
            diagonal = torch.diagonal(adj_tensor[i])
            diagonal_divided = diagonal / num

            # Assign divided diagonal elements back to the tensor
            adj_tensor[i].diagonal().copy_(diagonal_divided)

        return adj_tensor

    def get_avarage(self, aspect_indices, x):
        aspect_indices_us = torch.unsqueeze(aspect_indices, 2)
        x_mask = x * aspect_indices_us
        aspect_len = (aspect_indices_us != 0).sum(dim=1)
        x_sum = x_mask.sum(dim=1)
        x_av = torch.div(x_sum, aspect_len)

        return x_av
    
    def set_dropout(self, dropout):
        self.dropout = nn.Dropout(dropout)

        
    def forward(self, input_ids, segment_ids, valid_ids, mem_valid_ids, dep_adj_matrix, dep_value_matrix, dep_adj_matrix_knogcn,dep_value_matrix_knogcn):
        # Generate sentence representation with BERT
        sequence_output, pooled_output = self.bert(input_ids, segment_ids)
        
        # Dependency type embeddings
        dep_embed = self.dep_embedding(dep_value_matrix)
        dep_embed_knogcn = self.dep_embedding(dep_value_matrix_knogcn)
        
        
        # Initializing valid output tensor (i.e. 0 for padding, only keeping representations of tokens in sentence)
        batch_size, max_len, feat_dim = sequence_output.shape
        valid_output = torch.zeros(batch_size, max_len, feat_dim, device=input_ids.device).type_as(sequence_output)
        
        for i in range(batch_size):
            temp = sequence_output[i][valid_ids[i] == 1]
            valid_output[i][:temp.size(0)] = temp
        valid_output = self.dropout(valid_output)

        attention_score_for_output = [] # Useless code?
        attention_score_knogcn_for_output=[]
        tgcn_layer_outputs = []
        semgcn_layer_outputs = []
        lexgcn_layer_outputs = []
        knogcn_layer_outputs = []
        
        
        seq_out_tgcn = valid_output
        seq_out_semgcn = valid_output
        seq_out_lexgcn = valid_output
        seq_out_knogcn = valid_output
        
        
        if self.use_tgcn:
            for tgcn in self.TGCNLayers:
                # Computing attention
                attention_score = self.get_attention(seq_out_tgcn, dep_embed, dep_adj_matrix)
                
                attention_score_for_output.append(attention_score) # Useless code?

                # Applying GCN layer
                seq_out = F.relu(tgcn(seq_out_tgcn, attention_score, dep_embed))

                # Saving layer output to be used for layer ensemble later
                tgcn_layer_outputs.append(seq_out_tgcn)
                
            # Average aspect terms for each layer and combining into list 
            tgcn_layer_outputs_pool = [self.get_avarage(mem_valid_ids, x_out) for x_out in tgcn_layer_outputs]

        if self.use_semgcn:
            for semgcn in self.SemGCNLayers:
                # Computing attention
                attn = MultiHeadAttention(1, feat_dim)
                attn.to('cuda')
                attn_tensor = attn(seq_out_semgcn, seq_out_semgcn)
                attn_tensor = attn_tensor.squeeze(1)

                # Applying GCN layer
                seq_out_semgcn = F.relu(semgcn(seq_out_semgcn, attn_tensor))

                # Saving layer output
                semgcn_layer_outputs.append(seq_out_semgcn)
                
            # Average aspect terms for each layer and combining into list
            semgcn_layer_outputs_pool = [self.get_avarage(mem_valid_ids, x_out) for x_out in semgcn_layer_outputs]

        
        if self.use_lexgcn:
            for lexgcn in self.LexGCNLayers:
                # Compute adjaceny matrix
                adj_tensor = self.get_lex_adj(input_ids, batch_size, max_len)
                adj_tensor = adj_tensor.to('cuda')
                # Applying GCN layer
                seq_out_lexgcn = F.relu(lexgcn(seq_out_lexgcn, adj_tensor))
                
                # Saving layer output
                lexgcn_layer_outputs.append(seq_out_lexgcn)
            
            # Average aspect terms for each layer and combining into list
            lexgcn_layer_outputs_pool = [self.get_avarage(mem_valid_ids, x_out) for x_out in lexgcn_layer_outputs]
        
        if self.use_knogcn:
            for knogcn in self.KnoGCNLayers:
                # Computing attention
                attention_score_knogcn = self.get_attention(seq_out_knogcn, dep_embed_knogcn, dep_adj_matrix_knogcn)
                attention_score_knogcn_for_output.append(attention_score_knogcn) # Useless code?

                # Applying GCN layer
                seq_out_knogcn = F.relu(knogcn(seq_out_knogcn, attention_score_knogcn, dep_embed_knogcn))

                # Saving layer output to be used for layer ensemble later
                knogcn_layer_outputs.append(seq_out_knogcn)
                
            # Average aspect terms for each layer and combining into list 
            knogcn_layer_outputs_pool = [self.get_avarage(mem_valid_ids, x_out) for x_out in knogcn_layer_outputs]
            
        all_outputs = []
        
        if self.use_ensemble:
            if self.use_tgcn:
                # Layer ensemble for tgcn
                tgcn_pool = torch.stack(tgcn_layer_outputs_pool, -1) # stacking layer outputs 
                ensemble_tgcn = torch.matmul(tgcn_pool, F.softmax(self.ensemble_linear_tgcn.weight, dim=0))
                ensemble_tgcn = ensemble_tgcn.squeeze(dim=-1)
                ensemble_tgcn = self.dropout(ensemble_tgcn)
                all_outputs.append(ensemble_tgcn)
            
            if self.use_semgcn:
                # Layer ensemble for semgcn
                semgcn_pool = torch.stack(semgcn_layer_outputs_pool, -1)
                ensemble_semgcn = torch.matmul(semgcn_pool, F.softmax(self.ensemble_linear_semgcn.weight, dim = 0))
                ensemble_semgcn = ensemble_semgcn.squeeze(dim=-1)
                ensemble_semgcn = self.dropout(ensemble_semgcn)
                all_outputs.append(ensemble_semgcn)
            
            if self.use_lexgcn:
            # Layer ensemble for lexgcn
                lexgcn_pool = torch.stack(lexgcn_layer_outputs_pool, -1)
                ensemble_lexgcn = torch.matmul(lexgcn_pool, F.softmax(self.ensemble_linear_lexgcn.weight, dim = 0))
                ensemble_lexgcn = ensemble_lexgcn.squeeze(dim=-1)
                ensemble_lexgcn = self.dropout(ensemble_lexgcn)
                all_outputs.append(ensemble_lexgcn)
            
            if self.use_knogcn:
            # Layer ensemble for knogcn
                knogcn_pool = torch.stack(knogcn_layer_outputs_pool, -1)
                ensemble_knogcn = torch.matmul(knogcn_pool, F.softmax(self.ensemble_linear_knogcn.weight, dim = 0))
                ensemble_knogcn = ensemble_knogcn.squeeze(dim=-1)
                ensemble_knogcn = self.dropout(ensemble_knogcn)
                all_outputs.append(ensemble_knogcn)
            
        
        else:
            # Take only the last layer output
            if self.use_tgcn:
                ensemble_tgcn = tgcn_layer_outputs_pool[-1]
                all_outputs.append(ensemble_tgcn)
            if self.use_semgcn:
                ensemble_semgcn = semgcn_layer_outputs_pool[-1]
                all_outputs.append(ensemble_semgcn)
            if self.use_lexgcn:
                ensemble_lexgcn = lexgcn_layer_outputs_pool[-1]
                all_outputs.append(ensemble_lexgcn)
            if self.use_knogcn:
                ensemble_knogcn = knogcn_layer_outputs_pool[-1]
                all_outputs.append(ensemble_knogcn)
            
        # Stacking module outputs
        ensemble_out = torch.cat(all_outputs, dim=1)
        
        
        # gating only if 2 modules used
        # added additional combinations of modules used
        if self.fusion_type == 'gate' and self.num_modules == 2: 
            if self.use_tgcn and self.use_semgcn:
                concatenated = torch.cat((ensemble_tgcn, ensemble_semgcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_tgcn + (1 - g) * ensemble_semgcn
            if self.use_tgcn and self.use_lexgcn:
                concatenated = torch.cat((ensemble_tgcn, ensemble_lexgcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_tgcn + (1 - g) * ensemble_lexgcn
            if self.use_tgcn and self.use_knogcn:
                concatenated = torch.cat((ensemble_tgcn, ensemble_knogcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_tgcn + (1 - g) * ensemble_knogcn
            if self.use_tgcn and self.use_knogcn:
                concatenated = torch.cat((ensemble_tgcn, ensemble_knogcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_tgcn + (1 - g) * ensemble_knogcn
            if self.use_lexgcn and self.use_semgcn:
                concatenated = torch.cat((ensemble_semgcn, ensemble_lexgcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_semgcn + (1 - g) * ensemble_lexgcn
            if self.use_knogcn and self.use_semgcn:
                concatenated = torch.cat((ensemble_semgcn, ensemble_knogcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_semgcn + (1 - g) * ensemble_knogcn
            if self.use_knogcn and self.use_lexgcn:
                concatenated = torch.cat((ensemble_lexgcn, ensemble_knogcn), dim=1) 
                g = torch.matmul(concatenated, self.gate_weight.t()) + self.gate_bias  # Compute W_g[h0 ; h1] + b_g
                g = torch.sigmoid(g)
                ensemble_out = g * ensemble_lexgcn + (1 - g) * ensemble_knogcn
          
        # Additional dropout
        if (self.num_modules == 2 and self.fusion_type == 'concat') or self.num_modules == 4: #number of modules is changed
            ensemble_out = self.concat_dropout(ensemble_out)
            
        output = self.fc_single(ensemble_out)
        
        return output
    

In [15]:
class AsaTgcn(BertPreTrainedModel):
    
    def __init__(self, config, dropout = 0.2):
        super(AsaTgcn, self).__init__(config)
        self.config = config
        self.layer_number_tgcn = 3
        self.num_labels = config.num_labels
        self.num_types = config.num_types

        self.bert = BertModel(config)
        self.TGCNLayers = nn.ModuleList(([TypeGraphConvolution(config.hidden_size, config.hidden_size, config.hidden_size)
                                         for _ in range(self.layer_number_tgcn)]))
        self.fc_single = nn.Linear(config.hidden_size, self.num_labels)
        self.dropout = nn.Dropout(dropout)
        self.ensemble_linear_tgcn = nn.Linear(1, self.layer_number_tgcn)
        self.ensemble = nn.Parameter(torch.FloatTensor(3, 1))
        self.dep_embedding = nn.Embedding(self.num_types, config.hidden_size, padding_idx=0)

    def get_attention(self, val_out, dep_embed, adj):
        batch_size, max_len, feat_dim = val_out.shape
        val_us = val_out.unsqueeze(dim=2)
        val_us = val_us.repeat(1,1,max_len,1)
        val_cat = torch.cat((val_us, dep_embed), -1).float()
        atten_expand = (val_cat * val_cat.transpose(1,2))

        attention_score = torch.sum(atten_expand, dim=-1)
        attention_score = attention_score / np.power(feat_dim, 0.5)
        exp_attention_score = torch.exp(attention_score)
        exp_attention_score = torch.mul(exp_attention_score, adj.float()) # mask
        sum_attention_score = torch.sum(exp_attention_score, dim=-1).unsqueeze(dim=-1).repeat(1,1,max_len)

        attention_score = torch.div(exp_attention_score, sum_attention_score + 1e-10)
        if 'HalfTensor' in val_out.type():
            attention_score = attention_score.half()

        return attention_score

    def get_avarage(self, aspect_indices, x):
        aspect_indices_us = torch.unsqueeze(aspect_indices, 2)
        x_mask = x * aspect_indices_us
        aspect_len = (aspect_indices_us != 0).sum(dim=1)
        x_sum = x_mask.sum(dim=1)
        x_av = torch.div(x_sum, aspect_len)
        return x_av
    
    def set_dropout(self, dropout):
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, segment_ids, valid_ids, mem_valid_ids, dep_adj_matrix, dep_value_matrix):
        # Generate sentence representation with BERT
        sequence_output, pooled_output = self.bert(input_ids, segment_ids)
        
        # Dependency type embeddings
        dep_embed = self.dep_embedding(dep_value_matrix)
        
        # Initializing valid output tensor (i.e. 0 for padding, only keeping representations of tokens in sentence)
        batch_size, max_len, feat_dim = sequence_output.shape
        valid_output = torch.zeros(batch_size, max_len, feat_dim, device=input_ids.device).type_as(sequence_output)
        for i in range(batch_size):
            temp = sequence_output[i][valid_ids[i] == 1]
            valid_output[i][:temp.size(0)] = temp
        valid_output = self.dropout(valid_output)

        attention_score_for_output = [] 
        tgcn_layer_outputs = []
        semgcn_layer_outputs = []
        seq_out_tgcn = valid_output
        seq_out_semgcn = valid_output
        for tgcn in self.TGCNLayers:
            # Computing attention
            attention_score = self.get_attention(seq_out_tgcn, dep_embed, dep_adj_matrix)
            attention_score_for_output.append(attention_score) # Useless code?
            
            # Applying GCN layer
            seq_out = F.relu(tgcn(seq_out_tgcn, attention_score, dep_embed))
            
            # Saving layer output to be used for layer ensemble later
            tgcn_layer_outputs.append(seq_out_tgcn)
        
        # Average aspect terms for each layer and combining into list
        tgcn_layer_outputs_pool = [self.get_avarage(mem_valid_ids, x_out) for x_out in tgcn_layer_outputs]
        
        # Layer ensemble for tgcn
        tgcn_pool = torch.stack(tgcn_layer_outputs_pool, -1) # stacking layer outputs 
        ensemble_tgcn = torch.matmul(tgcn_pool, F.softmax(self.ensemble_linear_tgcn.weight, dim=0))
        ensemble_tgcn = ensemble_tgcn.squeeze(dim=-1)
        ensemble_tgcn = self.dropout(ensemble_tgcn)
        
        output = self.fc_single(ensemble_tgcn)

        return output

# Main code

In [16]:
import logging
import argparse
import math
import os
import sys
from time import strftime, localtime
import random
import numpy as np
import subprocess

from pytorch_transformers import BertModel, BertConfig
# from data_utils import Tokenizer4Bert, ABSADataset
# from asa_tgcn_model import AsaTgcn

# !pip install scikit-learn
from sklearn import metrics
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split


CONFIG_NAME = 'config.json'
WEIGHTS_NAME = 'pytorch_model.bin'

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

In [17]:
class Instructor:
    def __init__(self, opt):
        self.opt = opt
        logger.info(opt)
        deptype2id = ABSADataset.load_deptype_map(opt)
        polarity2id = ABSADataset.get_polarity2id()
        logger.info(deptype2id)
        logger.info(polarity2id)
        self.deptype2id = deptype2id
        self.polarity2id = polarity2id
        
        self.vocab_path = os.path.join(opt.bert_model, 'vocab.txt')
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.bert_model)
        config = BertConfig.from_pretrained("bert-large-uncased") 
        config.num_labels=opt.polarities_dim
        config.num_types=len(self.deptype2id)
        logger.info(config)
        print('check the type of the model...')
        if opt.model_type == 'tgcn': # WHAT IS MY NAME OF THE MODEL
            self.model = AsaTgcn.from_pretrained(opt.bert_model, config=config, dropout = opt.dropout)
        else:
            self.model = AsaTgcnSem.from_pretrained(opt.bert_model, config=config, modules = opt.modules,
                                                    tokenizer = self.tokenizer, opt=self.opt) 
#                                                 use_ensemble = opt.use_ensemble,
#                                                     fusion_type = opt.fusion_type, dropout = opt.dropout, 
#                                                     concat_dropout = opt.concat_dropout,
#                                                    cooc_path = opt.cooc_path, cooc = opt.cooc)
        self.model.set_dropout(opt.dropout)
        self.model.to(opt.device)
        
        print('downloading the files...')
        self.fulltrainset = ABSADataset(opt.train_file, self.tokenizer, self.opt, deptype2id=deptype2id)
        self.trainset = ABSADataset(opt.train_file, self.tokenizer, self.opt, deptype2id=deptype2id)
        self.testset = ABSADataset(opt.test_file, self.tokenizer, self.opt, deptype2id=deptype2id)
        
        print('check if the val exist...')
        if os.path.exists(opt.val_file):
            self.valset = ABSADataset(opt.val_file, self.tokenizer, self.opt, deptype2id=deptype2id)
        elif opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(self.trainset, (len(self.trainset)-valset_len, valset_len))
        else:
            self.valset = self.testset
        
        print("check device opt.device.type == cuda")
        print("opt.device.type == cuda")
        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))

    def _print_args(self):
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in self.model.parameters():
            n_params = torch.prod(torch.tensor(p.shape))
            if p.requires_grad:
                n_trainable_params += n_params
            else:
                n_nontrainable_params += n_params
        logger.info('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))
        logger.info('> training arguments:')
        for arg in vars(self.opt):
            logger.info('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))

    def _reset_params(self):
        for child in self.model.children():
            if type(child) != BertModel:  # skip bert params
                for p in child.parameters():
                    if p.requires_grad:
                        if len(p.shape) > 1:
                            torch.nn.init.xavier_uniform_(p)
                        else:
                            stdv = 1. / math.sqrt(p.shape[0])
                            torch.nn.init.uniform_(p, a=-stdv, b=stdv)

    def save_model(self, save_path, model, args):
        print('function save_model starts...')
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_path, WEIGHTS_NAME)
        output_config_file = os.path.join(save_path, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)

        config = model_to_save.config
        config.__dict__["deptype2id"] = self.deptype2id
        config.__dict__["polarity2id"] = self.polarity2id
        with open(output_config_file, "w", encoding='utf-8') as writer:
            writer.write(config.to_json_string())
        output_args_file = os.path.join(save_path, 'training_args.bin')
        torch.save(args, output_args_file)
        subprocess.run(['cp', self.vocab_path, os.path.join(save_path, 'vocab.txt')])

    def _train(self, criterion, optimizer, train_data_loader, val_data_loader, test_data_loader):
        print('function _train starts...')
        max_val_acc = -1
        max_val_f1 = -1
        global_step = 0
        path = None

        model_home = self.opt.model_path 
#         model_home += '-' + strftime("%y%m%d-%H%M", localtime())

        results = {"bert_model": self.opt.bert_model, "batch_size": self.opt.batch_size,
                   "learning_rate": self.opt.learning_rate, "seed": self.opt.seed,
                  "num_epoch": self.opt.num_epoch, "l2reg": self.opt.l2reg,
                  "dropout": self.opt.dropout}
        for epoch in range(self.opt.num_epoch):
            logger.info('>' * 100)
            logger.info('epoch: {}'.format(epoch))
            n_correct, n_total, loss_total = 0, 0, 0
            
            self.model.train() 
            
            for i_batch, t_sample_batched in enumerate(train_data_loader):

                global_step += 1
                optimizer.zero_grad()

                n_correct, n_total, loss_total = self.train_step(optimizer, i_batch, t_sample_batched, criterion, n_correct, n_total, loss_total)
                if global_step % self.opt.log_step == 0:
                    train_acc = n_correct / n_total
                    train_loss = loss_total / n_total
                    logger.info('epoch: {}, loss: {:.4f}, train acc: {:.4f}'.format(epoch, train_loss, train_acc))

                gc.collect()
                torch.cuda.empty_cache() # try optimization
                
            
            #OLD LINES
            val_acc, val_f1 = Instructor._evaluate_acc_f1(self.model, val_data_loader, device=self.opt.device)
            logger.info('>epoch: {}, val_acc: {:.4f}, val_f1: {:.4f}'.format(epoch, val_acc, val_f1))
            results["{}_val_acc".format(epoch)] = val_acc
            results["{}_val_f1".format(epoch)] = val_f1
            saving_path = os.path.join(model_home, "epoch_{}".format(epoch))
            
            if not os.path.exists(saving_path):
                os.makedirs(saving_path)
            if val_acc > max_val_acc or (val_acc == max_val_acc and val_f1 > max_val_f1):
                max_val_acc = val_acc
                max_val_f1 = val_f1
                
                if opt.save_models == 'last':
                    best_path = saving_path
                    best_model = self.model
                elif opt.save_models == 'all':
                    self.save_model(saving_path, self.model, self.opt)
                elif opt.save_models == 'none':
                    pass 

#                 self.model.eval() #old part we don't need that because inside _evaluate_acc_f1 we have model.eval()
                saving_path = os.path.join(model_home, "epoch_{}_eval.txt".format(epoch))
                test_acc, test_f1 = self._evaluate_acc_f1(self.model, test_data_loader, device=self.opt.device,
                                                          saving_path=saving_path)
                logger.info('>> epoch: {}, test_acc: {:.4f}, test_f1: {:.4f}'.format(epoch, test_acc, test_f1))

                results["max_val_acc"] = max_val_acc
                results["test_acc"] = test_acc
                results["test_f1"] = test_f1
            
            output_eval_file = os.path.join(model_home, "eval_results.txt")
            
            with open(output_eval_file, "w") as writer:
                for k,v in results.items():
                    writer.write("{}={}\n".format(k,v))
        
        acc_file = os.path.join(model_home, "acc-{:.4f}".format(test_acc))
        
        if opt.save_models == 'last':
            self.save_model(best_path, best_model, self.opt)
        
        with open(acc_file, 'w') as f:
            f.write(f"accuracy: {test_acc}")
        return max_val_acc, test_acc, test_f1
    
    def train_step(self, optimizer, i_batch, t_sample_batched, criterion, n_correct, n_total, loss_total):
        # t_sample_batched["raw_text"],
        outputs = self.model(t_sample_batched["input_ids"].to(self.opt.device),
                             t_sample_batched["segment_ids"].to(self.opt.device),
                             t_sample_batched["valid_ids"].to(self.opt.device),
                             t_sample_batched["mem_valid_ids"].to(self.opt.device),
                             t_sample_batched["dep_adj_matrix"].to(self.opt.device),
                             t_sample_batched["dep_value_matrix"].to(self.opt.device),
                             t_sample_batched["dep_adj_matrix_knogcn"].to(self.opt.device),
                             t_sample_batched["dep_value_matrix_knogcn"].to(self.opt.device))
        targets = t_sample_batched['polarity'].to(self.opt.device)

        loss = criterion(outputs, targets)
        loss.backward()

        optimizer.step()

        n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
        n_total += len(outputs)
        loss_total += loss.item() * len(outputs)

        return n_correct, n_total, loss_total

    @staticmethod
    def _evaluate_acc_f1(model, data_loader, device, saving_path=None):
        model.eval()
        
        n_correct, n_total = 0, 0
        t_targets_all, t_outputs_all = None, None
        
        #model.eval() #the old place

        saving_path_f = open(saving_path, 'w') if saving_path is not None else None

        with torch.no_grad():
            for t_batch, t_sample_batched in enumerate(data_loader):
                t_targets = t_sample_batched['polarity'].to(device)
                t_raw_texts = t_sample_batched['raw_text']
                t_aspects = t_sample_batched['aspect']

                t_outputs = model(t_sample_batched["input_ids"].to(device),
                                  t_sample_batched["segment_ids"].to(device),
                                  t_sample_batched["valid_ids"].to(device),
                                  t_sample_batched["mem_valid_ids"].to(device),
                                  t_sample_batched["dep_adj_matrix"].to(device),
                                  t_sample_batched["dep_value_matrix"].to(device),
                                  t_sample_batched["dep_adj_matrix_knogcn"].to(device),
                                  t_sample_batched["dep_value_matrix_knogcn"].to(device))
                
                n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
                n_total += len(t_outputs)

                if t_targets_all is None:
                    t_targets_all = t_targets
                    t_outputs_all = t_outputs
                else:
                    t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
                    t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)

                if saving_path_f is not None:
                    for t_target, t_output, t_raw_text, t_aspect in zip(t_targets.detach().cpu().numpy(),
                                                                        torch.argmax(t_outputs, -1).detach().cpu().numpy(),
                                                                        t_raw_texts, t_aspects):
                        saving_path_f.write("{}\t{}\t{}\t{}\n".format(t_target, t_output, t_raw_text, t_aspect))
        acc = n_correct / n_total
        f1 = metrics.f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average='macro', zero_division=0)
        return acc, f1

    def train(self):
        # Loss and Optimizer
        criterion = nn.CrossEntropyLoss()
        _params = filter(lambda p: p.requires_grad, self.model.parameters())
        optimizer = torch.optim.Adam(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg)

        train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True)
        test_data_loader = DataLoader(dataset=self.testset, batch_size=self.opt.batch_size, shuffle=True) 
        val_data_loader = DataLoader(dataset=self.valset, batch_size=self.opt.batch_size, shuffle=True) 
        full_train_data_loader = DataLoader(dataset = self.fulltrainset, batch_size = self.opt.batch_size, shuffle=True)

        self._reset_params()
        max_val_acc, test_acc, test_f1 = self._train(criterion, optimizer, train_data_loader, val_data_loader, test_data_loader)
        return max_val_acc, test_acc, test_f1
    
   

In [19]:
def test(opt):
    logger.info(opt)
    config = BertConfig.from_json_file(os.path.join(opt.model_path, CONFIG_NAME))
    logger.info(config)

    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.model_path)
    if opt.model_type == 'tgcn':
        model = AsaTgcn.from_pretrained(opt.model_path)
    elif opt.model_type == 'tgcn+sem':
        model = AsaTgcnSem.from_pretrained(opt.model_path)
    model.set_dropout(opt.dropout)
    model.to(opt.device)

    deptype2id = config.deptype2id
    logger.info(deptype2id)
    testset = ABSADataset(opt.test_file, tokenizer, opt, deptype2id=deptype2id)
    test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
    test_acc, test_f1 = Instructor._evaluate_acc_f1(model, test_data_loader, device=opt.device)
    logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format(test_acc, test_f1))

In [20]:
def get_args(model_type = 'tgcn', # tgcn, tgcn+sem, tri_gcn
             # Select which modules to use for hybrid model
             tgcn = True,
             semgcn = True, 
             lexgcn = True,
             knogcn = True,
             tgcn_layers = 2,
             semgcn_layers = 2,
             lexgcn_layers = 2,
             knogcn_layers = 2,
             path = None, 
             year='2015',
             val_file='val.txt',
             log = 'log',
             bert_model='bert-large-uncased', # (change underscore to the dash)
             #bert_model='bert_large_uncased',
             cooc_path = 'cooc_matrix_final2.csv', # Path to co-occurrence matrix file
             cooc = None, # Pandas DataFrame co-occurrence matrix. If not specified, it will be loaded from cooc_path
             onto_words=None,
             onto_words_path='test_ontology_keys.csv',
             learning_rate=2e-5,
             dropout=0.2,
             concat_dropout = 0.4,
             bert_dropout=0.2,
             l2reg=0.01,
             num_epoch=50,
             batch_size=6, 
             log_step=100,
             max_seq_len=100,
             polarities_dim=3,
             device='cuda',
             seed=50,
             valset_ratio=0.2, # the percentage fo the validation set
             do_train=True,
             do_eval=True,
             eval_epoch_num=0,
             fusion_type = 'concat', # 'concat' or 'gate'
             use_ensemble = True, 
            save_models='last',
            print_sentences = False, #changed to check the results
             optim = 'adam'
            ):
    
    assert model_type == 'tgcn' or model_type == 'tgcn+sem' or model_type == 'tri_gcn'
    
    opt = argparse.Namespace()
    opt.model_type = model_type
    opt.modules = {'tgcn': tgcn, 'semgcn': semgcn, 'lexgcn': lexgcn, 'knogcn': knogcn}
    opt.num_layers = {'tgcn': tgcn_layers, 'semgcn': semgcn_layers, 'lexgcn': lexgcn_layers, 'knogcn': knogcn_layers}
    
    opt.year = year
    
    fusion = "" if model_type == 'tgcn' else "+" + fusion_type
    opt.train_file = f'data/train{year}restaurant.txt'
    opt.test_file = f'data/test{year}restaurant.txt'
    opt.model_path = f'test_models/{year}{model_type}{fusion}_seed{seed}_reg{l2reg}_drop{dropout}_cdrop{concat_dropout}_lr{learning_rate}_tgcn{tgcn}_semgcn{semgcn}_lexgcn{lexgcn}_knogcn{knogcn}_epochs{num_epoch}_{optim.lower()}'
#     if model_type == 'tgcn':
#         opt.model_path = f'models/rest_{year}/BERT.L_seed{seed}_reg{l2reg}_drop{dropout}_lr{learning_rate}_epochs{num_epoch}' 
#     elif model_type == 'tgcn+sem':
#         opt.model_path = f'models/rest_{year}/{model_type}/{model_type}_seed{seed}_reg{l2reg}_drop{dropout}_lr{learning_rate}_epochs{num_epoch}'
    if do_eval and not do_train:
        opt.model_path += f'/epoch_{eval_epoch_num}'
    if path:
        opt.model_path = path
    opt.val_file = val_file
    opt.log = log
    opt.bert_model = bert_model
    opt.cooc_path = cooc_path
    opt.cooc = cooc
    opt.onto_words=onto_words
    opt.onto_words_path=onto_words_path
    opt.learning_rate = learning_rate
    opt.dropout = dropout
    opt.concat_dropout = concat_dropout
    opt.bert_dropout = bert_dropout
    opt.l2reg = l2reg
    opt.num_epoch = num_epoch
    opt.batch_size = batch_size
    opt.log_step = log_step
    opt.max_seq_len = max_seq_len
    opt.polarities_dim = polarities_dim
    opt.device = device
    opt.seed = seed
    opt.valset_ratio = valset_ratio
    opt.do_train = do_train
    opt.do_eval = do_eval
    opt.eval_epoch_num = eval_epoch_num
    opt.fusion_type = fusion_type
    opt.use_ensemble = True
    opt.save_models = save_models
    opt.print_sent = print_sentences
    opt.optim = optim
    return opt

In [21]:
def set_seed(opt):
    if opt.seed is not None:
        random.seed(opt.seed)
        np.random.seed(opt.seed)
        torch.manual_seed(opt.seed)
        torch.cuda.manual_seed(opt.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [22]:
def main(opt):
    opt = opt
    set_seed(opt)

    opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') \
        if opt.device is None else torch.device(opt.device)
    opt.n_gpu = torch.cuda.device_count()

    if not os.path.exists(opt.log):
        os.makedirs(opt.log)

    log_file = '{}/log-{}.log'.format(opt.log, strftime("%y%m%d-%H%M", localtime()))
    logger.addHandler(logging.FileHandler(log_file))
    
    print('strat of the do train...')
    if opt.do_train:
        ins = Instructor(opt)
        max_val_acc, test_acc, test_f1 = ins.train()
    elif opt.do_eval:
        test(opt)
    
    return max_val_acc, test_acc, test_f1

# Run program

In [33]:
#FULL COOC MATRIX THAT NEEDED TO BE USED IN THE CODE
cooc = pd.read_csv('cooc_matrix_final2.csv',index_col=0)

In [35]:
onto_words =pd.read_csv('test_ontology_keys.csv', sep=';')

In [36]:
#NEW CODE TO AVOID SEVERAL PARAMETERS
opt = get_args(batch_size = 6 # CHANGED BECAUSE OF THE MEMORY ERROR
                  ,num_epoch = 12, model_type = 'tri_gcn', save_models = 'none', fusion_type = 'concat',
                   tgcn = True, semgcn = True, lexgcn = False, knogcn=False, # changed from True
                   use_ensemble = False,
                  tgcn_layers = 2, semgcn_layers = 2, lexgcn_layers = 2, knogcn_layers = 2, optim = 'adam')
deptype2id = ABSADataset.load_deptype_map(opt)
tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.bert_model)

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\bromi\.cache\torch\pytorch_transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [37]:
for i in range(onto_words.shape[0]):
    for j in range(onto_words.shape[1]):
        word=(onto_words.iloc[i][j])
        if type(word)==str:
            tokenized_word=tokenizer.tokenizer.convert_tokens_to_ids(tokenizer.tokenizer.tokenize(word))[0]
            onto_words.iloc[i][j]=tokenized_word
        if type(word)==float:
            onto_words.iloc[i][j]=-1 #word

In [38]:
onto_words = [item for sublist in onto_words.values.tolist() for item in sublist]

In [39]:
onto_words=list(dict.fromkeys(onto_words))

Hyperparameter searching

In [40]:
import csv

In [41]:
# csv_file = 'final_results.csv'

# res = {'2015': [], '2016': []}
# torch.cuda.empty_cache() # try optimization

# for i in range(20):
#     print('{} loop'.format(i))
    
#     lr = np.random.choice(np.logspace(-6, -3))
#     d = np.random.choice([0.1, 0.2, 0.4])
#     cdrop = np.random.choice([0.1, 0.2, 0.4])
#     w_decay = np.random.choice(np.logspace(-5, -3))
#     seed = np.random.randint(1000)
    
#     year = '2016'
#     opt = get_args(batch_size = 6 #CHANGED FROM 16 BECAUSE OF THE MEMORY ERROR
#                    , seed = seed, dropout = d,
#                   l2reg = w_decay, learning_rate = lr, year = year,
#                   num_epoch = 15, model_type = 'tri_gcn', save_models = 'none', fusion_type = 'concat',
#                   concat_dropout = cdrop, cooc = cooc, onto_words = onto_words,
#                     tgcn = True, semgcn = True, lexgcn = True, knogcn = True, use_ensemble = False,
#                   tgcn_layers = 2, semgcn_layers = 2, lexgcn_layers = 2, knogcn_layers = 2, optim = 'adam')
    
#     opt.device = torch.device('cuda')
    
#     max_val_acc, test_acc, test_f1 = main(opt)
    
#     res[year].append((opt, max_val_acc, test_acc, test_f1))
    
#     try:
#         with open(csv_file, 'a', newline='') as file:
#                 writer = csv.writer(file)
#                 writer.writerow((year, max_val_acc, test_acc, test_f1, seed, lr, d, cdrop, w_decay))
#     finally:
#         print('FINISH')

Training with best hyperparameters

In [43]:
import numpy as np

In [None]:
num_trials = 1

# lr_space = np.linspace(5e-6, 2e-5, num = 50)
# cd_space = np.linspace(0, 0.6, 14)
# d_space = np.linspace(0, 0.4, 8)
# reg_space = np.logspace(-2.5, -1.2, num=100)

for i in range(num_trials):
    
    print('{} loop'.format(i))
    
#     c_d = np.random.choice(cd_space)
#     d = np.random.choice(d_space)
#     reg = np.random.choice(reg_space)
#     lr = np.random.choice(lr_space)
#     seed = np.random.randint(1000) 

    c_d=0.2285714285714286
    d=0.2285714285714286
    reg=0.027059715881067578
    lr=1.1122448979591838e-05
    seed=65
    
    fusion_type = 'concat'
    opt = get_args(batch_size = 4 
                   , seed = seed, dropout = d,
              l2reg = reg, learning_rate = lr, year = '2015',
              num_epoch = 15, model_type = 'tri_gcn', save_models = 'none', fusion_type = fusion_type, use_ensemble=False,
                   cooc=cooc, onto_words=onto_words,
              concat_dropout = c_d, tgcn = True, semgcn = True, lexgcn = True, knogcn = True)
    main(opt)


0 loop
strat of the do train...
Namespace(batch_size=4, bert_dropout=0.2, bert_model='bert-large-uncased', concat_dropout=0.2285714285714286, cooc=          13325          2013      3025      8466          2023      2109  \
13325  0.000000  8.880995e-04  0.021739  0.200000  0.000000e+00  0.000000   
2013   0.000888  2.488321e-12  0.000097  0.000355  2.238906e-05  0.000013   
3025   0.021739  9.653255e-05  0.000000  0.008696  3.425283e-05  0.000308   
8466   0.200000  3.552398e-04  0.008696  0.000000  2.100840e-04  0.002837   
2023   0.000000  2.238906e-05  0.000034  0.000210  2.282724e-12  0.000030   
...         ...           ...       ...       ...           ...       ...   
7108   0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00  0.000000   
28297  0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00  0.000000   
29425  0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00  0.000000   
19337  0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00  0.000000   
28946 

check the type of the model...
loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin from cache at C:\Users\bromi\.cache\torch\pytorch_transformers\54da47087cc86ce75324e4dc9bbb5f66c6e83a7c6bd23baea8b489acc8d09aa4.4d5343a4b979c4beeaadef17a0453d1bb183dd9b084f58b84c7cc781df343ae6
Weights of AsaTgcnSem not initialized from pretrained model: ['gate_weight', 'gate_bias', 'ensemble', 'TGCNLayers.0.weight', 'TGCNLayers.0.bias', 'TGCNLayers.0.dense.weight', 'TGCNLayers.1.weight', 'TGCNLayers.1.bias', 'TGCNLayers.1.dense.weight', 'TGCNLayers.2.weight', 'TGCNLayers.2.bias', 'TGCNLayers.2.dense.weight', 'SemGCNLayers.0.weight', 'SemGCNLayers.0.bias', 'SemGCNLayers.1.weight', 'SemGCNLayers.1.bias', 'LexGCNLayers.0.weight', 'LexGCNLayers.0.bias', 'LexGCNLayers.1.weight', 'LexGCNLayers.1.bias', 'KnoGCNLayers.0.weight', 'KnoGCNLayers.0.bias', 'KnoGCNLayers.0.dense.weight', 'KnoGCNLayers.1.weight', 'KnoGCNLayers.1.bias', 'KnoGCNLayers.1.dense.weig

function _train starts...
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 0
epoch: 0, loss: 0.8070, train acc: 0.6950
epoch: 0, loss: 0.7198, train acc: 0.7262
>epoch: 0, val_acc: 0.7490, val_f1: 0.2966
>> epoch: 0, test_acc: 0.6064, test_f1: 0.2972
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 1
epoch: 1, loss: 0.4597, train acc: 0.8295
epoch: 1, loss: 0.4410, train acc: 0.8472
epoch: 1, loss: 0.4123, train acc: 0.8545
>epoch: 1, val_acc: 0.8627, val_f1: 0.5603
>> epoch: 1, test_acc: 0.8208, test_f1: 0.5758
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 2
epoch: 2, loss: 0.2850, train acc: 0.8977
epoch: 2, loss: 0.2471, train acc: 0.9136
>epoch: 2, val_acc: 0.9059, val_f1: 0.5917
>> epoch: 2, test_acc: 0.7990, test_f1: 0.5550
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>