Put libraries here

In [1]:
# sys.path

In [102]:
import torch, os, pickle, sys
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter

# from GAT import GAT
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
script_path = os.path.abspath("utils\\constans.py")  # Replace __file__ with the path to your script if in a notebook

# Determine the project directory by moving up two levels (adjust as needed)
project_directory = os.path.dirname(os.path.dirname(script_path))

print("Script Path:", script_path)
print("Project Directory:", project_directory)

Script Path: C:\Users\edayo\Downloads\4y2t\THSST-2\ug_thesis\ER_GAT\utils\constans.py
Project Directory: C:\Users\edayo\Downloads\4y2t\THSST-2\ug_thesis\ER_GAT


Code related to GAT

In [4]:

class DialogueGraphDataLoader(DataLoader):
    def __init__(self, node_features_list, edge_index_list, batch_size=1, shuffle=False):
        graph_dataset = DialogueGraphDataset(node_features_list, edge_index_list)
        super().__init__(graph_dataset, batch_size, shuffle, collate_fn=dialogue_graph_collate_fn)

class DialogueGraphDataset(Dataset):
    def __init__(self, node_features_list, edge_index_list):
        self.node_features_list = node_features_list
        self.edge_index_list = edge_index_list

    def __len__(self):
        return len(self.edge_index_list)

    def __getitem__(self, idx):
        return self.node_features_list[idx], self.edge_index_list[idx]

def dialogue_graph_collate_fn(batch):
    node_features_list, edge_index_list = zip(*batch)
    
    node_features_list_combined = []
    num_nodes_seen = 0

    for node_features, edge_index in zip(node_features_list, edge_index_list):
        # Assuming node_features is a tuple (text_embeddings, speakers_list)
        text_embeddings, speakers_list = node_features
        combined_features = (text_embeddings, speakers_list)

        node_features_list_combined.append(combined_features)

        # Translate the range of edge_index
        edge_index_list.append(edge_index + num_nodes_seen)
        num_nodes_seen += len(text_embeddings)

    # Merge the dialogue graphs into a single graph with multiple connected components
    node_features_combined = [torch.cat(features, 1) for features in zip(*node_features_list_combined)]
    edge_index = torch.cat(edge_index_list, 1)

    return node_features_combined, edge_index


In [301]:
class GATLayerWithEdgeType(nn.Module):
    def __init__(self, num_in_features, num_out_features, num_heads, num_edge_types):
        super(GATLayerWithEdgeType, self).__init__()

        self.num_in_features_per_head = num_in_features // num_heads
        self.num_out_features_per_head = num_out_features // num_heads
        self.num_heads = num_heads
        self.num_edge_types = num_edge_types

        # Linear projection for each head
        self.linear_proj = nn.Linear(self.num_in_features_per_head, self.num_heads * self.num_out_features_per_head, bias=False)
        # Define the final linear projection layer
        self.final_linear_proj = nn.Linear(num_heads * self.num_out_features_per_head, num_in_features)

        # Edge type embedding
        self.edge_type_embedding = nn.Embedding(self.num_edge_types, self.num_heads)


    def reset_parameters(self):
        nn.init.xavier_uniform_(self.linear_proj.weight)
        nn.init.xavier_uniform_(self.edge_type_embedding.weight)
        nn.init.xavier_uniform_(self.final_linear_proj.weight)
        nn.init.zeros_(self.attention_weights_self)
        nn.init.zeros_(self.attention_weights_edge_types)

    def forward(self, input_data, edge_type):
        node_features, edge_indices = input_data

        # Assuming the last dimension of h_linear is the one to be multiplied with attention_coefficients
        h_linear = self.linear_proj(node_features.view(-1, self.num_in_features_per_head))
        h_linear = h_linear.view(-1, self.num_heads, self.num_out_features_per_head)
        edge_type_embedding = self.edge_type_embedding(edge_type).unsqueeze(1)

        # Ensure that the dimensions are compatible for broadcasting
        # Expand the dimensions of edge_type_embedding to match h_linear
        edge_type_embedding = edge_type_embedding.unsqueeze(-1).expand(-1, -1, -1, h_linear.size(-1))

        # Perform element-wise multiplication
        attention_scores = (h_linear.unsqueeze(2) * edge_type_embedding.unsqueeze(1)).sum(dim=-1)

        # Continue with the rest of your attention mechanism
        attention_coefficients = F.softmax(attention_scores, dim=-1)

        # Adjust dimensions for torch.matmul
        attention_coefficients = attention_coefficients.unsqueeze(-2)  # Add an extra dimension before the last dimension

        print("attention_coefficients size:", attention_coefficients.size())
        print("h_linear size:", h_linear.size())
        # Weighted sum using attention coefficients
        # Assuming attention_coefficients and h_linear have correct dimensions
        # Perform matrix multiplication step by step

        # Transpose h_linear to make it compatible with matmul
        h_linear_transposed = h_linear.transpose(-2, -1)

        
        # Reshape attention_coefficients for matmul
#         attention_coefficients_reshaped = attention_coefficients.view(-1, attention_coefficients.size(-1))
        
        # Convert tensors to float16
        attention_coefficients_reshaped_transposed = attention_coefficients.transpose(-2, -1).unsqueeze(-3).to(torch.float16)
        h_linear_reshaped = h_linear_reshaped.unsqueeze(-1).to(torch.float16)

        # Perform batch matrix multiplication
        h_prime_flat = torch.bmm(attention_coefficients_reshaped_transposed, h_linear_reshaped).squeeze(-1)

        # Convert back to float32 if necessary
        h_prime_flat = h_prime_flat.to(torch.float32)

        # Reshape h_prime to the desired shape
        h_prime = h_prime_flat.view(attention_coefficients.size(0), attention_coefficients.size(1), attention_coefficients.size(2), -1)


        # Final linear projection
        h_out = self.final_linear_proj(h_prime.view(-1, self.num_heads * self.num_out_features_per_head))


        return h_out, attention_coefficients
    
class GATWithEdgeType(nn.Module):
    def __init__(self, num_of_layers, num_heads_per_layer, num_features_per_layer, num_edge_types):
        super(GATWithEdgeType, self).__init__()

        self.gat_net = nn.ModuleList()

        for layer in range(num_of_layers):
            num_in_features = num_heads_per_layer[layer - 1] * num_features_per_layer[layer - 1] if layer > 0 else num_features_per_layer[0]
            num_out_features = num_heads_per_layer[layer] * num_features_per_layer[layer]
            self.gat_net.append(GATLayerWithEdgeType(num_in_features, num_out_features, num_heads_per_layer[layer], num_edge_types))

    def forward(self, node_features, edge_indices, edge_types):
        h = node_features

        attention_scores = []

        for layer in self.gat_net:
            h, attention_coefficients = layer((h, edge_indices), edge_types)
            attention_scores.append(attention_coefficients)

        return h, attention_scores

<h3>Methods definition

In [8]:
def create_node_pairs_list(start_idx, end_idx):
    # Initialize an empty list to store pairs
    list_node_i = []
    list_node_j = []
#     node_pairs_dict = {}
    end_idx = end_idx - start_idx
    start_idx = 0
    for i in range(start_idx, end_idx+1):
        val = 3
        while(val >= 0):
            target_idx = i-val
#                 print(target_idx)
            if target_idx >= 0:
                list_node_i.append(i)
                list_node_j.append(target_idx)
#                 node_pairs_dict[i] = target_idx
            val = val-1
    
    return [list_node_i, list_node_j]

def create_adjacency_dict(node_pairs):
    adjacency_list_dict = {}

    # Iterate through pairs of nodes
    for i in range(0, len(node_pairs[0])):
        source_node, target_node = node_pairs[0][i], node_pairs[1][i]

#         # Add source node to target node's neighbors
#         if target_node not in adjacency_list_dict:
#             adjacency_list_dict[target_node] = [source_node]
#         else:
#             adjacency_list_dict[target_node].append(source_node)

        # Add target node to source node's neighbors
        if source_node not in adjacency_list_dict:
            adjacency_list_dict[source_node] = [target_node]
        else:
            adjacency_list_dict[source_node].append(target_node)

    return adjacency_list_dict
# print(ranges[:1])

def get_all_adjacency_list(ranges, key=0):
    all_adjacency_list = []
    for range_pair in ranges:
        start_idx, end_idx = range_pair
        
        if key == 0:
            output = create_node_pairs_list(start_idx, end_idx)
            output = create_adjacency_dict(output)
        elif key == 1:
            output = create_node_pairs_list(start_idx, end_idx)
            output = torch.tensor(output)
        else:
            print("N/A")
        all_adjacency_list.append(output)
    return all_adjacency_list

def get_all_edge_type_list(edge_indices, encoded_speaker_list):
    dialogs_len = len(edge_indices)
    whole_edge_type_list = []
    
    for i in range(dialogs_len): #2140 dialogs
        dialog_nodes_pairs = edge_indices[i]
        dialog_speakers = list(encoded_speaker_list[i])
        dialog_len = len(dialog_nodes_pairs.keys())
        edge_type_list = []
#         print(i, " th dialogue")
#         print(i, dialog_speakers)
        for j in range(dialog_len): #num utterances
            src_node = dialog_nodes_pairs[j] # j = key = src node
            node_i_idx = j
            win_len = len(src_node)
            for k in range(win_len):
                node_j_idx = src_node[k] # k = value = targ node
                # edge_types = torch.tensor([0, 1, 2]) 
                # 0: cur-self, 1: past-self, 2: past-other/past-inter
                                
                if node_i_idx == node_j_idx:
                    edge_type_list.append(0)
#                     print("This is 0 ", node_i_idx, node_j_idx)
                else:
                    if dialog_speakers[node_i_idx] != dialog_speakers[node_j_idx]:
                        edge_type_list.append(1)
#                         print("This is 1 ", node_i_idx, node_j_idx)
                    else:
                        edge_type_list.append(2)
#                         print("This is 2 ", node_i_idx, node_j_idx)
        whole_edge_type_list.append(torch.tensor(edge_type_list).to(torch.int64))  
                    
    return whole_edge_type_list

In [None]:
# print(edge_indices[0][0][3])
# len(edge_indices[0].keys())
# list(encoded_speaker_list[1])

In [None]:
# assume this is working
# edge_indices = get_all_adjacency_list(ranges)
# edge_types = get_all_edge_type_list(edge_indices, encoded_speaker_list)
# edge_indices = get_all_adjacency_list(ranges, key=1)

In [None]:
# print((edge_types[:10]))
# edge_indices[:10]
# (updated_representations[0].shape)
# edge_indices[0]
# edge_types[0]

In [9]:
checkFile = os.path.isfile("data/dump/speaker_encoder.pkl")
encoded_speaker_list = []
if checkFile is False:
    print("Run first the prototype_context_encoder to generate this file")
else:
    file = open('data/dump/speaker_encoder.pkl', "rb")
    encoded_speaker_list, ranges = pickle.load(file)
    file.close()

In [None]:
# need update
# checkFile = os.path.isfile("data/dump/all_adjacency_list.pkl")
# adjacency_list = []
# if checkFile is False:
#     adjacency_list = get_all_adjacency_list(ranges)
# else:
#     file = open('data/dump/all_adjacency_list.pkl', "rb")
#     adjacency_list = pickle.load(file)
#     file.close()

In [None]:
# adjacency_list[:2]

In [None]:
# len(adjacency_list)

In [10]:
file_path = 'embed/updated_representation_list.pkl'

# Load the list from the file using pickle
with open(file_path, 'rb') as file:
    updated_representations = pickle.load(file)

In [11]:
print(updated_representations[0].shape)
print(updated_representations[0])

torch.Size([14, 300])
tensor([[-2.8721e-01,  5.8134e-01, -1.3142e-01,  ...,  1.8101e-02,
         -4.6824e-04,  1.9901e-02],
        [-1.6920e-01,  1.8220e-01, -1.2245e-01,  ...,  1.3620e-02,
         -2.0732e-03,  8.3473e-03],
        [-8.1502e-02,  7.7161e-02, -6.6144e-02,  ...,  1.3882e-02,
          3.4588e-03, -1.4834e-03],
        ...,
        [-4.1162e-03,  2.6335e-02,  2.8706e-02,  ..., -1.6475e-01,
         -1.3978e-01,  2.8344e-02],
        [-1.7579e-02,  1.8380e-02,  3.3130e-02,  ..., -2.5659e-01,
         -2.2489e-01,  1.5857e-02],
        [-2.9680e-02,  8.5039e-03,  3.3814e-02,  ..., -3.8804e-01,
         -2.8153e-01,  1.1250e-03]], requires_grad=True)


<h3> Making Progress...

In [12]:
edge_indices = get_all_adjacency_list(ranges)
edge_types = get_all_edge_type_list(edge_indices, encoded_speaker_list)
edge_indices = get_all_adjacency_list(ranges, key=1)

In [None]:
# num_heads_per_layer

In [302]:
num_of_layers = 2
num_heads_per_layer = [4, 2]  # Adjusted for 2 layers
num_features_per_layer = [300, 150]
num_edge_types = 3

gat_model = GATWithEdgeType(num_of_layers, num_heads_per_layer, num_features_per_layer, num_edge_types)

outputs = []

output, attention_scores = gat_model(updated_representations[0], 
                                     edge_indices[0], 
                                     edge_types[0])

attention_coefficients size: torch.Size([50, 56, 4, 1, 4])
h_linear size: torch.Size([56, 4, 300])


UnboundLocalError: local variable 'h_linear_reshaped' referenced before assignment

In [113]:
# updated_representations[0]

In [None]:
# # Example usage:
# num_of_layers = 2
# num_heads_per_layer = [4, 2]
# num_features_per_layer = [300, 150, 64]
# num_edge_types = 4  # Change this according to your specific edge types

# gat_model = GATWithEdgeType(num_of_layers, num_heads_per_layer, num_features_per_layer, num_edge_types)

# # Assuming you have input data 'node_features', 'edge_indices', and 'edge_types'
# output, attention_scores = gat_model(updated_representations, edge_indices, edge_types)

In [None]:
# sample_reps = updated_representations[:3]
# sample_edge_idx_list = ranges[:3]

In [None]:
# Create instances of DialogueGraphDataset and DialogueGraphDataLoader
# dataset = DialogueGraphDataset(node_features_list= [updated_representations, encoded_speaker_list],
# #                                node_labels_list, 
#                                edge_index_list = adjacency_list,
#                               )
# dataloader = DialogueGraphDataLoader(node_features_list = dataset, 
#                                      edge_index_list = adjacency_list, 
#                                      batch_size=2, 
#                                      shuffle=True)


In [None]:
# # Initialize your GAT model
# gat_model = GAT(
#     num_of_layers=3,
#     num_heads_per_layer=[4, 4, 6],
#     num_features_per_layer=[len(dataset.node_features_list), 64, 64, dataset.num_classes],
#     add_skip_connection=True,
#     bias=True,
#     dropout=0.6,
# #     layer_type="your_layer_type",  default 3
#     log_attention_weights=False,
# )


In [None]:
# BEST_VAL_ACC = 0
# BEST_VAL_LOSS = 0
# PATIENCE_CNT = 0

# BINARIES_PATH = os.path.join(os.getcwd(), 'models', 'binaries')
# CHECKPOINTS_PATH = os.path.join(os.getcwd(), 'models', 'checkpoints')

# # Make sure these exist as the rest of the code assumes it
# os.makedirs(BINARIES_PATH, exist_ok=True)
# os.makedirs(CHECKPOINTS_PATH, exist_ok=True)

In [None]:
device = torch.device("cpu") 

In [None]:
# (dataloader.sampler.num_samples)

In [None]:
# dir(dataloader.dataset)

In [None]:
# # Define hyperparameters
# num_layers = 2
# num_heads_per_layer = [8, 8]
# num_features_per_layer = [300, 128, num_classes]  # Adjust num_classes based on your task
# add_skip_connection = True
# bias = True
# dropout = 0.6
# layer_type = LayerType.IMP3  # Choose the desired implementation
# log_attention_weights = False  # Set to True if you want to log attention weights


In [None]:
# class CustomGATLayer(nn.Module):
#     def __init__(self, in_features, out_features, num_edge_types, dropout=0.6, alpha=0.2):
#         super(CustomGATLayer, self).__init__()
#         self.num_edge_types = num_edge_types

#         # Node feature transformation
#         self.W = nn.Linear(in_features, out_features)

#         # Edge attention mechanism for each edge type
#         self.attention_weights = nn.ModuleList([nn.Linear(2 * out_features, 1) for _ in range(num_edge_types)])
#         self.leaky_relu = nn.LeakyReLU(alpha)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, node_features, edge_index, edge_type):
#         # Node feature transformation
#         h = self.W(node_features)

#         # Attention mechanism for each edge type
#         attention_weights = [torch.exp(self.leaky_relu(att(torch.cat([h[edge_index[0]], h[edge_index[1]]], dim=-1))))
#                              for att in self.attention_weights]

#         # Compute weighted sum of neighbor features
#         aggregated_features = sum(attention_weights[i] * h[edge_index[1]] for i in range(self.num_edge_types))

#         # Apply dropout
#         aggregated_features = self.dropout(aggregated_features)

#         return aggregated_features


In [None]:
# class GATLayerWithEdgeType(GATLayer):
#     def __init__(self, num_in_features, num_out_features, num_of_heads, num_edge_types, concat=True, activation=nn.ELU(),
#                  dropout_prob=0.6, add_skip_connection=True, bias=True, log_attention_weights=False):

#         super().__init__(num_in_features, num_out_features, num_of_heads, concat, activation, dropout_prob,
#                          add_skip_connection, bias, log_attention_weights)

#         # New trainable parameters for edge type embeddings
#         self.edge_type_embeddings = nn.Parameter(torch.Tensor(num_edge_types, num_of_heads, num_out_features))
#         self.init_params(LayerType.WITH_EDGE_TYPE)

#     def forward(self, data, edge_type):
#         in_nodes_features, edge_index = data  # unpack data
#         num_of_nodes = in_nodes_features.shape[self.nodes_dim]

#         in_nodes_features = self.dropout(in_nodes_features)

#         # Project node features to NH independent output features
#         nodes_features_proj = self.linear_proj(in_nodes_features).view(-1, self.num_of_heads, self.num_out_features)
#         nodes_features_proj = self.dropout(nodes_features_proj)

#         # Calculate attention scores for source and target nodes based on edge type
#         scores_source = (nodes_features_proj * self.scoring_fn_source).sum(dim=-1)
#         scores_target = (nodes_features_proj * self.scoring_fn_target).sum(dim=-1)

#         # Lift the scores based on edge index
#         scores_source_lifted, scores_target_lifted, nodes_features_proj_lifted = self.lift(scores_source, scores_target, nodes_features_proj, edge_index)

#         # Embedding for edge type
#         edge_type_embedding = self.edge_type_embeddings[edge_type]

#         # Apply the scoring function with edge type embeddings
#         scores_per_edge = self.leakyReLU(scores_source_lifted + scores_target_lifted + edge_type_embedding)

#         # Neighborhood-aware softmax
#         attentions_per_edge = self.neighborhood_aware_softmax(scores_per_edge, edge_index[self.trg_nodes_dim], num_of_nodes)
#         attentions_per_edge = self.dropout(attentions_per_edge)

#         # Element-wise product with weighted and projected neighborhood feature vectors
#         nodes_features_proj_lifted_weighted = nodes_features_proj_lifted * attentions_per_edge

#         # Aggregate neighbors
#         out_nodes_features = self.aggregate_neighbors(nodes_features_proj_lifted_weighted, edge_index, in_nodes_features, num_of_nodes)

#         # Residual/skip connections, concat, and bias
#         out_nodes_features = self.skip_concat_bias(attentions_per_edge, in_nodes_features, out_nodes_features)
#         return (out_nodes_features, edge_index)
