# Libraries

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from time import time

In [None]:
# Colab-only: mount Google Drive at /content/gdrive; later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

# Exploratory Data Analysis

## Preparing train and test data

In [None]:
# Load the raw training sheet; the cells below reference the columns
# "Company Name" and "Business Description" by name and the third column by position.
df = pd.read_excel('/content/gdrive/MyDrive/GNN Intern/Training_Data.xlsx')
print(df.shape)
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Show the rows that have no business description.
df[df["Business Description"].isnull() == True]

In [None]:
# Where the description is missing, fall back to the company name as the text.
df['Business Description'] = np.where(df["Business Description"].isnull() == True,df["Company Name"],df["Business Description"])
df.isnull().sum()

In [None]:
# keep=False removes EVERY copy of a duplicated row (no representative is kept).
# NOTE(review): confirm this is intended rather than keep='first'.
df.drop_duplicates(keep=False,inplace=True)

In [None]:
df.info()
df.describe()

In [None]:
# Distribution of description lengths, measured in characters.
df["Business Description"].str.len().describe()

In [None]:
df["Business Description"].str.len().plot()

In [None]:
df["Business Description"].str.len().plot.box()

In [None]:
# Integer id -> value of the third column, in order of first appearance.
# Captured BEFORE the lower-casing below, so the values keep their original case.
# presumably the third column is the industry classification tag - TODO confirm against the sheet
classes = {i:typ for i,typ in enumerate(df.iloc[:,2].unique())}

In [None]:
# Lower-case the second and third columns, then normalise the column names
# (strip surrounding whitespace, lower-case).
df.iloc[:,1] = df.iloc[:,1].str.lower()
df.iloc[:,2] = df.iloc[:,2].str.lower()
df.columns = df.columns.str.strip()
df.columns = df.columns.str.lower()

In [None]:
# The company name is not needed past this point. Drop it by keyword:
# passing `axis` positionally to DataFrame.drop was deprecated and no longer
# works in pandas 2.0.
df.drop(columns='company name', inplace=True)

In [None]:
# Move the label column to the front so the frame reads (label, remaining columns).
dum = df['industry classification tag']
df.drop(columns='industry classification tag',inplace=True)
df = pd.concat([dum,df],axis=1)
df.head()

In [None]:
# Blank out both column names; this assumes exactly two columns remain.
# The CSV written below therefore starts with a header line of empty names.
df.columns = ['','']
df.head()

In [None]:
# Keep a reference to the cleaned training frame before `df` is rebound to the test sheet.
train = df

In [None]:
df = pd.read_excel('/content/gdrive/MyDrive/GNN Intern/Testing_Data.xlsx')
df.shape

In [None]:
# Assumes the test sheet has exactly two columns; the first is discarded.
# presumably the remaining column is the business description - TODO confirm
df.columns = ['0','']
df.drop(columns='0',axis=1,inplace=True)

In [None]:
# Lower-case the remaining text column, as was done for the training text.
df.iloc[:,0] = df.iloc[:,0].str.lower()

In [None]:
test = df

In [None]:
# Persist both cleaned frames without the index.
train.to_csv('/content/gdrive/MyDrive/GNN Intern/train.csv',index=False)
test.to_csv('/content/gdrive/MyDrive/GNN Intern/test.csv',index=False)

In [None]:
# Drop the frame references; later cells reload the data from the CSV files.
del df, test, train

# Tokenizer and embedding matrix from pretrained embeddings

In [None]:
class Tokenizer:
  """Word <-> index lookup plus embedding matrix loaded from a pretrained text file.

  Each non-blank line of `pretrained_file` is split on whitespace: the first
  field is the token, the remaining fields are parsed as floats and form its
  vector. Tokens are numbered in file order. Two extra rows are appended after
  the file contents:

  * `#rand#` - a uniformly random vector, used for out-of-vocabulary words;
  * `#pad#`  - an all-zero vector.

  Attributes:
    str_to_int: dict mapping token -> row index.
    int_to_str: dict mapping row index -> token.
    embedding_matrix: float32 array with one row per token (file tokens, then
      `#rand#`, then `#pad#`).
  """

  def __init__(self, pretrained_file):
    """Read `pretrained_file` (UTF-8) and build the lookup tables.

    Raises:
      ValueError: if the file contains no embedding lines.
    """
    self.pretrained_file = pretrained_file
    self.str_to_int = dict() #for fast lookups
    self.int_to_str = dict()

    self.padding = '#pad#'
    self.rand_str = '#rand#'
    self.embedding_matrix = list()

    #preparing embedding matrix
    with open(pretrained_file,'r', encoding='utf8') as f:
      for line in f:
        values = line.split()
        if not values:
          # Blank line: nothing to index (previously an IndexError).
          continue
        self._add_token(values[0], [float(v) for v in values[1:]])

    if not self.embedding_matrix:
      raise ValueError(f"no embeddings found in {pretrained_file!r}")

    dimension = len(self.embedding_matrix[0])
    self._add_token(self.rand_str, np.random.rand(dimension))
    self._add_token(self.padding, np.zeros(dimension))

    self.embedding_matrix = np.array(self.embedding_matrix).astype(np.float32)

  def _add_token(self, token, vector):
    """Append `vector` as the next matrix row and register `token` for it."""
    index = len(self.embedding_matrix)
    self.str_to_int[token] = index
    self.int_to_str[index] = token
    self.embedding_matrix.append(vector)

  def encode(self, sentence):
    """Return the list of indices for `sentence`.

    `sentence` is either a string (split on single spaces) or an iterable of
    word strings. Unknown words map to the index of `#rand#`.
    """
    if isinstance(sentence, str):
      sentence = sentence.split(" ")
    unknown = self.str_to_int[self.rand_str]
    return [self.str_to_int.get(word, unknown) for word in sentence]

  def decode(self,en_sentence):
    """Return the list of tokens for a list/tuple/array of indices.

    Any other input type yields None, as before.

    Raises:
      KeyError: if an index is not in the vocabulary.
    """
    if isinstance(en_sentence, (list, tuple, np.ndarray)):
      return [self.int_to_str[en_word] for en_word in en_sentence]
    return None

  def embedding(self, en_sentence):
    """Return the embedding rows selected by the indices in `en_sentence`."""
    return self.embedding_matrix[np.array(en_sentence)]


In [None]:
# Parse the pretrained embedding file from Drive; the whole file is read into memory.
# The file name suggests 300-dimensional GloVe 6B vectors - TODO confirm.
tokenizer = Tokenizer('/content/gdrive/MyDrive/GNN Intern/glove.6B.300d.txt')

# Preparing Train, Validation and Test Data Loaders

In [None]:
def create_neighbor_set(node_set, p=3):
  """Return, for every position in `node_set`, its window of neighbours.

  The window of position i holds the elements at positions i-p .. i+p that
  fall inside the sequence, in order, including the element at i itself.
  """
  length = len(node_set)
  return [
      [node_set[center + offset]
       for offset in range(-p, p + 1)
       if 0 <= center + offset < length]
      for center in range(length)
  ]

In [None]:
class GNN_Dataset(Dataset):
    """Dataset yielding per-sentence node ids, neighbour ids and public-edge flags.

    Args:
      node_sets: list of sentences, each a list of node (word) ids.
      neighbor_sets: for each sentence, one list of neighbour ids per node.
      public_edge_mask: 2-D tensor indexed as [from_node, to_node].
      labels: per-sentence label vectors, or None for unlabelled data.

    Each item is `(nodes, neighbors, edge_mask)` followed by a float label
    tensor when labels are available. `neighbors` is padded with id 1 so every
    row has the length of the longest neighbour list in the sentence, and
    `edge_mask[i, j]` is `public_edge_mask[nodes[i], neighbors[i, j]]`.
    """

    def __init__(self, node_sets, neighbor_sets, public_edge_mask, labels):
      super().__init__()
      self.node_sets = node_sets
      self.neighbor_sets = neighbor_sets
      self.public_edge_mask = public_edge_mask
      self.labels = labels

    def __getitem__(self, i):
      node_set = torch.LongTensor(self.node_sets[i])
      # Build the padded neighbour tensor once and reuse it for the mask lookup.
      neighbor_set = torch.nn.utils.rnn.pad_sequence(
          [torch.LongTensor(neighbor) for neighbor in self.neighbor_sets[i]],
          batch_first=True, padding_value=1)
      # Repeat each node id across its neighbour slots to pair it with every neighbour.
      from_nodes = node_set.unsqueeze(-1).repeat(1, neighbor_set.shape[-1])
      edge_mask = self.public_edge_mask[from_nodes, neighbor_set]

      # Explicit None/length test: truthiness is ambiguous for array-like labels.
      if self.labels is not None and len(self.labels) > 0:
        return node_set, neighbor_set, edge_mask, torch.FloatTensor(self.labels[i])
      return node_set, neighbor_set, edge_mask

    def __len__(self):
      return len(self.node_sets)

In [None]:
class GNN_Dataset_class:
  """Build vocabulary, label encoding and train/validation/test datasets from two CSVs.

  Both CSV files are read without a header row and rows containing NaN are
  dropped. In the training file column 0 is the label and column 1 the
  sentence; every test row is read with `.item()`, so the test file must have
  a single column.

  Args:
    train_filename: path of the training CSV.
    test_filename: path of the test CSV.
    tokenizer: object exposing `str_to_int`, `encode(list_of_words)` and
      `embedding(indices)`.
    MAX_LENGTH: sentences are truncated to this many tokens.
    p: neighbour window radius passed to `create_neighbor_set`.
    min_freq: edges seen fewer than this many times in the training split are
      flagged as public edges.
    train_validation_split: fraction of the training rows used for training;
      the remainder becomes the validation split (random split).

  Attributes:
    str_to_int / int_to_str: dataset vocabulary; id 0 is `#rand#` (unknown
      word) and id 1 is `#pad#`.
    embedding_matrix: rows of the tokenizer's embeddings, ordered by dataset id.
    label_dict: label -> one-hot list.
    train_dataset / validation_dataset / test_dataset: `GNN_Dataset` objects.
    edge_stat / public_edge_mask: see `build_public_edge_mask`.
  """

  def __init__(self, train_filename, test_filename, tokenizer, MAX_LENGTH=70, p=3, min_freq=2, train_validation_split=0.8):
    self.train_filename = train_filename
    self.test_filename = test_filename
    self.tokenizer = tokenizer
    self.MAX_LENGTH = MAX_LENGTH
    self.p = p
    self.min_freq = min_freq
    self.train_validation_split = train_validation_split

    # `axis` must be passed by keyword: pandas 2.0 made dropna's arguments keyword-only.
    self.train_data = pd.read_csv(self.train_filename, header=None)
    self.train_data.dropna(axis=0, inplace=True)
    self.test_data = pd.read_csv(self.test_filename, header=None)
    self.test_data.dropna(axis=0, inplace=True)

    self.str_to_int = {'#rand#': 0, '#pad#': 1}
    self.int_to_str = {0: '#rand#', 1: '#pad#'}
    self.vocab_count = len(self.str_to_int)
    self.embedding_matrix = None
    # dtype=float keeps the one-hot entries numeric regardless of the pandas
    # version's default dummy dtype.
    unique_labels = self.train_data[0].unique()
    self.label_dict = dict(zip(unique_labels, pd.get_dummies(unique_labels, dtype=float).values.tolist()))

    train_size = int(len(self.train_data) * train_validation_split)
    self.train_dataset, self.validation_dataset = random_split(self.train_data.to_numpy(), [train_size, len(self.train_data) - train_size])
    self.test_dataset = self.test_data.to_numpy()

    self.build_vocab()

    self.train_dataset, self.validation_dataset, self.test_dataset, self.edge_stat, self.public_edge_mask = self.prepare_dataset()

  def build_vocab(self):
    """Register every training-split word known to the tokenizer and build `embedding_matrix`.

    Punctuation-like tokens listed in `scrap` are excluded. Words are added in
    sorted order so the id assignment does not depend on set iteration order.
    """
    scrap = {".",",",";","&","'s", ":", "?", "!","(",")",\
            "'","`","''","\"","“"," ","'m","'no","***","--","...","[","]","{","}","~","@","#","$","%","^","*","/","<",">","+","-","="}
    unique_vocab = set()
    for _, sentence in self.train_dataset:
      # Filter word by word; comparing the whole token list against `scrap`
      # (as was done before) never matched, so nothing was filtered.
      unique_vocab.update(word for word in sentence.split(' ') if word not in scrap)
    for vocab in sorted(unique_vocab):
      if vocab in self.str_to_int:
        # Never re-number the reserved '#rand#' / '#pad#' entries.
        continue
      if vocab in self.tokenizer.str_to_int:
        self.str_to_int[vocab] = self.vocab_count
        self.int_to_str[self.vocab_count] = vocab
        self.vocab_count += 1
    self.embedding_matrix = self.tokenizer.embedding(self.tokenizer.encode(list(self.str_to_int.keys())))

  def _encode_sentences(self, sentences):
    """Map each sentence to at most MAX_LENGTH vocabulary ids (0 for unknown words)."""
    return [[self.str_to_int.get(vocab, 0) for vocab in sentence.strip().split(' ')][:self.MAX_LENGTH] for sentence in sentences]

  def prepare_dataset(self):
    """Encode the three splits and wrap them in `GNN_Dataset` objects.

    The public-edge mask is computed from the training split only and shared
    by all three datasets.

    Returns:
      (train_dataset, validation_dataset, test_dataset, edge_stat, public_edge_mask)
    """
    node_sets = self._encode_sentences(sentence for _, sentence in self.train_dataset)
    neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
    labels = [self.label_dict[label] for label, _ in self.train_dataset]

    edge_stat, public_edge_mask = self.build_public_edge_mask(node_sets, neighbor_sets, min_freq=self.min_freq)

    train_dataset = GNN_Dataset(node_sets, neighbor_sets, public_edge_mask, labels)

    node_sets = self._encode_sentences(sentence for _, sentence in self.validation_dataset)
    neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
    labels = [self.label_dict[label] for label, _ in self.validation_dataset]
    validation_dataset = GNN_Dataset(node_sets, neighbor_sets, public_edge_mask, labels)

    node_sets = self._encode_sentences(sentence.item() for sentence in self.test_dataset)
    neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
    test_dataset = GNN_Dataset(node_sets, neighbor_sets, public_edge_mask, labels=None)

    return train_dataset, validation_dataset, test_dataset, edge_stat, public_edge_mask

  def build_public_edge_mask(self, node_sets, neighbor_sets, min_freq=2):
    """Count (node, neighbour) edge occurrences and flag the rare ones.

    Args:
      node_sets: list of sentences as lists of node ids.
      neighbor_sets: for each sentence, one neighbour-id list per node.
      min_freq: edges counted fewer than this many times are marked public.

    Returns:
      (edge_stat, public_edge_mask): a (vocab_count, vocab_count) count tensor
      indexed [node, neighbour] and the boolean tensor `edge_stat < min_freq`.
    """
    edge_stat = torch.zeros(self.vocab_count, self.vocab_count)
    for node_set, neighbor_set in zip(node_sets, neighbor_sets):
      # Pair each node with ITS OWN window. Indexing with the whole sentence
      # (as before) counted every node against every window and collapsed
      # duplicate indices into a single increment.
      for node, neighbor in zip(node_set, neighbor_set):
        for to_node in neighbor:
          edge_stat[node, to_node] += 1
    public_edge_mask = edge_stat < min_freq
    return edge_stat, public_edge_mask
  

In [None]:
# Build the vocabulary, label encoding, public-edge mask and the three datasets
# using the constructor defaults (MAX_LENGTH=70, p=3, min_freq=2, train_validation_split=0.8).
dataset = GNN_Dataset_class(train_filename='/content/gdrive/MyDrive/GNN Intern/train.csv',
                                   test_filename='/content/gdrive/MyDrive/GNN Intern/test.csv',
                                   tokenizer=tokenizer)

In [None]:
def pad_custom_sequence(sequences):
  """Collate labelled samples into padded batch tensors.

  `sequences` is a list of (node_sets, neighbor_sets, public_edge_mask, label)
  tuples. Node ids and labels are padded with 1 via `pad_sequence`; the 2-D
  neighbour and mask tensors are padded with `padding_tensor`.

  Returns:
    (node_sets, neighbor_sets, public_edge_mask, labels) batch tensors.
  """
  nodes, neighbors, edge_masks, targets = map(list, zip(*sequences))
  pad_1d = torch.nn.utils.rnn.pad_sequence
  node_batch = pad_1d(nodes, batch_first=True, padding_value=1)
  neighbor_batch = padding_tensor(neighbors)[0]
  edge_mask_batch = padding_tensor(edge_masks)[0]
  target_batch = pad_1d(targets, batch_first=True, padding_value=1)
  return node_batch, neighbor_batch, edge_mask_batch, target_batch

def pad_custom_for_test(sequences):
  """Collate unlabelled samples into padded batch tensors.

  `sequences` is a list of (node_sets, neighbor_sets, public_edge_mask)
  tuples. Node ids are padded with 1 via `pad_sequence`; the 2-D neighbour and
  mask tensors are padded with `padding_tensor`.

  Returns:
    (node_sets, neighbor_sets, public_edge_mask) batch tensors.
  """
  nodes, neighbors, edge_masks = map(list, zip(*sequences))
  node_batch = torch.nn.utils.rnn.pad_sequence(nodes, batch_first=True, padding_value=1)
  neighbor_batch = padding_tensor(neighbors)[0]
  edge_mask_batch = padding_tensor(edge_masks)[0]
  return node_batch, neighbor_batch, edge_mask_batch
  

def padding_tensor(sequences, padding_idx=1):
  """Stack 2-D tensors of differing sizes into one padded 3-D tensor.

  The result has shape (len(sequences), max rows, max columns), takes dtype
  and device from the first tensor, and is filled with `padding_idx` wherever
  a tensor is smaller than the maximum.

  Returns:
    (out_tensor, mask) where `mask` is True at every position equal to
    `padding_idx` (including real entries that happen to hold that value).
  """
  rows = max(t.shape[0] for t in sequences)
  cols = max(t.shape[1] for t in sequences)
  batch = sequences[0].new_full((len(sequences), rows, cols), padding_idx)
  for position, item in enumerate(sequences):
    batch[position, :item.size(0), :item.size(1)] = item
  return batch, batch == padding_idx

In [None]:
# Only the training loader shuffles. Validation and test are iterated in a
# fixed order so that collected labels and predictions stay aligned across
# epochs; shuffling them has no effect on the metrics themselves.
train_loader = DataLoader(dataset.train_dataset, batch_size=32, shuffle=True, collate_fn=pad_custom_sequence)
validation_loader = DataLoader(dataset.validation_dataset, batch_size=32, shuffle=False, collate_fn=pad_custom_sequence)
test_loader = DataLoader(dataset.test_dataset, batch_size=32, collate_fn=pad_custom_for_test)

# Text Level GNN

In [None]:
class MessagePassing(nn.Module):
  """Aggregate neighbour embeddings into each node and classify the summed sentence vector.

  Args:
    vertice_count: number of rows in the per-node `information_rate` parameter.
    input_size: embedding size.
    out_size: number of output scores.
    dropout_rate: dropout probability applied to the weighted neighbour tensor.
    padding_idx: node id whose information rate is forced to 1.
  """

  def __init__(self, vertice_count, input_size, out_size, dropout_rate=0, padding_idx=1):
    super().__init__()
    self.vertice_count = vertice_count
    self.input_size = input_size
    self.out_size = out_size
    self.dropout_rate = dropout_rate
    self.padding_idx = padding_idx
    # One trainable mixing coefficient per node id, initialised uniformly in [0, 1).
    self.information_rate = nn.Parameter(torch.rand(self.vertice_count, 1))
    self.linear = nn.Linear(self.input_size, self.out_size)
    self.dropout = nn.Dropout(self.dropout_rate)

  def forward(self, node_sets, embedded_node, edge_weight, embedded_neighbor_node):
    """Return softmax scores of shape (batch, out_size).

    assumes node_sets is (batch, length), edge_weight (batch, length, neighbours)
    and embedded_neighbor_node (batch, length, neighbours, input_size) - TODO confirm
    """
    # Scale every neighbour embedding by its edge weight.
    flat_weights = edge_weight.view(-1, 1)
    flat_neighbors = embedded_neighbor_node.view(-1, self.input_size)
    weighted = (flat_weights * flat_neighbors).view(embedded_neighbor_node.shape)
    # Exact zeros get a very large negative value so they lose the max below.
    # NOTE(review): with dropout_rate > 0, dropped entries become 0 again before the max.
    weighted = self.dropout(weighted.masked_fill(weighted == 0, -1e18))
    aggregated, _ = weighted.max(dim=2)

    # Per-node mixing rate; padding nodes keep their own embedding entirely.
    rate = self.information_rate[node_sets]
    is_padding = (node_sets == self.padding_idx).unsqueeze(-1)
    rate = rate.masked_fill(is_padding, 1)
    mixed = (1 - rate) * aggregated + rate * embedded_node

    sentence_vector = mixed.sum(dim=1)
    return F.softmax(F.relu(self.linear(sentence_vector)), dim=1)

In [None]:
class TextLevelGNN(nn.Module):
  """Text classifier combining trainable word embeddings, edge weights and message passing.

  Args:
    pretrained_embeddings: float tensor of shape (V, D); one row per node id.
    out_size: number of classes.
    dropout_rate: forwarded to `MessagePassing`.
    padding_idx: node id treated as padding (its edges get weight 0).

  Parameters created:
    weight_matrix: dense (V, V) edge weights indexed [node, neighbour].
    public_edge_weight: single (1, 1) weight shared by all public edges.
  """

  def __init__(self, pretrained_embeddings, out_size, dropout_rate=0, padding_idx=1):
    super(TextLevelGNN, self).__init__()
    self.out_size = out_size # c
    self.padding_idx = padding_idx
    self.weight_matrix = nn.Parameter(torch.randn(pretrained_embeddings.shape[0], pretrained_embeddings.shape[0]))
    self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False, padding_idx=self.padding_idx)
    self.message_passing = MessagePassing(vertice_count=pretrained_embeddings.shape[0], input_size=pretrained_embeddings.shape[1], out_size=self.out_size, dropout_rate=dropout_rate, padding_idx=self.padding_idx)
    self.public_edge_weight = nn.Parameter(torch.randn(1, 1))

  def forward(self, node_sets, neighbor_sets, public_edge_mask):
    """Return class scores of shape (batch_size, c).

    Args:
      node_sets: (batch_size, max_sentence_length) node ids.
      neighbor_sets: (batch_size, max_sentence_length, max_neighbor_count) neighbour ids.
      public_edge_mask: boolean tensor shaped like `neighbor_sets`; True where
        the edge uses the shared public weight.
    """
    embedded_node = self.embedding(node_sets)
    # Each node id repeated across its neighbour slots.
    from_nodes = node_sets.unsqueeze(2).repeat(1, 1, neighbor_sets.shape[-1])
    # Read this module's own weights; the previous code reached for the global
    # `model`, which breaks for any instance not bound to that name.
    edge_weight = self.weight_matrix[from_nodes, neighbor_sets]
    a = edge_weight * ~public_edge_mask
    b = self.public_edge_weight.unsqueeze(2).expand(1, public_edge_mask.shape[-2], public_edge_mask.shape[-1]) * public_edge_mask # (batch_size, max_sentence_length, max_neighbor_count)
    edge_weight = a + b
    embedded_neighbor_node = self.embedding(neighbor_sets)

    # Apply mask to edge_weight, to mask and cut-off any relationships to the padding nodes
    edge_weight = edge_weight.masked_fill((from_nodes == self.padding_idx) | (neighbor_sets == self.padding_idx), 0) # (batch_size, max_sentence_length, max_neighbor_count)
    x = self.message_passing(node_sets, embedded_node, edge_weight, embedded_neighbor_node) # (batch_size, c)
    return x

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Optimization

In [None]:
# The output size follows the number of distinct training labels rather than a
# hard-coded 62, so it always matches the width of the one-hot label vectors.
model = TextLevelGNN(pretrained_embeddings=torch.tensor(dataset.embedding_matrix), out_size=len(dataset.label_dict), dropout_rate=0).to(device)
# Binary cross-entropy between the model's softmax output and the one-hot labels.
criterion = nn.BCELoss()

In [None]:
# Initial learning rate; the training cell reads and decays this global.
lr = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
# ni / nf bound the epoch range of the next run of the training cell; both start at 0.
ni = nf = 0

In [None]:
"""
model_save_name = 'Tt_lvl_company description.pt'
path = F"/content/gdrive/My Drive/GNN Intern/saved_models/{model_save_name}"
model.load_state_dict(torch.load(path, map_location=device))
model.to(device)
"""

In [None]:
# Every run of this cell trains one more epoch and continues the epoch counter:
# ni is the first epoch of this run, nf is one past the last.
# (`ni += nf` skipped ahead and produced an empty range on the third run.)
ni = nf
nf += 1
for epoch in range(ni,nf):
  model.train()
  train_loss = 0
  train_correct_items = 0
  previous_epoch_timestamp = time()

  # Decay the learning rate by 5% every fifth epoch. Updating the existing
  # optimizer in place keeps Adam's running moment estimates, which a freshly
  # constructed optimizer would discard.
  if (epoch+1) % 5 == 0:
    lr *= 0.95
    for param_group in optimizer.param_groups:
      param_group['lr'] = lr

  # Labels are collected every epoch, in the same batch order as the
  # predictions, because the training loader reshuffles each epoch.
  train_preds, train_labels = [], []
  validation_preds, validation_labels = [], []

  for node_sets, neighbor_sets, public_edge_masks, labels in train_loader:
    node_sets = node_sets.to(device)
    neighbor_sets = neighbor_sets.to(device)
    public_edge_masks = public_edge_masks.to(device)
    labels = labels.to(device)
    prediction = model(node_sets, neighbor_sets, public_edge_masks)
    loss = criterion(prediction, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
    train_correct_items += (prediction.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
    # Store detached CPU copies so the autograd graph of each batch can be freed.
    train_labels.append(labels.argmax(dim=1).cpu())
    train_preds.append(prediction.detach().cpu())
  train_accuracy = train_correct_items / len(dataset.train_dataset)

  model.eval()
  validation_loss = 0
  validation_correct_items = 0
  # No gradients are needed for evaluation.
  with torch.no_grad():
    for node_sets, neighbor_sets, public_edge_masks, labels in validation_loader:
      node_sets = node_sets.to(device)
      neighbor_sets = neighbor_sets.to(device)
      public_edge_masks = public_edge_masks.to(device)
      labels = labels.to(device)
      prediction = model(node_sets, neighbor_sets, public_edge_masks)
      loss = criterion(prediction, labels)
      validation_loss += loss.item()
      validation_correct_items += (prediction.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
      validation_labels.append(labels.argmax(dim=1).cpu())
      validation_preds.append(prediction.cpu())
  validation_accuracy = validation_correct_items / len(dataset.validation_dataset)

  print(f'Epoch: {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {validation_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}, Time Used: {time()-previous_epoch_timestamp:.2f}s')


In [None]:
# Score the unlabelled test set. Inference needs no gradients, and storing
# detached CPU tensors avoids keeping one autograd graph alive per batch.
model.eval()
test_preds = []
with torch.no_grad():
  for node_sets, neighbor_sets, public_edge_masks in test_loader:
    node_sets = node_sets.to(device)
    neighbor_sets = neighbor_sets.to(device)
    public_edge_masks = public_edge_masks.to(device)
    prediction = model(node_sets, neighbor_sets, public_edge_masks)
    test_preds.append(prediction.cpu())

In [None]:
"""model_save_name = 'Tt_lvl_company description.pt'
path = F"/content/gdrive/My Drive/GNN Intern/saved_models/{model_save_name}"
torch.save(model.state_dict(),path)
# model has lot of parameters, so saving is not possible with current colab runtime
"""

# Saving Train, Validation, Test Predictions and labels

In [None]:
# Flatten the per-batch label tensors into one NumPy vector per split,
# write both vectors to Drive, then release them.
train_labels = np.concatenate([batch.detach().cpu().numpy() for batch in train_labels])
validation_labels = np.concatenate([batch.detach().cpu().numpy() for batch in validation_labels])

np.savez(
    '/content/gdrive/My Drive/GNN Intern/labels.npz',
    train_labels=train_labels,
    validation_labels=validation_labels,
)
del train_labels, validation_labels

In [None]:
# Stack the per-batch score tensors row-wise into one NumPy matrix per split,
# write all three matrices to Drive, then release them.
train_preds = np.vstack([batch.detach().cpu().numpy() for batch in train_preds])
validation_preds = np.vstack([batch.detach().cpu().numpy() for batch in validation_preds])
test_preds = np.vstack([batch.detach().cpu().numpy() for batch in test_preds])

np.savez(
    '/content/gdrive/My Drive/GNN Intern/predictions.npz',
    train_preds=train_preds,
    validation_preds=validation_preds,
    test_preds=test_preds,
)
del train_preds, validation_preds, test_preds

# MRR, F1 and ROC_AUC Scores for Train and Validation

In [None]:
# Reload the saved labels and class scores from Drive.
labels = np.load('/content/gdrive/My Drive/GNN Intern/labels.npz')
preds = np.load('/content/gdrive/My Drive/GNN Intern/predictions.npz')

train_labels, validation_labels = labels['train_labels'], labels['validation_labels']

# Hard predictions: index of the highest score per row.
train_preds = preds['train_preds'].argmax(axis=1)
validation_preds = preds['validation_preds'].argmax(axis=1)

train_f1 = f1_score(train_labels, train_preds, average='weighted')
validation_f1 = f1_score(validation_labels, validation_preds, average='weighted')
print("F1_score of train_set: ",train_f1, " F1_score of validation_set: ",validation_f1)

#Mean Reciprocal Rank (MRR)
# Class indices sorted by descending score; the true label's position in that
# ranking (0-based) gives the reciprocal rank 1 / (position + 1).
train_mrr = [np.argsort(np.array(scores))[::-1] for scores in preds["train_preds"]]
validation_mrr = [np.argsort(np.array(scores))[::-1] for scores in preds['validation_preds']]
t_mrr = np.mean([1 / (np.flatnonzero(ranking == label).item() + 1) for ranking, label in zip(train_mrr, train_labels)])
v_mrr = np.mean([1 / (np.flatnonzero(ranking == label).item() + 1) for ranking, label in zip(validation_mrr, validation_labels)])
print("MRR of train_set: ",t_mrr," MRR of validation_set",v_mrr)

# Weighted multi-class ROC AUC, one-vs-rest and one-vs-one, on the raw scores.
train_roc_ovr = roc_auc_score(train_labels, preds['train_preds'], average='weighted', multi_class='ovr')
train_roc_ovo = roc_auc_score(train_labels, preds['train_preds'], average='weighted', multi_class='ovo')
validation_roc_ovr = roc_auc_score(validation_labels, preds['validation_preds'], average='weighted', multi_class='ovr')
validation_roc_ovo = roc_auc_score(validation_labels, preds['validation_preds'], average='weighted', multi_class='ovo')
print("Area under ROC curve:\n\t one versus rest weighted average of AUC:")
print("\t AUC of train_set: ",train_roc_ovr, "AUC of validation_set: ",validation_roc_ovr)
print("\t one versus one weighted average of AUC: ")
print("\t AUC of train_set: ",train_roc_ovo, "AUC of validation_set: ",validation_roc_ovo)

# Output for Test data

In [None]:
# Up to 5 tags for each description: the five highest-scoring class indices per row.
test_args = [np.array(arr).argsort()[::-1][:5] for arr in preds["test_preds"]]

# The model's output index is the position of the 1 in the one-hot vector the
# label was trained with, so the index -> label mapping must come from
# dataset.label_dict. (`classes` is keyed by int in first-appearance order, so
# `classes[str(idx)]` raised KeyError and would not have matched the training
# encoding anyway.)
index_to_label = {int(np.argmax(one_hot)): label for label, one_hot in dataset.label_dict.items()}
# Training labels were lower-cased; recover the original spelling where `classes` has it.
original_case = {str(name).lower(): name for name in classes.values()}

# NOTE(review): assumes the sheet has exactly one row per prediction; rows
# dropped as NaN when the test CSV was loaded would make the lengths differ.
df = pd.read_excel('/content/gdrive/MyDrive/GNN Intern/Testing_Data.xlsx')
tag_columns = ['First order tag', 'Second order tag', 'Third order tag', 'Fourth order tag', 'Fifth order tag']
for rank, column in enumerate(tag_columns):
  ranked_labels = [index_to_label[int(tags[rank])] for tags in test_args]
  df[column] = [original_case.get(label, label) for label in ranked_labels]
df.to_csv('/content/gdrive/MyDrive/GNN Intern/Output.csv',index=False)