# Import our modules and torch


In [1]:
import create_vocab
import data_to_tensors
import model_implementation
from train_class import TrainingModule

import torch
import torch.nn as nn
import torch.nn.functional as F
import random 

from torch.utils.data import DataLoader

# Because of the way their data is laid out, we have 3 main dictionaries:
* dictionary of all words in the leaves (terminal tokens)
* dictionary of all paths (they are already hashed and converted to digits)
* dictionary of targets (method names)


In [2]:
# Location of the vocabulary dump that all lookup tables are built from.
dict_path = 'data/CODES_emb/CODES_emb.dict.c2v'

# create_vocab returns four lookup tables, unpacked in this order:
# words, paths, targets (name -> index) and the reverse target map (index -> name).
(word2idx,
 path2idx,
 target2idx,
 idx2target) = create_vocab.create_vocab(dict_path)

* 1) The `DataLoader`s below return the data in batches of 512 samples each (the size can be changed via the `batch_size` argument).
* 2) The cells below show how the data is accessed: 3 tensors are then fed to the model (sorry for the names; the arguments of the model should make it clear which is which).
* 3) At the output we have the code_vector and a second vector: the scores over all labels in target_vocab (the dictionary of method names), which become a probability distribution after softmax.

In [3]:
# Number of samples per batch, shared by the train/val/test loaders.
BATCH_SIZE = 512


def make_dataset(c2v_path):
    """Build a ``TextDataset`` for one ``.c2v`` split.

    Args:
        c2v_path: path to the ``.c2v`` file of the split.

    Returns:
        A ``data_to_tensors.TextDataset`` created with the notebook-level
        ``word2idx``, ``path2idx`` and ``target2idx`` vocabularies.
    """
    return data_to_tensors.TextDataset(c2v_path, word2idx, path2idx, target2idx)


# Split files. To use the alternative corpus, point these at
# 'data/functions_emb/functions_emb.{train,val,test}.c2v' instead.
path_for_train = 'data/CODES_emb/CODES_emb.train.c2v'
path_for_val = 'data/CODES_emb/CODES_emb.val.c2v'
path_for_test = 'data/CODES_emb/CODES_emb.test.c2v'

train_dataset = make_dataset(path_for_train)
val_dataset = make_dataset(path_for_val)
test_dataset = make_dataset(path_for_test)

# Only the training split is shuffled; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Train

In [4]:
# One seed shared by every random number generator used in this notebook.
SEED = 1337

# Seed Python's RNG, torch's default generator and the CUDA generator in turn.
for _seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Model choice

* if you want to use the original model: bert = False
* if you want to use Bert model: bert = True


In [5]:
# Architecture switch: False -> original model, True -> Bert model.
bert = False

In [6]:
# Settings forwarded to the model as `bert_params`; the model is only built
# in this cell when `bert` is True.
bert_params = {
    'num_attention_heads': 1,
    'num_transformer_layers': 1,
    'intermediate_size': 32,
}

if bert:
    model = model_implementation.code2vec_model(
        values_vocab_size=len(word2idx),
        paths_vocab_size=len(path2idx),
        labels_num=len(target2idx),
        bert=bert,
        bert_params=bert_params,
    )

In [7]:
# Original (non-Bert) model: built only when the `bert` switch is off.
# Vocabulary and label sizes come straight from the lookup tables.
if not bert:
    model = model_implementation.code2vec_model(
        values_vocab_size=len(word2idx),
        paths_vocab_size=len(path2idx),
        labels_num=len(target2idx),
    )

In [8]:
# Optimisation hyper-parameters: epochs, learning rate, weight decay.
N_EPOCHS = 40
LR = 3e-3
WD = 0.8e-5

# Adam over all model parameters, cross-entropy as the training criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.CrossEntropyLoss()

train_class = TrainingModule(
    model, optimizer, criterion,
    train_loader, val_loader, test_loader,
    N_EPOCHS, idx2target,
)

# train() hands back eight values: loss, precision, recall and f1,
# each for the training and the validation split.
(
    list_train_loss, list_val_loss,
    list_train_precision, list_val_precision,
    list_train_recall, list_val_recall,
    list_train_f1, list_val_f1,
) = train_class.train(bert, 'a3_q1_emb')


Epoch 1: train loss - 1.74724, validation loss - 1.2532
	 Validation: precision - 0.48437, recall - 0.48437, f1_score - 0.48437
Elapsed time: 2.629
----------------------------------------------------------------------
Epoch 2: train loss - 1.18073, validation loss - 1.2257
	 Validation: precision - 0.42187, recall - 0.42187, f1_score - 0.42187
Elapsed time: 3.183
----------------------------------------------------------------------
Epoch 3: train loss - 1.08439, validation loss - 1.19816
	 Validation: precision - 0.51562, recall - 0.51562, f1_score - 0.51562
Elapsed time: 4.885
----------------------------------------------------------------------
Epoch 4: train loss - 1.0657, validation loss - 1.18909
	 Validation: precision - 0.51562, recall - 0.51562, f1_score - 0.51562
Elapsed time: 7.386
----------------------------------------------------------------------
Epoch 5: train loss - 1.07028, validation loss - 1.17959
	 Validation: precision - 0.5, recall - 0.5, f1_score - 0.5
Elapse

# Predict and show names!

Go to the weights of our models to see how they predict names: https://drive.google.com/drive/u/2/folders/1Q5ixv8dQ_qYqHg6w4Ep_XNeCJYZE6Cl2

You need to:
* Visit the above Drive folder.
* Right click on folder name -> "Add a shortcut to Drive" -> select "My Drive" -> "Add shortcut".

### Get access to these weights

In [9]:
from pathlib import Path

# Google Colaboratory is detected by whether its helper package can be imported.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# `AUX_DATA_ROOT` is the folder auxiliary files are looked up in
# (presumably the shared `best_models` folder with the model weights).
if not IN_COLAB:
    # Outside Colab: use the current working directory.
    AUX_DATA_ROOT = Path(".")
else:
    # On Colab: mount Google Drive and use the shortcut created there.
    google.colab.drive.mount("/content/drive/")

    # Change this if you created the shortcut in a different location
    AUX_DATA_ROOT = Path("/content/drive/My Drive/best_models/")

    assert AUX_DATA_ROOT.is_dir(), "Have you forgot to 'Add a shortcut to Drive'?"

#### DEFINE model
* If you want to test original paper model:

In [10]:
# Resolve the device first so the checkpoint can be mapped onto it while loading;
# without `map_location`, a checkpoint saved on a GPU fails to load on a CPU-only machine.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoint file name depends on the chosen architecture.
weights_name = 'best_model.pth' if bert else 'a3_q1_emb_article_model.pth'

# Look in AUX_DATA_ROOT (the Drive shortcut on Colab, "." otherwise) instead of a
# machine-specific absolute path; fall back to the current working directory.
weights_path = AUX_DATA_ROOT / weights_name
if not weights_path.is_file():
    weights_path = Path(weights_name)

# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load(str(weights_path), map_location=DEVICE)
model.load_state_dict(state_dict)

model = model.to(DEVICE)

#### Let's test it on the test dataset

#  \<UNK\> means that this name doesn't exist in our dictionary of labels :(

In [11]:
import pandas as pd
from IPython.display import display

d = {'Original names': [], 'Predicted names': []}

# Inference only: switch layers such as dropout to evaluation behaviour so the
# predictions are repeatable, and skip gradient tracking.
model.eval()

with torch.no_grad():
    for start, path, end, label in test_loader:
        # The model returns the code vector and the per-label scores.
        code, logits = model(start.to(DEVICE), path.to(DEVICE), end.to(DEVICE))
        # Turn scores into probabilities; `dim` is given explicitly because the
        # implicit choice is deprecated and emits a warning.
        probs = F.softmax(logits, dim=1)
        # Index of the most probable name for every sample in the batch.
        y_pred = torch.argmax(probs, dim=1)

        for i, j in zip(label, y_pred):
            d['Original names'].append(idx2target[i.item()])
            d['Predicted names'].append(idx2target[j.item()])
        # Only the first test batch is shown.
        break

df = pd.DataFrame(data=d)
display(df)

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Original names,Predicted names
0,<unk>,7
1,10,10
2,10,7
3,7,10
4,7,7
...,...,...
59,10,7
60,2,7
61,7,7
62,10,7
