# Source code modelling using LSTM



In [0]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 131304 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.3-0ubuntu3~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.3-0ubuntu3~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.3-0ubuntu3~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [0]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Fetch the session's application-default credentials; their client id and
# secret are handed to the FUSE client below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First run: stdin is /dev/null and only the output line containing "URL"
# (the authorization link) is shown.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it into the notebook.
vcode = getpass.getpass()
# Second run: pipe the verification code into the tool.
# NOTE(review): the code and the client secret are interpolated into a shell
# command line here.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [0]:
# Create the local mount point (no error if it already exists) and mount
# Google Drive on it through the FUSE wrapper.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:

# Install PyTorch and torchvision into the runtime.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install torch
!pip install torchvision



In [0]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F

In [0]:

# List what is available in the datasets folder of the mounted drive.
os.listdir("drive/datasets")

['flower_data',
 'anna.txt',
 'cat_to_name.json',
 'checkpoint.pth',
 'checkpoint_classifier.pth',
 'intermediate_weights.csv',
 'smells_train.csv',
 'smells_valid.csv',
 'smells_test.csv',
 'code_corpus.csv',
 'dictionary.csv',
 'dictionary_large.csv',
 'code_corpus_large.csv',
 'SCRNN.net',
 'smells-dataset']

## Load in Data



In [0]:
# Vocabulary file: one token per line; empty lines are ignored.
with open('drive/datasets/dictionary_large.csv', 'r') as f:
    text = f.read()
dictionary = [entry for entry in text.split('\n') if entry]

# Corpus file: one method per line, stored as comma-separated tokens.
# Empty lines and empty token fields (consecutive commas) are dropped.
with open('drive/datasets/code_corpus_large.csv', 'r') as f:
    text = f.read()
methods = [
    ','.join(tok for tok in line.split(',') if tok)
    for line in text.split('\n')
    if line
]

# Two special tokens are added after the vocabulary read from the file.
dictionary.extend(['<unk>', '<num>'])

In [0]:
# Sanity check: vocabulary size (special tokens included) and number of methods.
print(len(dictionary))
print(len(methods))

1001
26039


### Encoding using dictionary


In [0]:
# Lookup tables in both directions between vocabulary tokens and integer ids.
# A token's id is its position in `dictionary`.
int2token = {idx: tok for idx, tok in enumerate(dictionary)}
token2int = {tok: idx for idx, tok in int2token.items()}

In [0]:
# Drop the last corpus entry.
# NOTE(review): presumably the final line of the corpus file is incomplete — confirm.
# This cell is not idempotent: every re-run removes one more method.
methods = methods[:-1]

In [0]:
# Turn every method (comma-separated tokens) into a list of vocabulary ids.
# A token that is missing from `token2int` raises KeyError.
encoded_methods = [list(map(token2int.__getitem__, method.split(','))) for method in methods]

In [0]:
# Integer id of the '<unk>' token; the flattening cell below also uses it to
# pad methods that are shorter than one sequence.
unknown_token = token2int['<unk>']

# Mini-batch geometry: sequences per batch and tokens per sequence.
batch_size = 50
sequence_length = 100

In [0]:
# Number of tokens in the first encoded method.
len(encoded_methods[0])

530

In [0]:
# Build one flat token stream made of whole sequences only: methods shorter
# than `sequence_length` are padded in place with the unknown-token id, and
# every method is cut down to a multiple of `sequence_length`.
encoded_methods_all = []
for i, el in enumerate(encoded_methods):
  missing = sequence_length - len(el)
  if missing > 0:
    # `el` is the list stored in encoded_methods[i], so this pads it in place.
    el.extend([unknown_token] * missing)

  usable = (len(el) // sequence_length) * sequence_length
  encoded_methods_all.extend(el[:usable])

print(len(encoded_methods_all))

print(encoded_methods_all[:5])

  


34462100
[973, 3, 55, 56, 999]


In [0]:
def get_batches(arr, batch_size, seq_length):
    """Yield (inputs, targets) windows of shape (batch_size, seq_length).

    `arr` is trimmed to a whole number of batches and reshaped into
    `batch_size` rows. Targets are the inputs shifted one position to the
    left; for the final window the last target column wraps around to the
    first column of the reshaped array.
    """
    tokens_per_batch = batch_size * seq_length
    n_batches = arr.size // tokens_per_batch

    arr = arr[:n_batches * tokens_per_batch].reshape(batch_size, -1)
    total_cols = arr.shape[1]

    for start in range(0, total_cols, seq_length):
        stop = start + seq_length
        x = arr[:, start:stop]

        if stop != total_cols:
            # Interior window: the shifted slice exists in `arr` itself.
            y = arr[:, start + 1:stop + 1]
        else:
            # Final window: no column to the right, so wrap to column 0.
            y = np.zeros_like(x)
            y[:, :-1] = x[:, 1:]
            y[:, -1] = arr[:, 0]
        yield x, y

In [0]:
# Flatten the encoded corpus into one array and pull the first batch for inspection.
encoded_methods_array = np.array(encoded_methods_all)

# Use the notebook-wide batch geometry rather than repeating the literals
# 50 / 100, so this preview cannot drift away from what training uses.
batches = get_batches(encoded_methods_array, batch_size, sequence_length)
x, y = next(batches)

In [0]:
# Inspect the first batch: shapes plus the top-left corner of inputs and
# targets (each target row is the input row shifted one token to the left).
print(x.shape)
print(y.shape)
print(x[:5, :5])
print(y[:5, :5])

(50, 100)
(50, 100)
[[977   5 883  25  27]
 [ 51 451  49 991 848]
 [999  25 708 186  27]
 [977   5 113  25  27]
 [977   5 113  25  27]]
[[  5 883  25  27 108]
 [451  49 991 848  51]
 [ 25 708 186  27  49]
 [  5 113  25  27 108]
 [  5 113  25  27 108]]


### LSTM Neural Network Definition


In [0]:

# Detect CUDA instead of assuming it: this flag gates the `.cuda()` calls in
# the notebook, so a hard-coded True fails on a CPU-only runtime and made the
# else-branch below unreachable.
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [0]:
class SCRNN(nn.Module):
    """Token-level LSTM language model.

    Pipeline: embedding -> stacked LSTM -> dropout -> linear layer producing
    one logit per vocabulary token.
    """

    def __init__(self, tokens, embedding_dim = 50, n_hidden=50, n_layers=2,
                               drop_prob=0.7, lr=0.001, dictionary_size=5001):
        """Build the layers.

        Args:
            tokens: the vocabulary; only ``len(tokens)`` is used, as the
                embedding input size and the output layer size.
            embedding_dim: size of each token embedding.
            n_hidden: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability, used both inside the LSTM and
                before the output layer.
            lr: stored on the instance; the module itself does not use it.
            dictionary_size: unused; kept so existing callers keep working.
        """
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.embedding = nn.Embedding(len(tokens), embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(n_hidden, len(tokens))

    def forward(self, x, hidden):
        """Run a batch of token ids through the network.

        Args:
            x: integer token ids; the LSTM is batch-first.
            hidden: LSTM state tuple, e.g. from `init_hidden`.

        Returns:
            (out, hidden, out_features): `out` holds the logits with one row
            per (batch, step) position, `hidden` is the updated LSTM state,
            and `out_features` is the LSTM output before dropout, flattened
            to (batch * steps, n_hidden).
        """
        embedding = self.embedding(x)

        out_r, hidden = self.lstm(embedding, hidden)

        flat_rows = out_r.shape[0] * out_r.shape[1]

        # Features are taken before dropout so they do not depend on the dropout mask.
        out_features = out_r.reshape([flat_rows, self.n_hidden])

        out = self.dropout(out_r)
        out = out.reshape([flat_rows, self.n_hidden])
        out = self.fc(out)

        return out, hidden, out_features

    def init_hidden(self, batch_size):
        """Return a zeroed LSTM state tuple for `batch_size` sequences.

        The tensors are allocated from a model parameter, so they land on the
        device and dtype the model currently uses. The previous version chose
        the device from the global `train_on_gpu` flag, which produced CUDA
        state for a model that had been moved to the CPU.
        """
        weight = next(self.parameters()).data
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))
        

## Training


In [0]:
def train(net, train_data, val_data, epochs=10, batch_size=batch_size, seq_length=sequence_length, lr=0.001, clip=5, print_every=10):
    """Train `net` with Adam and cross-entropy, validating every `print_every` steps.

    Args:
        net: model exposing `init_hidden(batch_size)` and returning
            `(logits, hidden, features)` when called as `net(inputs, hidden)`.
        train_data: array of token ids for training (fed to `get_batches`).
        val_data: array of token ids for validation (fed to `get_batches`).
        epochs: number of passes over `train_data`.
        batch_size: sequences per mini-batch.
        seq_length: tokens per sequence.
        lr: Adam learning rate.
        clip: maximum gradient norm applied before each optimizer step.
        print_every: validate and log every this many training steps.

    Side effects:
        Moves `net` to the GPU when the global `train_on_gpu` is truthy,
        prints progress, and calls `save_model(net)` whenever the mean
        validation loss improves. `save_model` must therefore already be
        defined when training starts.
    """
    # Start from +inf so the first validation pass always persists a model.
    best_loss = float('inf')

    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    if(train_on_gpu):
        net.cuda()

    counter = 0
    for e in range(epochs):

        # Fresh hidden state at the start of every epoch.
        h = net.init_hidden(batch_size)

        for x, y in get_batches(train_data, batch_size, seq_length):

            counter += 1

            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            net.zero_grad()

            output, h, _ = net(inputs, h)

            loss = criterion(output, targets.contiguous().view(batch_size*seq_length))
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                # Validation needs no gradients; skipping the autograd graph
                # saves memory and time on every validation pass.
                with torch.no_grad():
                    for val_x, val_y in get_batches(val_data, batch_size, seq_length):

                        inputs, targets = torch.from_numpy(val_x), torch.from_numpy(val_y)
                        if(train_on_gpu):
                            inputs, targets = inputs.cuda(), targets.cuda()

                        val_h = tuple([each.data for each in val_h])

                        output, val_h, _ = net(inputs, val_h)

                        val_loss = criterion(output, targets.contiguous().view(batch_size*seq_length))

                        val_losses.append(val_loss.item())

                net.train()

                val_loss = np.mean(val_losses)
                if val_loss < best_loss:
                    save_model(net)
                    best_loss = val_loss
                    print('Persisted the best model.')

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(val_loss),
                      "Perplexity: {:.4f}".format(np.exp(val_loss)))

In [0]:

# Model size used for this run.
n_hidden, n_layers, embedding_dim = 512, 2, 50

net = SCRNN(dictionary, embedding_dim=embedding_dim, n_hidden=n_hidden, n_layers=n_layers)
print(net)

print('Number of parameters: ')
# Sum of the element counts of all parameter tensors.
np.sum([np.prod(param.shape) for param in net.parameters()])

SCRNN(
  (embedding): Embedding(1001, 50)
  (lstm): LSTM(50, 512, num_layers=2, batch_first=True, dropout=0.7)
  (dropout): Dropout(p=0.7)
  (fc): Linear(in_features=512, out_features=1001, bias=True)
)
Number of parameters: 


3819883

### Train/validation split and training hyperparameters

In [0]:
import numpy as np

# Fraction of the sequences held out for validation.
val_frac = 0.1

encoded_methods_array = np.array(encoded_methods_all)

# Split on a sequence boundary so both parts hold whole sequences.
seqs = int(encoded_methods_array.shape[0] / sequence_length)

sep_point = int(seqs * (1 - val_frac)) * sequence_length
train_methods, val_methods = encoded_methods_array[:sep_point], encoded_methods_array[sep_point:]

unknown_token_id = token2int['<unk>']

# np.unique de-duplicates inside NumPy; set(ndarray) iterated over every
# element of these very large arrays in Python.
train_tokens = set(np.unique(train_methods).tolist())
val_tokens = set(np.unique(val_methods).tolist())

print(len(train_tokens))
print(len(val_tokens))

# Token ids that occur in validation but never in training.
unknown_tokens = val_tokens - train_tokens

print(len(unknown_tokens))

# Map those unseen ids to '<unk>' with one vectorised pass instead of a
# per-token Python loop.
unseen_mask = np.isin(val_methods, list(unknown_tokens))
val_methods = np.where(unseen_mask, unknown_token_id, val_methods)

print(len(np.unique(val_methods)))

983
612
17
595


In [0]:
# The former `CUDA_LAUNCH_BLOCKING=1` line only bound a Python variable and
# never reached the CUDA runtime, so it was removed. To get synchronous kernel
# launches while debugging, set os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# before the first CUDA call instead.
n_epochs = 3

train(net, train_methods, val_methods, epochs=n_epochs, batch_size=batch_size, seq_length=sequence_length, lr=0.001, print_every=50)

Persisted the best model.
Epoch: 1/3... Step: 50... Loss: 3.8200... Val Loss: 3.8854 Perplexity: 48.6861
Persisted the best model.
Epoch: 1/3... Step: 100... Loss: 3.1288... Val Loss: 3.2197 Perplexity: 25.0211
Persisted the best model.
Epoch: 1/3... Step: 150... Loss: 2.7573... Val Loss: 2.9215 Perplexity: 18.5700
Persisted the best model.
Epoch: 1/3... Step: 200... Loss: 2.4581... Val Loss: 2.7650 Perplexity: 15.8786
Persisted the best model.
Epoch: 1/3... Step: 250... Loss: 2.3514... Val Loss: 2.6477 Perplexity: 14.1216
Persisted the best model.
Epoch: 1/3... Step: 300... Loss: 2.2373... Val Loss: 2.5657 Perplexity: 13.0104
Persisted the best model.
Epoch: 1/3... Step: 350... Loss: 2.2097... Val Loss: 2.4719 Perplexity: 11.8450
Persisted the best model.
Epoch: 1/3... Step: 400... Loss: 2.1161... Val Loss: 2.4204 Perplexity: 11.2508
Persisted the best model.
Epoch: 1/3... Step: 450... Loss: 1.9679... Val Loss: 2.3663 Perplexity: 10.6580
Persisted the best model.
Epoch: 1/3... Step: 5

In [0]:

def save_model(net, model_name='drive/datasets/SCRNN.net'):
  """Persist `net` and the vocabulary tables as a checkpoint file.

  Args:
      net: model to save; must expose `n_hidden`, `n_layers`, an `embedding`
          layer and `state_dict()`.
      model_name: destination path; defaults to the location this function
          always wrote to before the parameter existed.

  The checkpoint also stores the notebook-level `dictionary`, `int2token`
  and `token2int`. The embedding size is saved as well, so a model can be
  rebuilt without relying on the constructor default.
  """
  checkpoint = {'n_hidden': net.n_hidden,
                'n_layers': net.n_layers,
                'embedding_dim': net.embedding.embedding_dim,
                'state_dict': net.state_dict(),
                'tokens': dictionary,
                'int2token': int2token,
                'token2int': token2int}

  with open(model_name, 'wb') as f:
      torch.save(checkpoint, f)

In [0]:
import torch
model_name = 'drive/datasets/SCRNN.net'

checkpoint = {}

# Only create the empty placeholder when no checkpoint exists yet. Writing it
# unconditionally replaced the model persisted by save_model() with `{}`,
# after which the loading cells fail on checkpoint['tokens'].
if not os.path.exists(model_name):
    with open(model_name, 'wb') as f:
        torch.save(checkpoint, f)

In [0]:
def predict(token2int, int2token, net, token, h=None, top_k=None):
    """Feed one token to `net` and sample the next token from its top-k predictions.

    Args:
        token2int: mapping from token string to integer id.
        int2token: mapping from integer id to token string.
        net: model called as `net(inputs, hidden)` and returning
            `(logits, hidden, features)`; it must accept CPU tensors.
        token: current token; raises KeyError if it is not in `token2int`.
        h: hidden state tuple; when None a fresh `net.init_hidden(1)` is used
            (the old default crashed on None).
        top_k: number of most likely tokens to sample from; None samples
            from the whole vocabulary (the old default crashed on None).

    Returns:
        (next_token, hidden, out_features), where `out_features` is the third
        value returned by `net`.
    """
    x = np.array([[token2int[token]]])

    inputs = torch.from_numpy(x)

    if h is None:
        h = net.init_hidden(1)

    # detach hidden state from history
    h = tuple([each.data for each in h])

    out, h, out_features = net(inputs, h)

    p = F.softmax(out, dim=1).data

    if top_k is None:
        top_k = p.shape[1]

    p, top_tokens = p.topk(top_k)

    # reshape(-1) keeps both arrays 1-D even for top_k == 1; squeeze() turned
    # them into 0-d arrays, which np.random.choice rejects.
    top_tokens = top_tokens.numpy().reshape(-1)
    p = p.numpy().reshape(-1)

    token = np.random.choice(top_tokens, p=p/p.sum())

    return int2token[token], h, out_features

In [0]:
def sample(token2int, int2token, net, size, prime=None, top_k=5):
    """Prime the network with `prime`, then generate further tokens.

    Args:
        token2int: mapping from token string to integer id.
        int2token: mapping from integer id to token string.
        net: model to sample from; it is moved to the CPU and put in eval mode.
        size: number of generation steps after the first predicted token
            (size > 0 yields size + 1 new tokens); 0 only runs the prime
            tokens through the network.
        prime: sequence of seed tokens. It is copied, so the caller's list is
            no longer mutated and no mutable default is shared between calls.
        top_k: number of most likely tokens `predict` samples from.

    Returns:
        (text, features): all tokens joined without a separator, and the list
        of feature tensors returned by `predict`, one per processed token.

    Raises:
        ValueError: if `size > 0` and `prime` is empty (previously this
            failed with an unbound local variable).
    """
    net.cpu()
    net.eval() # eval mode

    tokens = list(prime) if prime is not None else []
    if size > 0 and not tokens:
        raise ValueError("sample() needs at least one prime token when size > 0")

    # The network now lives on the CPU, so this state must be on the CPU too.
    h = net.init_hidden(1)

    features = []
    token = None
    for prime_token in tokens:
        token, h, out_features = predict(token2int, int2token, net, prime_token, h, top_k=top_k)
        features.append(out_features)

    if size > 0:
        # `token` is the prediction that follows the last prime token.
        tokens.append(token)

        for _ in range(size):
            token, h, out_features = predict(token2int, int2token, net, tokens[-1], h, top_k=top_k)
            tokens.append(token)
            features.append(out_features)

    return ''.join(tokens), features

## Loading a checkpoint

In [0]:
# NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
# map_location='cpu' lets a checkpoint saved from a GPU run load on a CPU-only
# runtime (sample() moves the network to the CPU anyway).
with open('drive/datasets/SCRNN.net', 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

# Checkpoints without an 'embedding_dim' entry fall back to 50, the SCRNN default.
loaded = SCRNN(checkpoint['tokens'],
               embedding_dim=checkpoint.get('embedding_dim', 50),
               n_hidden=checkpoint['n_hidden'],
               n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

In [0]:
# Force CPU mode for any code that consults this global flag: sample() moves
# the network to the CPU before running it.
train_on_gpu = False

# size=0: nothing is generated; the prime tokens are only run through the
# network, yielding one feature tensor per prime token.
code, features = sample(checkpoint['token2int'], checkpoint['int2token'], loaded, 0, top_k=5, prime=['while', '('])

print(features[0].shape)
print(features[1].shape)


torch.Size([1, 512])
torch.Size([1, 512])


In [0]:
# Decode a slice of the validation stream back to text (tokens are joined
# without separators).
''.join([int2token[x] for x in val_methods[510001:510100]])

'next(t2-t1);<unk>();<unk>();break;case<unk>next(t2-t1);<unk>();<unk>();boolean<unk>=true;for(inti=num;<unk>i<<unk>;i){<unk>=(<unk>[i]num)(<unk>[i]num);}if(<unk>){state=<unk>;if(!<unk>[num]){<unk>();}}if(<unk>'

In [0]:
# The first tokens of the same slice, shown one by one.
[int2token[x] for x in val_methods[510001:510010]]

['next', '(', 't2', '-', 't1', ')', ';', '<unk>', '(']

**Feature extraction**

In [0]:
path = 'drive/datasets/smells-dataset'
target = 'drive/datasets/features_lstm.csv'
# NOTE: this rebinds the notebook-wide `unknown_token` (an integer id in the
# encoding cells) to the token string.
unknown_token = '<unk>'
# Must equal the hidden size of the loaded model, otherwise the feature
# accumulation below fails on a shape mismatch.
features_num = 512
print_every = 5

# NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
# map_location='cpu' lets a checkpoint saved from a GPU run load on a CPU-only runtime.
with open('drive/datasets/SCRNN.net', 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

# Checkpoints without an 'embedding_dim' entry fall back to 50, the SCRNN default.
loaded = SCRNN(checkpoint['tokens'],
               embedding_dim=checkpoint.get('embedding_dim', 50),
               n_hidden=checkpoint['n_hidden'],
               n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

# Set membership is O(1); previously the vocabulary list was scanned once per token.
known_tokens = set(checkpoint['tokens'])

i = 0
_, dirnames, _ = next(os.walk(path))

# NOTE(review): 'sevrity' looks like a typo for 'severity'. It is kept because
# it is the column name written to the CSV that consumers may already read.
output_dataset = pd.DataFrame(index=range(len(dirnames)), columns=list(range(features_num)) + ['sevrity', 'smell', 'id'])
for dirname in dirnames:
  # The directory name is split on '_' into smell, severity and id.
  parts = dirname.split('_')
  smell = parts[0]
  severity = parts[1]
  sample_id = parts[2]

  subpath = os.path.join(path, dirname)
  _, _, filenames = next(os.walk(subpath))

  # Sum of per-file mean feature vectors; divided by the file count below.
  methods_features = np.zeros((1,features_num))
  for filename in filenames:
    file_path = os.path.join(subpath, filename)
    with open(file_path, 'r') as f:
      content = f.read()

    # Strip newlines from every comma-separated token and replace anything
    # outside the vocabulary with the unknown token.
    stripped_tokens = (raw.replace('\n', '') for raw in content.split(','))
    content_tokens = [tok if tok in known_tokens else unknown_token for tok in stripped_tokens]

    # size=0: the tokens are only run through the network to collect features.
    # Inference only, so no autograd graph needs to be kept.
    with torch.no_grad():
      code, features = sample(checkpoint['token2int'], checkpoint['int2token'], loaded, 0, prime=content_tokens)

    method_features = np.mean([feat.detach().numpy() for feat in features], axis = 0)
    methods_features += method_features

  methods_features /= len(filenames)
  # Concatenating with the string labels turns the whole row into strings.
  output_dataset.iloc[i] = np.concatenate((methods_features, np.array([severity, smell, sample_id]).reshape(1,-1)), axis = 1)

  i += 1
  if i % print_every == 0:
    print(f'Processed {i}/{len(dirnames)}')

output = output_dataset.to_csv()
with open(target, "w") as f:
  f.write(output)

        

Processed 5/1379
Processed 10/1379
Processed 15/1379
Processed 20/1379
Processed 25/1379
Processed 30/1379
Processed 35/1379
Processed 40/1379
Processed 45/1379
Processed 50/1379
Processed 55/1379
Processed 60/1379
Processed 65/1379
Processed 70/1379
Processed 75/1379
Processed 80/1379
Processed 85/1379
Processed 90/1379
Processed 95/1379
Processed 100/1379
Processed 105/1379
Processed 110/1379
Processed 115/1379
Processed 120/1379
Processed 125/1379
Processed 130/1379
Processed 135/1379
Processed 140/1379
Processed 145/1379
Processed 150/1379
Processed 155/1379
Processed 160/1379
Processed 165/1379
Processed 170/1379
Processed 175/1379
Processed 180/1379
Processed 185/1379
Processed 190/1379
Processed 195/1379
Processed 200/1379
Processed 205/1379
Processed 210/1379
Processed 215/1379
Processed 220/1379
Processed 225/1379
Processed 230/1379
Processed 235/1379
Processed 240/1379
Processed 245/1379
Processed 250/1379
Processed 255/1379
Processed 260/1379
Processed 265/1379
Processed 270