In [1]:
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import Dataset, load_dataset
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules (e.g. utils/evaluator.py) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local evaluation module; `evaluate` is bound as a short module-level
# alias and is called at the end of the notebook with a predictor and the test split.
import utils.evaluator as evaluator
evaluate = evaluator.evaluate

In [3]:
# Read variables from a local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)
# Direct indexing is deliberate: a missing HF_TOKEN raises KeyError right here
# instead of failing later inside the Hub client.
hf_token = os.environ['HF_TOKEN']
# NOTE(review): add_to_git_credential=True is what produces the
# "Cannot authenticate through git-credential" warning in the recorded output
# when no git credential helper is configured — confirm it is needed, since
# this notebook only downloads a dataset.
login(hf_token, add_to_git_credential=True)

# Fetch the pre-processed dataset from the Hugging Face Hub; later cells read
# its 'train', 'validation' and 'test' splits.
dataset = load_dataset("aslam-naseer/js-function-complexity-processed")

Token has not been saved to git credential helper.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


### Neural Network

In [4]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch's CPU and
    CUDA generators with the same value, then switches cuDNN to deterministic
    mode with auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [5]:
# Pull the three splits out of the DatasetDict loaded above.
train_ds = dataset['train']
val_ds = dataset['validation']
test_ds = dataset['test']

# Numeric columns fed to the model. The list order fixes the order of the
# feature vector, so the scaler and every later transform must iterate over
# this same list.
feature_cols = [
    'param_count',
    'local_statement_count',
    'total_statement_count',
    'local_variable_count',
    'total_variable_count',
    'local_nesting_depth',
    'total_nesting_depth'
]

# Stack the selected columns and transpose so that rows are examples and
# columns follow feature_cols (assumes train_ds[col] returns the whole column
# as a sequence — TODO confirm for this datasets version).
train_array = np.array([train_ds[col] for col in feature_cols]).T
# The scaler is fitted on the training split only; validation and test rows are
# transformed later with these training statistics.
scaler = StandardScaler()
# As the last expression in the cell, fit() also makes the notebook display the
# fitted estimator.
scaler.fit(train_array)

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True


In [23]:
def normalise_features(data):
    """Standardise the feature columns of a single example.

    Reads the columns listed in the module-level ``feature_cols`` from ``data``,
    arranges them as one row, and applies the module-level fitted ``scaler``.

    Args:
        data: Mapping from column name to value for one example.

    Returns:
        dict: ``{"features": array}`` where ``array`` is the scaled row with
        shape ``(1, len(feature_cols))``.
    """
    row = np.array([data[name] for name in feature_cols])
    return {"features": scaler.transform(row.reshape(1, -1))}

# Add a scaled 'features' column to the train and validation splits, running
# the per-example transform in 4 worker processes. The test split is left
# untouched here; nn_predict normalises test rows one at a time.
# Note that this cell rebinds train_ds / val_ds to the mapped datasets.
train_ds = train_ds.map(normalise_features, num_proc=4)
val_ds = val_ds.map(normalise_features, num_proc=4)

# Have the datasets yield torch tensors for the two columns the training loop
# reads ('features' as input, 'complexity' as target).
train_ds.set_format(type='torch', columns=['features', 'complexity'])
val_ds.set_format(type='torch', columns=['features', 'complexity'])

Map (num_proc=4): 100%|██████████| 4993/4993 [00:01<00:00, 3997.98 examples/s]
Map (num_proc=4): 100%|██████████| 500/500 [00:00<00:00, 1506.49 examples/s]


In [7]:
class NeuralNetwork(nn.Module):
    """Small fully connected regressor: input -> 64 -> 32 -> 1.

    ReLU follows each of the two hidden layers; dropout (p=0.2) is applied
    after the first hidden layer only. The output layer is linear and produces
    a single value per example.
    """

    def __init__(self, input_size):
        """Build the layers.

        Args:
            input_size: Number of input features per example.
        """
        super().__init__()
        # Layers are created in this order on purpose: it fixes both the
        # parameter names in state_dict and the order in which initialisation
        # draws from the RNG.
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        hidden1 = self.dropout(self.relu(self.layer1(x)))
        hidden2 = self.relu(self.layer2(hidden1))
        return self.layer3(hidden2)

In [34]:
def _prepare_batch(batch):
    """Turn one collated batch into ``(inputs, labels)`` float tensors.

    Labels gain a trailing dimension so they line up with the model's
    ``(batch, 1)`` output. Features that arrive as ``(batch, 1, n_features)``
    (each example stored as a single-row matrix) are flattened to
    ``(batch, n_features)``.
    """
    inputs = batch['features'].float()
    labels = batch['complexity'].float().unsqueeze(1)
    if inputs.dim() == 3:
        inputs = inputs.squeeze(1)
    return inputs, labels


def train_model(num_epochs, model, train_loader, val_ds, criterion, optimizer,
                val_batch_size=32, log_every=10):
    """Train ``model`` and periodically report train/validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``val_ds``. The reported losses are the
    mean of the per-batch losses.

    Args:
        num_epochs: Number of passes over the training data.
        model: ``nn.Module`` to optimise in place.
        train_loader: Iterable of batches, each a mapping with ``'features'``
            and ``'complexity'`` tensors.
        val_ds: Map-style dataset yielding the same two keys; it is wrapped in
            a ``DataLoader`` here.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimiser bound to ``model``'s parameters.
        val_batch_size: Batch size for the validation loader. Defaults to 32.
        log_every: Print a progress line every this many epochs; a falsy value
            disables printing. Defaults to 10.

    Returns:
        None. Progress is printed to stdout.
    """
    # Built once: the loader is stateless between epochs, so there is no need
    # to rebuild it on every pass.
    val_loader = DataLoader(val_ds, batch_size=val_batch_size)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            inputs, labels = _prepare_batch(batch)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        # The whole validation loop must sit inside no_grad(); otherwise every
        # forward pass builds an autograd graph that is never used.
        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = _prepare_batch(batch)
                val_loss += criterion(model(inputs), labels).item()

        if log_every and (epoch + 1) % log_every == 0:
            # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
            avg_train = train_loss / max(len(train_loader), 1)
            avg_val = val_loss / max(len(val_loader), 1)
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {avg_train:.4f}, '
                  f'Val Loss: {avg_val:.4f}')

In [50]:
# Training configuration for this run.
num_epochs = 50
# Re-seed right before the model is built so that weight initialisation and
# batch shuffling are the same each time this cell is executed.
set_seed(42)

# One input unit per entry in feature_cols.
input_size = len(feature_cols)
model = NeuralNetwork(input_size)

# Mean squared error against the 'complexity' target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training batches are reshuffled every epoch; the validation loader is built
# inside train_model from val_ds.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

train_model(num_epochs, model, train_loader, val_ds, criterion, optimizer)

Epoch [10/50], Train Loss: 1.7190, Val Loss: 1.3172
Epoch [20/50], Train Loss: 2.4708, Val Loss: 1.2932
Epoch [30/50], Train Loss: 1.5058, Val Loss: 1.2111
Epoch [40/50], Train Loss: 1.4355, Val Loss: 1.2254
Epoch [50/50], Train Loss: 1.3653, Val Loss: 1.1810


In [51]:
def nn_predict(data):
    """Predict the complexity of a single raw (unscaled) example.

    Scales the example with ``normalise_features``, runs the module-level
    ``model`` in eval mode without gradient tracking, and returns the single
    output as a Python float.

    Args:
        data: Mapping from column name to value for one example.

    Returns:
        float: The model's prediction for this example.
    """
    model.eval()
    with torch.no_grad():
        scaled_row = normalise_features(data)['features']
        features = torch.tensor(scaled_row).float()
        prediction = model(features)
        return prediction[0].item()
  
# Score the trained network on the held-out test split using the project's
# evaluator. test_ds is passed unscaled; nn_predict normalises each row itself.
# NOTE(review): the recorded output suggests evaluate prints one colour-coded
# number per example — confirm against utils/evaluator.py.
evaluate(nn_predict, test_ds)

 22%|██▏       | 108/500 [00:00<00:00, 1077.35it/s]

[92m0.8 [91m1.7 [92m0.4 [93m1.5 [92m0.4 [93m0.9 [92m0.3 [93m0.9 [92m0.4 [93m0.9 [92m0.3 [93m0.8 [93m1.6 [92m0.2 [92m0.2 [91m5.2 [91m5.6 [92m0.0 [92m0.3 [92m0.4 [92m0.4 [93m1.5 [92m0.2 [93m0.8 [92m0.1 [91m1.1 [91m2.2 [92m0.0 [92m0.5 [92m0.1 [92m0.5 [91m1.8 [91m1.9 [92m0.2 [92m0.4 [92m0.4 [92m0.2 [93m1.1 [91m2.3 [92m0.1 [93m1.0 [91m1.6 [91m1.3 [91m1.4 [93m0.9 [92m0.5 [92m0.2 [92m0.2 [92m0.5 [91m1.9 [92m0.1 [92m0.2 [92m0.9 [93m0.6 [92m0.3 [92m0.0 [91m1.3 [93m1.0 [93m0.8 [92m0.2 [93m0.5 [92m0.2 [93m0.8 [92m0.1 [93m0.8 [92m0.0 [92m0.3 [92m0.2 [92m0.0 [92m0.1 [93m0.7 [92m0.2 [91m1.3 [91m3.9 [92m0.4 [93m0.6 [92m0.4 [92m0.2 [92m0.2 [92m0.2 [92m0.5 [92m0.2 [93m0.7 [92m0.2 [92m0.3 [92m0.4 [93m0.9 [92m0.0 [92m0.1 [92m0.2 [91m1.8 [93m0.8 [93m1.1 [93m1.2 [93m0.7 [91m2.1 [92m0.2 [92m0.4 [92m0.1 [91m2.2 [92m0.2 [93m0.8 [92m0.3 [93m0.9 [92m0.3 [92m0.7 [91m2.3 [91m1.8 [92m0.2 [93m1.0 [93m0.6 

 64%|██████▎   | 318/500 [00:00<00:00, 1001.91it/s]

[92m0.5 [91m1.5 [92m0.8 [91m3.2 [93m0.9 [93m0.5 [92m0.4 [93m0.7 [92m0.0 [93m0.6 [92m0.6 [92m0.0 [92m0.5 [91m1.5 [92m0.0 [92m0.4 [93m0.9 [91m2.1 [92m0.3 [93m0.8 [92m0.1 [92m0.4 [92m0.6 [92m0.1 [92m0.1 [93m0.7 [92m0.6 [93m1.6 [93m0.5 [92m0.1 [92m0.3 [92m0.0 [91m1.5 [92m0.1 [93m0.6 [93m0.6 [91m1.7 [93m0.7 [93m0.8 [93m0.5 [92m0.2 [92m0.2 [92m0.5 [92m0.2 [93m0.8 [92m0.1 [92m0.3 [93m0.5 [93m0.8 [92m0.4 [92m0.0 [93m0.7 [93m0.9 [92m0.5 [92m0.3 [93m0.7 [92m0.6 [92m0.0 [92m0.2 [92m0.5 [92m0.2 [92m0.2 [92m0.1 [92m0.4 [92m0.4 [91m2.2 [91m1.3 [93m1.2 [93m0.8 [92m0.4 [92m0.1 [93m0.6 [92m0.4 [93m0.5 [93m0.6 [91m2.1 [92m0.1 [91m3.5 [93m0.7 [92m0.1 [91m1.3 [93m1.2 [91m1.2 [93m1.6 [92m0.0 [92m0.4 [91m3.0 [92m0.5 [93m0.7 [91m1.6 [92m0.5 [91m2.3 [92m0.3 [93m1.4 [92m0.2 [93m0.7 [92m0.4 [91m2.7 [93m0.6 [92m0.2 [93m0.8 [92m0.1 [93m0.7 [91m1.0 [92m0.2 [93m1.0 [93m0.8 [93m0.7 [92m0.2 [93m1.0 [91m1.0 

100%|██████████| 500/500 [00:00<00:00, 1080.65it/s]

[91m1.1 [91m1.5 [93m0.9 [92m0.3 [93m1.6 [93m0.5 [93m0.9 [92m0.4 [92m0.2 [92m0.5 [93m0.7 [93m0.5 [92m0.1 [92m0.2 [93m0.5 [92m0.7 [93m1.0 [93m0.8 [93m0.8 [93m0.5 [92m0.4 [93m1.0 [92m0.3 [91m1.1 [92m0.2 [92m0.2 [91m1.1 [92m0.2 [92m0.2 [92m0.1 [93m0.8 [91m1.3 [92m0.1 [93m0.7 [92m0.5 [91m3.7 [92m0.3 [91m1.4 [92m0.4 [92m0.6 [91m2.9 [91m2.2 [91m2.7 [92m0.1 [92m0.3 [92m0.2 [92m0.4 [92m0.4 [91m1.2 [92m0.2 [93m0.5 [91m2.2 [91m2.9 [92m0.0 [92m0.0 [92m0.2 [92m0.5 [92m0.4 [93m0.7 [91m3.1 [92m0.2 [93m0.7 [92m0.0 [91m1.5 [91m2.5 [91m1.4 [92m0.5 [92m0.3 [91m4.1 [92m0.1 [92m0.3 [92m0.4 


