This notebook precomputes the tensor files (tokenized, padded sequences and one-hot labels) so that the model training/evaluation scripts, which run on the remote cluster, do not need to perform any preprocessing themselves.


In [35]:
# import the required libraries:
import sentencepiece as spm
import pandas as pd
from keras.utils import pad_sequences
import torch
import torch.nn as nn
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from Juliet2_Schema import Session,Cases, VLW, VLW2

In [36]:
# Create a SentencePiece processor and load the model stored in
# 'tokenizer.model' (path is relative to the current working directory).
# `sp` is used further down to encode each source line into token ids.
sp = spm.SentencePieceProcessor()
sp.Load('tokenizer.model')

True

In [45]:
from sqlalchemy import create_engine, func

# Open a database session and pull every VLW2 row; `records` feeds the
# preprocessing cells below.
session = Session()
records = session.query(VLW2).all()

# Per-CWE sample counts, most frequent class first.
# NOTE(review): the records come from VLW2 but this distribution is computed
# over VLW -- confirm that mixing the two tables is intended.
query = (
    session.query(VLW.cwe, func.count(VLW.cwe).label('total'))
    .group_by(VLW.cwe)
    .order_by(func.count(VLW.cwe).desc())
)

for cwe_class, count in query:
    print(f"CWE: {cwe_class}, Count: {count}")

CWE: CWE-190: Integer Overflow or Wraparound, Count: 3906
CWE: CWE-122: Heap-based Buffer Overflow, Count: 3793
CWE: CWE-078: Improper Neutralization of Special Elements used in an OS Command ('OS Command Injection'), Count: 3316
CWE: CWE-121: Stack-based Buffer Overflow, Count: 3086
CWE: CWE-191: Integer Underflow (Wrap or Wraparound), Count: 2836
CWE: CWE-762: Mismatched Memory Management Routines, Count: 2232
CWE: CWE-134: Uncontrolled Format String, Count: 2142
CWE: CWE-036: Absolute Path Traversal, Count: 1766
CWE: CWE-023: Relative Path Traversal, Count: 1766
CWE: CWE-590: Free of Memory not on the Heap, Count: 1675
CWE: CWE-369: Divide By Zero, Count: 1490
CWE: CWE-789: Uncontrolled Memory Allocation, Count: 1373
CWE: CWE-129: Improper Validation of Array Index, Count: 1368
CWE: CWE-127: Buffer Under-read, Count: 1300
CWE: CWE-124: Buffer Underwrite ('Buffer Underflow'), Count: 1300
CWE: CWE-400: Uncontrolled Resource Consumption ('Resource Exhaustion'), Count: 1208
CWE: CWE-197

In [38]:
# Number of source lines kept per sample. The same value is used as the number
# of one-hot classes (presumably because the label is the index of the
# vulnerable line within the sample -- TODO confirm against the schema).
MAX_LINES = 40


def encode_lines(texts, max_lines=MAX_LINES):
    """Tokenize the first `max_lines` lines of every text.

    Returns one list per text, each holding one list of SentencePiece ids per
    source line. Texts with fewer than `max_lines` lines yield shorter lists.
    """
    return [
        [sp.EncodeAsIds(line) for line in text.split('\n')[:max_lines]]
        for text in texts
    ]


data = [record.vlw_content for record in records]
labels = [record.vulnerability_location for record in records]

# Split the data into train and test sets (fixed seed for reproducibility)
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

# Encode every line of every sample into token ids
train_sequences = encode_lines(train_data)
test_sequences = encode_lines(test_data)

# Find the longest individual tokenized line in each split
max_length_train = max(len(seq) for text in train_sequences for seq in text)
max_length_test = max(len(seq) for text in test_sequences for seq in text)

# Get the maximum length across both sets; used as the padding width
max_length = max(max_length_train, max_length_test)

# Convert labels to one-hot vectors in a single vectorized call per split
# (same (N, num_classes) int64 result as stacking one one-hot row per label).
num_classes = MAX_LINES
train_labels = torch.nn.functional.one_hot(torch.tensor(train_labels), num_classes=num_classes)
test_labels = torch.nn.functional.one_hot(torch.tensor(test_labels), num_classes=num_classes)

print('max_length:', max_length)
print('Train Labels:', train_labels[0])
print('Test Labels:', test_labels[0])



max_length: 98
Train Labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Test Labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [39]:
# Pad every tokenized line to `max_length` (the longest line found across both
# splits) instead of a hard-coded width: pad_sequences truncates from the
# front by default, so a stale constant would silently drop tokens if the data
# or tokenizer ever changes.
# Each sample becomes a (num_lines, max_length) integer array, zero-padded on
# the right.
train_sequences_padded = [pad_sequences(text, maxlen=max_length, padding='post') for text in train_sequences]
test_sequences_padded = [pad_sequences(text, maxlen=max_length, padding='post') for text in test_sequences]

print('Train sequences shape:', train_sequences_padded[0].shape)
print('Test sequences shape:', test_sequences_padded[0].shape)

Train sequences shape: (40, 98)
Test sequences shape: (40, 98)


In [40]:
# Samples built from fewer than 40 source lines have fewer than 40 rows and
# cannot be stacked with the others into one tensor, so they are dropped.
# Create a list of indices where the sequence length is not 40
drop_indices = [i for i, seq in enumerate(train_sequences_padded) if len(seq) != 40]

# Set copy for O(1) membership tests; scanning the list for every sample
# would make the filtering below quadratic.
drop_lookup = set(drop_indices)

# Drop these indices from train_sequences_padded and train_labels (kept in lockstep)
train_sequences_padded = [seq for i, seq in enumerate(train_sequences_padded) if i not in drop_lookup]
train_labels = [label for i, label in enumerate(train_labels) if i not in drop_lookup]

# Sequences and labels must stay aligned one-to-one
assert len(train_sequences_padded) == len(train_labels), "train sequences/labels out of sync"

# Print the number of remaining sequences and labels
print(f"Number of remaining train sequences: {len(train_sequences_padded)}")
print(f"Number of remaining train labels: {len(train_labels)}")

Number of remaining train sequences: 43335
Number of remaining train labels: 43335


In [41]:
# Samples built from fewer than 40 source lines have fewer than 40 rows and
# cannot be stacked with the others into one tensor, so they are dropped.
drop_indices_test = [i for i, seq in enumerate(test_sequences_padded) if len(seq) != 40]

# Set copy for O(1) membership tests; scanning the list for every sample
# would make the filtering below quadratic.
drop_lookup_test = set(drop_indices_test)

# Drop these indices from test_sequences_padded and test_labels (kept in lockstep)
test_sequences_padded = [seq for i, seq in enumerate(test_sequences_padded) if i not in drop_lookup_test]
test_labels = [label for i, label in enumerate(test_labels) if i not in drop_lookup_test]

# Sequences and labels must stay aligned one-to-one
assert len(test_sequences_padded) == len(test_labels), "test sequences/labels out of sync"

# Print the number of remaining sequences and labels
print(f"Number of remaining test sequences: {len(test_sequences_padded)}")
print(f"Number of remaining test labels: {len(test_labels)}")

Number of remaining test sequences: 10838
Number of remaining test labels: 10838


In [42]:
# Stack the per-sample arrays into one ndarray before handing them to torch:
# calling torch.tensor() on a Python list of ndarrays is a known very slow
# path. The dtype of the padded arrays is carried over unchanged.
train_sequences_tensor = torch.from_numpy(np.stack(train_sequences_padded))
# The filtered labels are a list of one-hot rows; stack them back into one tensor
train_labels = torch.stack(train_labels)
train_dataset = TensorDataset(train_sequences_tensor, train_labels)

# Create a DataLoader with a specific batch size and drop_last option.
# The loaders in this cell are only used to sanity-check batch shapes.
batch_size = 40
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Print the shape of the first batch
first_batch = next(iter(train_loader))
print('First train batch shape:', first_batch[0].shape)

test_sequences_tensor = torch.from_numpy(np.stack(test_sequences_padded))
test_labels = torch.stack(test_labels)
test_dataset = TensorDataset(test_sequences_tensor, test_labels)

# NOTE(review): shuffle=True / drop_last=True is unusual for an evaluation
# loader (the final partial batch is skipped) -- confirm before reusing these
# settings in the evaluation script.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

first_batch = next(iter(test_loader))
print('First test batch shape:', first_batch[0].shape)


First train batch shape: torch.Size([40, 40, 98])
First test batch shape: torch.Size([40, 40, 98])


In [None]:
# Report shape and dtype of the training tensors before they are saved.
for _label, _tensor in (
    ('train_sequences_tensor', train_sequences_tensor),
    ('train_labels_tensor', train_labels),
):
    print(f'{_label} shape:', _tensor.shape)
    print(f'{_label} dtype:', _tensor.dtype)

In [43]:
# Write the preprocessed tensors to the working directory:
# training sequences and labels first, then the testing counterparts.
artifacts = (
    ('vlw2_train_sequences_tensor.pt', train_sequences_tensor),
    ('vlw2_train_labels.pt', train_labels),
    ('vlw2_test_sequences_tensor.pt', test_sequences_tensor),
    ('vlw2_test_labels.pt', test_labels),
)
for filename, tensor in artifacts:
    torch.save(tensor, filename)