In [1]:
import os
import re
import glob
import string
import pathlib
import numpy as np
from tqdm.auto import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as Func
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from Constants import *

In [2]:
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [3]:
output_dir = os.path.join("../outputs", "stackoveflow_query")
os.makedirs(output_dir, exist_ok = True)

print(os.listdir(train_dir))

['csharp', 'java', 'javascript', 'python']


In [4]:
sample_file = os.path.join(train_dir, 'python/1.txt')
with open(sample_file) as f:
    print(f.read())
    
# modify to be able to take more than one and randomly and also one randomly from each class

"how to create an infinite loop with apply_async? i havea pool of processes with apply_async in which the different processes take different times to provide output. once one process is finished i do some calculations with their output. after i want to launch another process. in this way i want to create an infinite loop which launches processes, reads the output of the recently finished process, does some calculations and relaunches another process...so far i have been able to do what i want except that the main process gets stuck in the get() function. this because i don't know which process terminated and hence which entry of results i should do get()...some attempt code:..import multiprocessing as mp.import numpy as np.from time import sleep...def squared(x,y):.    result = np.array((x,x)).    if x%2 == 0:.    sleep(2) .return result.....if __name__ == ""__main__"":..    pool = mp.pool() ..    pool_r = [].    for i in xrange(0,8):.        pool_r.append(pool.apply_async(squared, (i,

In [5]:
def longest_query(data_paths):
    max_length = 0
    for path in data_paths:
        with open(path, 'r') as f:
            text = f.read()
            text = ''.join([character for character in text if character not in string.punctuation])
            text = re.sub('<[^>]+>+', '', text)
            corpus = [word for word in text.split()]
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length

file_paths = []
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "csharp", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "java", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "javascript", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "python", "*.txt")))

longest_query = longest_query(file_paths)
print(f"Longest query: {longest_query} words")

Longest query: 2256 words


In [6]:
def avg_query_length(data_paths):
    query_length = []
    for path in data_paths:
        with open(path, 'r') as f:
            text = f.read()
            text = ''.join([character for character in text if character not in string.punctuation])
            #text = re.sub('<[^>]+>+', '', text)
            corpus = [word for word in text.split()]
        query_length.append(len(corpus))
    return sum(query_length) / len(query_length)

file_paths = []
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "csharp", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "java", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "javascript", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "python", "*.txt")))

average_query_length = avg_query_length(file_paths)
print(f"Average query length: {average_query_length} words")

Average query length: 123.774375 words


In [7]:
MAX_LEN = int(longest_query)
NUM_WORDS = -1
VALIDATION_SPLIT = 0.25
EMBED_DIM = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 35

In [8]:
def word_frequency(file_paths, most_common = None):
    corpus = []
    for path in file_paths:
        with open(path, 'r') as f:
            text = f.read()
            text = re.sub('<[^>]+>+', '', text)
            corpus.extend([word for word in text.split()])
    word_count = Counter(corpus)
    common_words = word_count.most_common(n = most_common)
    return common_words

In [9]:
def word_to_int(input_words, num_words):
    if num_words > -1:
        int_mapping = {w:i + 1 for i, (w, c) in enumerate(input_words) if i <= num_words - 1}
    else:
        int_mapping = {w:i + 1 for i, (w, c) in enumerate(input_words)}
    return int_mapping

In [10]:
class ClassificationBlock(Dataset):
    def __init__(self, file_paths, common_words, int_mapping, max_len):
        self.word_frequency = word_frequency
        self.int_mapping = int_mapping
        self.file_paths = file_paths
        self.max_len = max_len
        
    def text_standardiser(self, input_text):
        text = input_text.lower()
        text = re.sub('<[^>]+>+', '', text)
        text = ''.join([character for character in text if character not in string.punctuation])
        return text
    
    def get_word_vectors(self, int_mapping, file_path):
        with open(file_path, 'r') as f:
            text = f.read()
            text = self.text_standardiser(text)
            corpus = [word for word in text.split()]
        int_vector = [int_mapping[word] for word in text.split() if word in int_mapping]
        return int_vector
    
    def pad_features(self, int_vector, max_len):
        features = np.zeros((1, max_len), dtype = int)
        if len(int_vector) <=  max_len:
            zeros = list(np.zeros(max_len - len(int_vector)))
            new = zeros + int_vector
        else:
            new = int_vector[: max_len]
        features = np.array(new)
        return features
    
    def encode_labels(self, file_path):
        file_path = pathlib.Path(file_path)
        class_label = str(file_path).split(os.path.sep)[-2]
        if class_label == "csharp":
            int_label = 0
        elif class_label == "java":
            int_label = 1
        elif class_label == "javascript":
            int_label = 2
        elif class_label == "python":
            int_label = 3
        return int_label
    
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, index):
        file_path = self.file_paths[index]
        int_vector = self.get_word_vectors(self.int_mapping, file_path)
        padded_features = self.pad_features(int_vector, self.max_len)
        label = self.encode_labels(file_path)
        return {
            "text": torch.tensor(padded_features, dtype = torch.int32),
            "label": torch.tensor(label, dtype = torch.long)
        }

In [11]:
file_paths = []
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "csharp", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "java", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "javascript", "*.txt")))
file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "python", "*.txt")))

test_file_paths = []
test_file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "csharp", "*.txt")))
test_file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "java", "*.txt")))
test_file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "javascript", "*.txt")))
test_file_paths.extend(glob.glob(os.path.join(dataset_dir, "train", "python", "*.txt")))

In [12]:
word_frequency = word_frequency(file_paths)
int_mapping = word_to_int(word_frequency, num_words = NUM_WORDS)

In [13]:
dataset = ClassificationBlock(file_paths, word_frequency, int_mapping, MAX_LEN)
dataset_size = len(dataset)
validation_size = int(VALIDATION_SPLIT * dataset_size)
indices = torch.randperm(len(dataset)).tolist()

train_set = Subset(dataset, indices[:-validation_size])
validation_set = Subset(dataset, indices[-validation_size:])
test_set = ClassificationBlock(test_file_paths, word_frequency, int_mapping, MAX_LEN)
print(f"Training samples: {len(train_set)}")
print(f"Testing samples: {len(test_set)}")
print(f"Validation samples: {len(validation_set)}")

Training samples: 6000
Testing samples: 8000
Validation samples: 2000


In [14]:
print(train_set[0])

{'text': tensor([  0,   0,   0,  ...,   4, 380,  15], dtype=torch.int32), 'label': tensor(0)}


In [15]:
int_to_word = {value: key for key, value in int_mapping.items()}
class_labels = {0: "csharp",
                1: "java",
                2: "javascript",
                3: "python"}

inputs = ''

for query in train_set[0]["text"]:
    if query != 0:
        inputs += f" {int_to_word[int(query)]}"

print(inputs)
#label = class_labels[train_set[0]["label"]]
#print(label)

 how add a list of elements into a list without reference i cannot find an answer because it seems too much specific so heres my issue with can add another list to another one as a clone like new i want to know is how can i add a list into another one without any reference of the wirecoords new coord new i 0 i lt i as soon the list change it also change inside can i fix this


In [16]:
plt.style.use('ggplot')
    
def save_plots(train_accuracy, validation_accuracy, train_loss, validation_loss):
    # Accuracy plot
    plt.figure(figsize = (12, 10))
    plt.plot(train_accuracy, color = 'black', label = 'training accuracy')
    plt.plot(validation_accuracy, color = 'blue', label = 'validation accuracy')
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.savefig(f"../output/accuracy.png")
    
    # Loss plot
    plt.figure(figsize = (12, 10))
    plt.plot(train_loss, color = 'black', label = 'training loss')
    plt.plot(validation_loss, color = 'blue', label = 'validation loss')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(f"../output/loss.png")

In [17]:
train_loader = DataLoader(train_set,
                          batch_size = batch_size,
                          shuffle = True,
                          num_workers = 4)

validation_loader = DataLoader(validation_set,
                               batch_size = batch_size,
                               shuffle = False,
                               num_workers = 4)

test_loader = DataLoader(test_set,
                         batch_size = batch_size,
                         shuffle = False,
                         num_workers = 4)

In [18]:
"""def multi_accuracy(labels, outputs):
    outputs = torch.argmax(outputs, dim = 1)
    running_correct = torch.eq(outputs, labels)
    accuracy = torch.mean(running_correct.float())
    return accuracy"""

'def multi_accuracy(labels, outputs):\n    outputs = torch.argmax(outputs, dim = 1)\n    running_correct = torch.eq(outputs, labels)\n    accuracy = torch.mean(running_correct.float())\n    return accuracy'

In [19]:
def train(model, trainloader, optimizer, criterion):
    model.train()
    print("Training...")
    train_running_loss = 0.0
    train_running_correct = 0
    counter = 0
    
    for i, data in tqdm(enumerate(trainloader), total = len(trainloader)):
        counter += 1
        inputs, labels = data["text"], data["label"]
        inputs = inputs.to(device)
        labels = torch.tensor(labels, dtype = torch.float32).to(device)
        optimizer.zero_grad()
        
        # Forward prop
        outputs = model(inputs)
        outputs = torch.squeeze(outputs, -1)
        
        # Calculating loss
        loss = criterion(outputs, labels)
        train_running_loss += loss.item()
        
        # Calculating accuracy
        _, prediction = torch.max(outputs.data, 1)
        train_running_correct += (prediction == labels).sum().item()
        
        # Backward prop
        loss.backward()

        # Update weights
        optimizer.step()
        
    epoch_loss = train_running_loss / counter
    epoch_accuracy = 100 * (train_running_correct / len(trainloader.dataset))
    
    return epoch_loss, epoch_accuracy

In [20]:
def validate(model, testloader, criterion):
    model.eval()
    print('Validation...')
    validation_running_loss = 0.0
    validation_running_correct = 0
    counter = 0
    
    with torch.no_grad():
        for i, data in tqdm(enumerate(testloader), total = len(testloader)):
            counter += 1
            inputs, labels = data
            inputs = inputs.to(device)
            labels = torch.tensor(labels, dtype = torch.float32).to(device)
            optimizer.zero_grad()
            
            # Forward prop
            outputs = model(inputs)
            outputs = torch.squeeze(outputs, -1)
            
            # Calculating loss
            loss = criterion(outputs, labels)
            validation_running_loss += loss.item()
            
            # Calculating accuracy
            _, prediction = torch.max(outputs.data, 1)
            validation_running_correct += (prediction == labels).sum().item()
            
    epoch_loss = validation_running_loss / counter
    epoch_accuracy = 100 * (validation_running_correct / len(testloader.dataset))
    
    return epoch_loss, epoch_accuracy


In [21]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim = embed_dim)
        self.linear1 = nn.Linear(max_len, 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        bs, _, _ = x.shape
        x = Func.adaptive_avg_pool1d(x, 1).reshape(bs, -1)
        out = self.linear1(x)
        return out

In [22]:
model = EmbeddingModel(len(int_mapping) + 1,
                       MAX_LEN,
                       EMBED_DIM).to(device)
print(model)


EmbeddingModel(
  (embedding): Embedding(187861, 50)
  (linear1): Linear(in_features=2256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

total_parameters = sum(p.numel() for p in model.parameters())
print(f"{total_parameters:,} parameters.")
trainable_parameters = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
print(f"{trainable_parameters:,} training parameters.")


9,395,307 parameters.
9,395,307 training parameters.


In [24]:
%%time

learning_rate = 0.0001
print(f"Running on: {device}")
print(f"Learning rate: {learning_rate}")
print(f"Epochs: {epochs}\n")

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_loss_per_epoch, train_accuracy_per_epoch = train(model,
                                                               train_loader,
                                                               optimizer,
                                                               criterion)
    val_loss_per_epoch, val_accuracy_per_epoch = validate(model,
                                                              validation_loader,
                                                              criterion)
    train_loss.append(train_loss_per_epoch)
    validation_loss.append(val_loss_per_epoch)
    train_accuracy.append(train_accuracy_per_epoch)
    validation_accuracy.append(val_accuracy_per_epoch)
    print(f"Training loss: {train_loss_per_epoch:.3f}, Training accuracy: {train_accuracy_per_epoch:.3f}")
    print(f"Validation loss: {val_loss_per_epoch:.3f}, Validation accuracy: {val_accuracy_per_epoch:.3f}")
    print("~ " * 100)
    
    torch.save(
        model, os.path.join(output_dir, 'model.pth')
    )    
print("Done")

Running on: cpu
Learning rate: 0.0001
Epochs: 35

Epoch 1 of 35
Training...


In [None]:
s

In [None]:
model = torch.load(os.path.join(output_dir, 'model.pth'))

In [None]:
test_loss, test_accuracy = validate(model,
                                    test_loader,
                                    criterion)

print(f"Test accuracy:{test_accuracy}, Test loss: {test_loss}")

In [None]:
def get_int_vector(int_mapping, text):
    corpus = [word for word in text.split()]
    int_vector = [int_mapping[word] for word in text.split() if word in int_mapping]
    return int_vector

def pad_features(int_vector, max_len):
    features = np.zeros((1, max_len), dtype = int)
    if len(int_vector) <= max_len:
        zeros = list(np.zeros)(max_len - len(int_vector))
        new = zeros + int_vector
    else:
        new = int_vector[:max_len]
    features = np.array(new)
    return features

In [None]:
#correct the last part to predict for multiclas