In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from transformers import T5Tokenizer

In [21]:
# Load the CSV file
upload = pd.read_csv("/content/New.csv")

data = upload.iloc[:,0:2]
data = data.sample(frac=1).reset_index(drop=True)

data['Code'] = data['Code'].astype(str)

In [6]:
data.head()

Unnamed: 0,Code,Label
0,"int removeElement(int* nums, int numsSize, int...",1
1,class Solution2 {\npublic:\n vector<vector<...,1
2,class Solution {\npublic:\n vector<int> app...,1
3,//#define _USE_MATH_DEFINES\n//#include <cmath...,0
4,class Solution {\npublic:\n int boxDeliveri...,1


In [22]:
# Initialize the T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
# Tokenize the code column and convert it into vectors
input_ids = []
for code in data['Code']:
    encoded_code = tokenizer.encode(code, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    input_ids.append(encoded_code)

In [24]:
# Convert the list of tensors into a single tensor
input_ids = torch.cat(input_ids, dim=0)

# Prepare the labels
labels = torch.tensor(data['Label'].values)

In [25]:
# Split the data into training and testing sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, test_size=0.15, random_state=42)

In [26]:
# Define the GRU model
class GRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out)
        out = self.sigmoid(out)
        return out



# Initialize the model, loss function, and optimizer
input_size = 512  # Assuming max_length = 512 after tokenization
hidden_size = 128
num_layers = 2
output_size = 1

model = GRUClassifier(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [27]:
# Train the model
epochs = 10
batch_size = 512

for epoch in range(epochs):
    for i in range(0, len(train_inputs), batch_size):
        inputs = train_inputs[i:i+batch_size]
        labels = train_labels[i:i+batch_size]

        optimizer.zero_grad()

        outputs = model(inputs.float())
        loss = criterion(outputs.squeeze(), labels.float())

        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch}/{epochs}], Loss: {loss.item()}')

Epoch [0/10], Loss: 0.6804930567741394
Epoch [1/10], Loss: 0.6457083225250244
Epoch [2/10], Loss: 0.615138590335846
Epoch [3/10], Loss: 0.5734503269195557
Epoch [4/10], Loss: 0.5255307555198669
Epoch [5/10], Loss: 0.4709286391735077
Epoch [6/10], Loss: 0.4096842110157013
Epoch [7/10], Loss: 0.3590719997882843
Epoch [8/10], Loss: 0.3401224911212921
Epoch [9/10], Loss: 0.30857911705970764


In [28]:
# Evaluate the model
with torch.no_grad():
    outputs = model(test_inputs.float())
    predicted = torch.round(outputs).squeeze()
    accuracy = (predicted == test_labels).sum().item() / len(test_labels)
    print(f'Accuracy: {accuracy}')

Accuracy: 0.89
