In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import torch.optim as optim
import numpy as np
import pandas as pd

Loading training and validation datasets 

In [3]:
#Converts.csv file to pandas dataframe format => prepares for Dataloader
class load_data(Dataset): 
    def __init__(self,csvfile): 
        self.data= pd.read_csv(csvfile)
        #Grabs all input features, in this case it is both the subtitle/word input_ids and the _ids: [all rows, all columns from 1 to 1024]
        self.x = self.data.iloc[:-1,1:1025].values
        #Grabs the labels. 1 for mischief, 0 for none : [all rows, just column 1025 for the labels]
        self.y = self.data.iloc[:-1,1025:1027].values

        #we convert the values into tensor.float datatypes
        self.x_train = torch.tensor(self.x, dtype = torch.float32)
        self.y_train = torch.tensor(self.y, dtype = torch.float32)

    def __len__(self): 
        #get length for data set so we can use it for indexing
        return len(self.y_train)
    
    def __getitem__(self, idx):
        #grab (feature,label) pairs based on the index 
        return self.x_train[idx], self.y_train[idx]
        

In [9]:
#convert training and testing to csv
featureset1 = load_data('train.csv')
featureset2 = load_data('val.csv')

#Dataloader does final prep before we pass into the model(seperate into batchesm and shuffles up the data )
training_loader = DataLoader(featureset1,batch_size=64, shuffle=True)
validation_loader = DataLoader(featureset2, batch_size=64, shuffle =True)


Building the model

In [5]:
class Model(nn.Module):
  def __init__(self):
    super(Model,self).__init__()
    #three fully connected layers that result in a final 
    self.fc1 = nn.Linear(1024,192)
    self.fc2 = nn.Linear(192,128)
    self.fc3 = nn.Linear(128,64)
    self.fc4 = nn.Linear(64,2)

  
  def forward(self, input_ids):
    x = input_ids
    x = torch.relu(self.fc1(x))
    x = torch.sigmoid(self.fc2(x))
    x = torch.relu(self.fc3(x))
    x = torch.sigmoid(self.fc4(x))


    return x

In [6]:
model = Model()
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.BCELoss()


In [7]:
def train(model,training_loader, validation_loader, optimizer,num_epochs): 
    for epoch in range(num_epochs):
        training_loss = 0
        for features,labels in training_loader: 
            #outputs the prediction
            outputs = model(features)
            #BCE to generate loss(predictions, true label)
            loss = loss_fn(outputs, labels)
            #Backprop.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            training_loss += loss.item()
            acc = float((outputs.round() == labels).float().mean())


        #validation loss
        validation_loss = 0
        for features,labels in validation_loader:
            val_outputs = model(features)
            val_loss = loss_fn(val_outputs, labels)
            optimizer.zero_grad()
            validation_loss += val_loss.item()
        
        training_loss /= len(training_loader)
        validation_loss /= len(validation_loader)

        print(f"Epoch {epoch}: Training loss {training_loss}, Validation loss {validation_loss}, Acc: {acc}")
       
        

In [8]:
train(model, training_loader, validation_loader, optimizer, 100)

Epoch 0: Training loss 0.6531292498111725, Validation loss 0.6493591964244843, Acc: 0.6666666865348816
Epoch 1: Training loss 0.6495257019996643, Validation loss 0.6390639245510101, Acc: 0.65625
Epoch 2: Training loss 0.6393823325634003, Validation loss 0.6330879628658295, Acc: 0.6979166865348816
Epoch 3: Training loss 0.6313560605049133, Validation loss 0.6280313432216644, Acc: 0.7604166865348816
Epoch 4: Training loss 0.630130261182785, Validation loss 0.6251413524150848, Acc: 0.6875
Epoch 5: Training loss 0.6243195831775665, Validation loss 0.6177098453044891, Acc: 0.6979166865348816
Epoch 6: Training loss 0.6159536838531494, Validation loss 0.6168788075447083, Acc: 0.7916666865348816
Epoch 7: Training loss 0.6142722368240356, Validation loss 0.6074267327785492, Acc: 0.7291666865348816
Epoch 8: Training loss 0.6119742095470428, Validation loss 0.6053817570209503, Acc: 0.6875
Epoch 9: Training loss 0.6002068519592285, Validation loss 0.6024059057235718, Acc: 0.8125
Epoch 10: Training