In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
import pandas as pd
# from sklearn.metrics import accuracy_score
import numpy as np
from utilities import *
import textProcessing as tp

In [9]:
class MyDataset(Dataset):
    """Dataset of (encoded sentence, encoded diacritic labels) tensor pairs.

    Sentences are loaded once with ``load_text``; every item is encoded on
    access and zero-padded or truncated along the sequence axis to ``T`` rows.

    Args:
        path: Text file handed to ``load_text``.
        T: Fixed number of rows (sequence positions) in every returned tensor.
        input_size: Width of the padded sentence encoding; assumed to match
            what ``convert_sentence_to_vector`` emits.
        output_size: Width of the padded label encoding; assumed to match
            what ``convert_labels_to_vector`` emits.
    """

    def __init__(self, path="dataset/train_preprocessed.txt", T=100,
                 input_size=38, output_size=15):
        self.data = load_text(path)
        # Not used by __getitem__; kept so code reading `.transform` still works.
        self.transform = ToTensor()
        self.T = T
        self.input_size = input_size
        self.output_size = output_size

    def __len__(self):
        """Return the number of loaded sentences."""
        return len(self.data)

    @staticmethod
    def _fit_length(matrix, length, width):
        """Zero-pad or truncate ``matrix`` along axis 0 to exactly ``length`` rows."""
        matrix = np.asarray(matrix)
        rows = matrix.shape[0]
        if rows >= length:
            return matrix[:length, :]
        padded = np.zeros((length, width))
        # Skip the copy for an empty encoding, whose shape may not be 2-D.
        if rows:
            padded[:rows, :] = matrix
        return padded

    def __getitem__(self, idx):
        """Return ``(sentence, labels)`` as float32 tensors with ``T`` rows each."""
        raw_sentence = self.data[idx]

        # Labels must be extracted before the diacritics are stripped.
        labels = tp.extract_diacritics_with_previous_letter(raw_sentence)
        sentence = tp.clear_diacritics(raw_sentence)

        # Encode text and labels as 2-D arrays (one row per position).
        sentence = convert_sentence_to_vector(sentence)
        labels = convert_labels_to_vector(labels)

        # Bring both to a fixed length so the default collate can stack them.
        sentence = self._fit_length(sentence, self.T, self.input_size)
        labels = self._fit_length(labels, self.T, self.output_size)

        sentence = torch.tensor(sentence, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        return sentence, labels


In [10]:
class RNNClassifier(nn.Module):
    """Single-layer RNN with a two-layer head applied at every time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of values produced per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size

        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.out = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.Tanh(),
            nn.Linear(64, output_size),
            # Normalise over the class axis. With no explicit dim PyTorch
            # falls back to dim 0 for 3-D input, i.e. across the batch.
            # NOTE(review): nn.CrossEntropyLoss expects raw logits; feeding
            # it these probabilities applies softmax twice.
            nn.Softmax(dim=-1),
        )

    def forward(self, input, hidden=None):
        """Return per-step class probabilities.

        Args:
            input: Tensor of shape ``(batch, seq, input_size)``.
            hidden: Optional initial hidden state of shape
                ``(1, batch, hidden_size)``; ``nn.RNN`` uses zeros when None.

        Returns:
            Tensor of shape ``(batch, seq, output_size)`` whose last axis
            sums to 1.
        """
        output, _ = self.rnn(input, hidden)
        return self.out(output)

    def init_hidden(self, batch_size):
        """Return a zero hidden state on the same device/dtype as the model."""
        reference = next(self.parameters())
        return torch.zeros(
            1, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype,
        )

In [11]:
# Model dimensions: features per time step, RNN hidden units, outputs per step
input_size, hidden_size, output_size = 38, 64, 15

# Training schedule: samples per mini-batch and passes over the training set
batch_size, num_epochs = 1024, 5

In [14]:
# Seed the RNGs so weight initialisation and DataLoader shuffling are repeatable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the first CUDA GPU when available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.is_available())
print(torch.cuda.device_count())

# Create the RNN classifier and move its parameters to the selected device
model = RNNClassifier(input_size, hidden_size, output_size)
model.to(device)

# Training data; MyDataset loads its own preprocessed text file
dataset = MyDataset()

# Batches are reshuffled at the start of every epoch
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# TODO: no test split is built here, yet the evaluation cell iterates over a
# `test_dataloader`; create it before running that cell.

# Loss function and optimizer
# NOTE(review): CrossEntropyLoss expects unnormalised logits, while the model
# ends in a Softmax layer - confirm this pairing is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

cpu
False
0


In [13]:
# Training loop
model.train()
for epoch in range(num_epochs):
    loss = None
    for inputs, labels in train_dataloader:
        # Use the actual batch length: the last batch is smaller than
        # batch_size whenever len(dataset) is not a multiple of it.
        current_batch = inputs.size(0)

        # Shape both tensors as (batch, seq_length, features) on the device
        inputs = inputs.view(current_batch, -1, input_size).to(device)
        labels = labels.view(current_batch, -1, output_size).to(device)
        hidden = model.init_hidden(batch_size=current_batch).to(device)

        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        output = model(inputs, hidden)

        # CrossEntropyLoss treats dim 1 as the class axis, so collapse
        # (batch, seq, classes) into (batch * seq, classes) first.
        loss = criterion(
            output.reshape(-1, output_size),
            labels.reshape(-1, output_size),
        )

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Report the loss of the last batch of the epoch (skipped if no batches ran)
    if loss is not None:
        print(f"Epoch: {epoch+1}, Batch Loss: {loss.item()}")

KeyboardInterrupt: 

In [None]:
# Set the model to evaluation mode
model.eval()

# NOTE(review): `test_dataloader` is not defined by any executed code in this
# notebook; it must be created before this cell. It is assumed to yield the
# same (inputs, one-hot labels) batches as the training loader - confirm.
predictions = []
ground_truth = []
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass; forward() needs an initial hidden state per batch
        hidden = model.init_hidden(inputs.size(0)).to(device)
        outputs = model(inputs, hidden)

        # Classes live on the last axis of (batch, seq, classes)
        predicted = outputs.argmax(dim=-1)
        target = labels.argmax(dim=-1)

        # Drop all-zero label rows, assumed to be the zero padding added by
        # the dataset (real rows are presumed one-hot) - TODO confirm
        valid = labels.sum(dim=-1) > 0

        predictions += predicted[valid].tolist()
        ground_truth += target[valid].tolist()

# Convert lists to numpy arrays
predictions = np.array(predictions)
ground_truth = np.array(ground_truth)

In [None]:
# Calculate accuracy as the fraction of positions where prediction == target.
# Computed with NumPy because the sklearn `accuracy_score` import is commented out.
predictions = np.asarray(predictions)
ground_truth = np.asarray(ground_truth)
assert predictions.shape == ground_truth.shape, "predictions and ground_truth must align"
accuracy = float(np.mean(predictions == ground_truth)) if predictions.size else float("nan")
print(f"Accuracy: {accuracy}")