## Preprocess

In [13]:
import _csv
import csv
import sys
from typing import List

# Change system path to base directory.
sys.path.append("..")
from preprocessing.preprocessor import Preprocessor

# Control which linguistic preprocessing steps should run.
preprocessor = Preprocessor(perform_case_folding=True,
                            remove_stop_words=True,
                            remove_punctuation=False,
                            perform_lemmatization=False,
                            perform_stemming=False)

# Development cap on how many CSV rows are read (the first 11, as before).
# Set to None to read the whole training file.
MAX_TRAIN_ROWS = 11

# One token list per document, in file order.
token_lists: List[List[str]] = []
# Class label of each document exactly as numbered in column 0 of the CSV
# (parsed to int here so the annotation is truthful; shifting to zero-based
# indices happens in the feature-extraction cell).
labels: List[int] = []

# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are handed to the reader untouched.
with open('../raw_data/fulltrain.csv', newline='') as training_dataset:
    reader = csv.reader(training_dataset)
    row: List[str]
    for row_index, row in enumerate(reader):
        if MAX_TRAIN_ROWS is not None and row_index >= MAX_TRAIN_ROWS:
            break
        # Column 0 holds the label, column 1 the raw document text.
        labels.append(int(row[0]))
        token_lists.append(preprocessor.process(row[1]))

## Feature Extraction

In [14]:
import functools

import gensim.downloader

# Load the pretrained 'glove-wiki-gigaword-50' word embeddings through gensim's
# downloader API. The result is queried below with `token in word_vectors` and
# `word_vectors[token]`.
# NOTE(review): later cells substitute a 50-element zero vector for unknown
# tokens, so these embeddings are presumably 50-dimensional — confirm before
# swapping in a different model name.
word_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [15]:
# Width of the fallback zero vector used for out-of-vocabulary tokens; it has
# to match the width of the vectors returned by `word_vectors`.
EMBEDDING_DIM = 50

# One summed embedding per document, in the same order as `token_lists`.
inputs: List[List[float]] = []

for token_list in token_lists:
    # Look up each token; unknown tokens contribute an all-zero vector.
    vectors = [word_vectors[token] if token in word_vectors else [0.0] * EMBEDDING_DIM
               for token in token_list]
    if vectors:
        # Component-wise sum over all token vectors. zip(*vectors) groups the
        # k-th component of every vector, so each sum is evaluated eagerly
        # instead of building a chain of nested lazy iterators as deep as the
        # document is long.
        vector = [sum(components) for components in zip(*vectors)]
    else:
        # A document with no tokens left after preprocessing would make a
        # reduce() without initial value raise; represent it as a zero vector.
        vector = [0.0] * EMBEDDING_DIM
    inputs.append(vector)

# Shift labels down by one so they can be used as zero-based class indices.
# NOTE: this rebinds `labels`; re-running this cell without re-running the
# preprocessing cell shifts the labels again.
labels = [int(label) - 1 for label in labels]

## Training

### Model class

In [16]:
import torch

class Simple_NN(torch.nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    The hidden layer has 20 units and the output layer produces 4 raw
    (un-normalised) scores per input row.
    """

    def __init__(self, input_size: int):
        """Create the two linear layers.

        Args:
            input_size: Number of features in each input vector.
        """
        super().__init__()
        hidden_units, num_outputs = 20, 4
        self.lin1 = torch.nn.Linear(input_size, hidden_units)
        self.lin2 = torch.nn.Linear(hidden_units, num_outputs)

    def forward(self, input: torch.Tensor):
        """Return the output-layer scores for `input`.

        Args:
            input: Tensor whose last dimension equals the `input_size` given
                to the constructor.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 4.
        """
        hidden = torch.nn.functional.relu(self.lin1(input))
        return self.lin2(hidden)

### Training Loop

In [None]:
import numpy as np

# Convert features and labels to tensors. `as_tensor` accepts both plain lists
# and existing tensors, so re-running this cell does not fail or re-wrap them.
# CrossEntropyLoss needs the targets as a tensor of integer class indices; a
# plain Python int (what indexing a list yields) is rejected.
inputs = torch.as_tensor(inputs, dtype=torch.float32)
labels = torch.as_tensor(labels, dtype=torch.long)
dataset_size = inputs.size(dim = 0)
input_size = inputs.size(dim = 1)

net = Simple_NN(input_size)
criterion = torch.nn.CrossEntropyLoss()
batch_size = 1000
learning_rate = 0.01

# Build the optimizer once: re-creating Adam every epoch throws away its
# running moment estimates and effectively restarts the optimizer each time.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

for epoch in range(10):

    # Update learning rate here: change `learning_rate` (e.g. apply a decay
    # schedule) and it is pushed into the existing optimizer below.
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    # Visit the samples in a fresh random order every epoch.
    shuffled_indices = torch.randperm(dataset_size)

    # Mini-batch updates of at most `batch_size` samples each; the last batch
    # of an epoch may be smaller.
    for batch_start in range(0, dataset_size, batch_size):
        batch_indices = shuffled_indices[batch_start:batch_start + batch_size]

        optimizer.zero_grad()

        scores = net(inputs[batch_indices])
        loss = criterion(scores, labels[batch_indices])
        loss.backward()

        optimizer.step()

# Predicted class index (position of the highest score) for every training
# sample, as a plain list of ints in dataset order.
with torch.no_grad():
    outputs = net(inputs).argmax(dim=1).tolist()

def eval_accuracy(predictions, answers):
    """Return the fraction of predictions that equal the corresponding answer.

    Args:
        predictions: Indexable sequence of predicted values.
        answers: Indexable sequence of expected values; element ``i`` is
            compared with ``predictions[i]``. It must be at least as long as
            ``predictions``.

    Returns:
        ``matches / len(predictions)`` as a float, or ``0.0`` when
        ``predictions`` is empty (instead of dividing by zero).
    """
    size = len(predictions)
    if size == 0:
        # Nothing to score; avoid ZeroDivisionError.
        return 0.0
    num_correct = sum(1 for i in range(size) if predictions[i] == answers[i])
    return num_correct / size
# Accuracy on the training samples. `torch.as_tensor(...).tolist()` yields a
# plain list of ints whether `labels` is still a Python list or already a
# tensor; calling `.numpy()` directly fails when it is a list.
print(eval_accuracy(outputs, torch.as_tensor(labels).tolist()))


### Evaluating on test set

In [None]:
# Development cap on how many test rows are read (the first 11, as before).
# Set to None to evaluate on the whole test file.
MAX_TEST_ROWS = 11
# Width of the fallback zero vector used for out-of-vocabulary tokens; it has
# to match the width of the vectors returned by `word_vectors`.
TEST_EMBEDDING_DIM = 50

# One token list and one zero-based class index per test document.
test_token_lists: List[List[str]] = []
test_labels: List[int] = []

# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are handed to the reader untouched.
with open('../raw_data/balancedtest.csv', newline='') as test_dataset:
    reader = csv.reader(test_dataset)
    row: List[str]
    for row_index, row in enumerate(reader):
        if MAX_TEST_ROWS is not None and row_index >= MAX_TEST_ROWS:
            break
        # Column 0 holds the label (shifted to a zero-based index here),
        # column 1 the raw document text.
        test_labels.append(int(row[0]) - 1)
        test_token_lists.append(preprocessor.process(row[1]))

# Summed word embedding per test document, in the same order as the labels.
test_inputs: List[List[float]] = []

for token_list in test_token_lists:
    # Look up each token; unknown tokens contribute an all-zero vector.
    vectors = [word_vectors[token] if token in word_vectors else [0.0] * TEST_EMBEDDING_DIM
               for token in token_list]
    if vectors:
        # Eager component-wise sum over all token vectors.
        vector = [sum(components) for components in zip(*vectors)]
    else:
        # A document with no tokens would make reduce() without an initial
        # value raise; represent it as a zero vector instead.
        vector = [0.0] * TEST_EMBEDDING_DIM
    test_inputs.append(vector)

test_inputs = torch.tensor(test_inputs, dtype=torch.float32)

# Predicted class index (position of the highest score) per test document.
with torch.no_grad():
    test_outputs = [int(net(features).argmax()) for features in test_inputs]
print(eval_accuracy(test_outputs, test_labels))