# Imports

In [None]:
import _csv
import csv
import sys
from typing import List
import torch
import numpy as np
import functools

import gensim.downloader
word_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

# Add the parent directory to the module search path so the `preprocessing` package can be imported.
sys.path.append("..")
from preprocessing.preprocessor import Preprocessor

## Preprocess

In [None]:
# Shared preprocessor instance; each flag below switches one linguistic
# preprocessing step on or off.
preprocessor = Preprocessor(
    perform_case_folding=True,
    remove_stop_words=True,
    remove_punctuation=False,
    perform_lemmatization=False,
    perform_stemming=False,
)

def parse_dataset(path: str, limit: int = 0):
    """Read a labelled CSV dataset and tokenise every document in it.

    Column 0 of each row is parsed as an integer label and column 1 is taken
    as the raw document text, which is passed through the module-level
    ``preprocessor``.

    Args:
        path: Location of the CSV file.
        limit: Maximum number of (non-blank) rows to read. A value of zero or
            less means "read everything".

    Returns:
        A ``(token_lists, labels)`` tuple of equal-length lists. Labels are
        shifted down by one (presumably from 1-based class ids in the file to
        0-based ids -- confirm against the dataset).

    Raises:
        ValueError: If a label cell cannot be parsed as an integer.
        IndexError: If a non-blank row has fewer than two columns.
    """
    token_lists: List[List[str]] = []
    labels: List[int] = []

    # The csv module documentation requires files to be opened with
    # newline="" so that the reader, not the text layer, interprets line
    # endings; otherwise newlines embedded in quoted fields can be mangled.
    with open(path, newline="") as dataset:
        # csv.reader yields [] for completely blank lines; skip those instead
        # of crashing on row[0].
        rows = (row for row in csv.reader(dataset) if row)
        for row_count, row in enumerate(rows):
            if limit > 0 and row_count == limit:
                break
            labels.append(int(row[0]) - 1)
            token_lists.append(preprocessor.process(row[1]))
    return (token_lists, labels)

# Load the training and test splits, reading at most 10 rows from each file.
token_lists, labels = parse_dataset(path='../raw_data/fulltrain.csv', limit=10)
test_token_lists, test_labels = parse_dataset(path='../raw_data/balancedtest.csv', limit=10)

## Feature Extraction

In [None]:
def extract_features(token_lists, embeddings=None, vector_size=None):
    """Turn each token list into one feature vector by summing word vectors.

    Tokens that are not present in ``embeddings`` contribute nothing (they are
    treated as all-zero vectors). A document with no tokens at all therefore
    maps to the zero vector instead of raising.

    Args:
        token_lists: Iterable of documents, each an iterable of string tokens.
        embeddings: Mapping-like object supporting ``token in embeddings`` and
            ``embeddings[token]``. Defaults to the module-level
            ``word_vectors``.
        vector_size: Dimensionality of the vectors. When omitted it is read
            from ``embeddings.vector_size`` if that attribute exists, and
            otherwise falls back to 50 (the size used by the original code).

    Returns:
        A list with one ``vector_size``-long list of floats per document.
    """
    if embeddings is None:
        embeddings = word_vectors
    if vector_size is None:
        vector_size = getattr(embeddings, "vector_size", 50)

    inputs: List[List[float]] = []
    for token_list in token_lists:
        # Accumulate in a NumPy buffer: avoids both the empty-sequence failure
        # of reduce() without an initial value and the chain of nested lazy
        # map objects that grew with the number of tokens.
        total = np.zeros(vector_size, dtype=np.float64)
        for token in token_list:
            if token in embeddings:
                total += np.asarray(embeddings[token], dtype=np.float64)
        inputs.append(total.tolist())
    return inputs

# Build one feature vector per document for the training and the test split.
inputs = extract_features(token_lists=token_lists)
test_inputs = extract_features(token_lists=test_token_lists)


## Training

### Model class

In [None]:
class Simple_NN(torch.nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the second linear layer; no
    softmax or other normalisation is applied inside the module.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer (defaults to 20).
        num_classes: Number of output scores (defaults to 4).
    """

    def __init__(self, input_size: int, hidden_size: int = 20, num_classes: int = 4):
        super().__init__()
        self.lin1 = torch.nn.Linear(input_size, hidden_size)
        self.lin2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Map ``input`` (last dimension ``input_size``) to per-class scores."""
        hidden = torch.nn.functional.relu(self.lin1(input))
        return self.lin2(hidden)

### Training Loop

In [None]:
# Stack the per-document feature vectors into one float tensor; row i is the
# feature vector of document i. as_tensor also tolerates the cell being re-run
# when `inputs` is already a tensor.
inputs = torch.as_tensor(inputs, dtype=torch.float32)
dataset_size = inputs.size(dim = 0)
input_size = inputs.size(dim = 1)

# Build the target tensor once instead of wrapping every label on every step.
# `labels` itself is left untouched because later cells still use the list.
label_tensor = torch.as_tensor(labels, dtype=torch.long)

net = Simple_NN(input_size)
criterion = torch.nn.CrossEntropyLoss()
batch_size = 1000
learning_rate = 0.01
num_epochs = 10

# Create the optimizer once: re-creating Adam every epoch throws away its
# running moment estimates.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

for epoch in range(num_epochs):

    # Update learning rate here (assign a new value to `learning_rate`);
    # it is pushed into the existing optimizer below.
    for param_group in optimizer.param_groups:
        param_group["lr"] = learning_rate

    # Visit the samples in a fresh random order every epoch.
    shuffled_indices = torch.randperm(dataset_size)

    # Mini-batch updates: each step uses up to `batch_size` samples, and the
    # loss is averaged over the batch (CrossEntropyLoss default reduction).
    for batch_start in range(0, dataset_size, batch_size):
        batch_indices = shuffled_indices[batch_start:batch_start + batch_size]

        optimizer.zero_grad()

        scores = net(inputs[batch_indices])
        loss = criterion(scores, label_tensor[batch_indices])
        loss.backward()

        optimizer.step()

def get_outputs(net, inputs):
    """Return the predicted class index for every sample in ``inputs``.

    Each sample is cast to float, passed through ``net`` with gradient
    tracking disabled, and reduced to the index of its largest score.
    """
    with torch.no_grad():
        predictions: List[int] = [
            np.argmax(net(sample.float()).numpy()) for sample in inputs
        ]
    return predictions

def eval_accuracy(predictions, answers):
    """Return the fraction of positions where prediction and answer agree.

    Only the first ``len(predictions)`` entries of ``answers`` are compared.
    """
    total = len(predictions)
    hits = sum(1 for idx in range(total) if predictions[idx] == answers[idx])
    return hits / total

# Accuracy of the trained network on the data it was trained on.
outputs = get_outputs(net=net, inputs=inputs)
print(eval_accuracy(predictions=outputs, answers=labels))


### Evaluating on test set

In [None]:
# Accuracy of the trained network on the held-out test split.
test_outputs = get_outputs(net=net, inputs=torch.tensor(test_inputs))
print(eval_accuracy(predictions=test_outputs, answers=test_labels))