## Preprocess

In [13]:
import _csv
import csv
import sys
from typing import List

# Add the parent directory to the module search path so the local `preprocessing` package can be imported.
sys.path.append("..")
from preprocessing.preprocessor import Preprocessor

# Control which linguistic preprocessing steps should run.
preprocessor = Preprocessor(perform_case_folding=True,
                            remove_stop_words=True,
                            remove_punctuation=False,
                            perform_lemmatization=False,
                            perform_stemming=False)

# Maximum number of CSV rows to read. Raise it (or drop the check below) to
# process more of the training file.
MAX_ROWS = 11

token_lists: List[List[str]] = []
labels: List[int] = []

# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields that contain embedded newlines.
with open('../raw_data/fulltrain.csv', newline='') as training_dataset:
    reader = csv.reader(training_dataset)
    row: List[str]
    for row_index, row in enumerate(reader):
        if row_index >= MAX_ROWS:
            break
        # Column 0 is the label, column 1 is the raw document text. The label
        # is converted here so `labels` really holds ints, as annotated.
        labels.append(int(row[0]))
        document: str = row[1]
        token_list: List[str] = preprocessor.process(document)
        token_lists.append(token_list)

## Feature Extraction

In [5]:
import functools

import gensim.downloader

# Identifier of the pretrained embedding set requested from gensim's downloader.
pretrained_model_name = 'glove-wiki-gigaword-50'
word_vectors = gensim.downloader.load(pretrained_model_name)

In [14]:
# Number of components in each document vector; tokens without an embedding
# contribute a zero vector of this length, i.e. nothing, to the sum.
EMBEDDING_SIZE = 50

inputs: List[List[float]] = []

# Represent each document as the element-wise sum of its token embeddings.
for token_list in token_lists:
    # Starting from an explicit zero vector means a document with no tokens
    # yields all zeros instead of raising, and the running sum is built
    # eagerly rather than as a chain of nested lazy iterators.
    document_vector: List[float] = [0.0] * EMBEDDING_SIZE
    for token in token_list:
        if token not in word_vectors:
            continue
        document_vector = [total + float(component)
                           for total, component in zip(document_vector, word_vectors[token])]
    inputs.append(document_vector)

# Make sure every label is an int (they may have been read from the CSV as strings).
labels = list(map(int, labels))

## Training

### Model class

In [17]:
import torch

class Simple_NN(torch.nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of the hidden layer. Defaults to 20.
        num_classes: Number of output scores produced per input. Defaults to 4.
    """

    def __init__(self, input_size: int, hidden_size: int = 20, num_classes: int = 4):
        super().__init__()
        self.lin1 = torch.nn.Linear(input_size, hidden_size)
        self.lin2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) output scores for `input`.

        Args:
            input: Tensor whose last dimension has `input_size` features.

        Returns:
            Tensor with the same leading dimensions as `input` and a last
            dimension of `num_classes`. No softmax is applied.
        """
        hidden = torch.nn.functional.relu(self.lin1(input))
        return self.lin2(hidden)

### Training Loop

In [20]:
# Convert the feature vectors and labels into tensors. torch.as_tensor hands
# back its argument unchanged when it is already a tensor of the requested
# dtype, so re-running this cell does not copy-construct from a tensor.
inputs = torch.as_tensor(inputs, dtype=torch.float32)
# NOTE(review): CrossEntropyLoss expects integer class indices in
# [0, num_classes) — confirm the CSV labels are zero-based before using it.
labels = torch.as_tensor(labels, dtype=torch.long)
dataset_size = inputs.size(dim=0)  # number of documents
input_size = inputs.size(dim=1)    # number of features per document

net = Simple_NN(input_size)
criterion = torch.nn.CrossEntropyLoss()
batch_size = 1000
learning_rate = 0.01

def test_eval():
    """Placeholder for evaluation logic; currently does nothing and returns None."""
    return None

# Build the optimizer once: re-creating Adam every epoch would throw away its
# running moment estimates. Only the learning rate is refreshed per epoch.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

for epoch in range(10):

    # Update learning rate here

    # Push the (possibly updated) learning rate into the existing optimizer.
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    # Fresh random ordering of every sample index for this epoch.
    shuffled_indices = torch.randperm(dataset_size)

    # NOTE(review): no loss, backward pass or optimizer step happens yet; the
    # loop only prints the network's output for every sample. `criterion`,
    # `batch_size` and `shuffled_indices` are not used so far.
    # The loop variable is deliberately not called `input`, which would shadow
    # the builtin for the rest of the kernel session.
    for sample in inputs:
        print(net(sample.float()))
