# Train Queryable Input Classifier
This notebook trains an LSTM model that classifies whether a user's input is queryable, i.e. whether keywords should be extracted from it and looked up online via the citation fetcher class.

In [1]:
from datasets import Dataset, DatasetDict

import const
import datasets
import pandas as pd
import torchtext
import torch
import torch.nn as nn
import random
import spacy
import subprocess

In [2]:
# Build the tokenizer through torchtext's spaCy backend. It is handed to the
# dataset `map` call later in the notebook to split each input string into tokens.
tokenizer = torchtext.data.utils.get_tokenizer('spacy')



In [3]:
# CSV files for the training and testing splits, located inside the datasets
# folder configured in the project's `const` module.
TRAINING_DS_PATH = const.DATASETS_FOLDER + "tqi_training.csv"
TESTING_DS_PATH = const.DATASETS_FOLDER + "tqi_testing.csv"

# Column whose text is tokenized into the model input.
FEATURE_COL = "input"
# Column used as the label (presumably a boolean/0-1 flag; confirm against the CSV).
LABEL_COL = "is_queryable"

In [4]:
# Training hyperparameters.
# NOTE(review): none of the constants in this cell are referenced by the cells
# shown in this notebook export; `train` uses its own defaults (epochs=10,
# lr=0.001) unless these values are passed explicitly.
CORPUS_SIZE = 25000
LEARNING_RATE = 1e-4
BATCH_SIZE = 128
EPOCHS = 5

# Model dimensions; the names mirror the QIClassifier constructor parameters
# (embedding_dim, hidden_dim, output_dim).
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
OUTPUT_DIM = 1

In [5]:
# Load the training and testing splits from their CSV files.
training_df = pd.read_csv(TRAINING_DS_PATH)
testing_df = pd.read_csv(TESTING_DS_PATH)

# Shuffle the training rows. The index is reset afterwards because
# `Dataset.from_pandas` stores any index that is not a plain RangeIndex as an
# extra `__index_level_0__` column, which would otherwise leak into the dataset.
training_df = training_df.sample(frac=1).reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/tqi_training.csv'

In [None]:
# Wrap the pandas DataFrames as Hugging Face `Dataset` objects.
training_ds = Dataset.from_pandas(training_df)
# `validation_df` was never defined in this notebook (NameError). The only
# other frame loaded is `testing_df`, so it backs the held-out split here.
# NOTE(review): confirm that evaluating on the testing CSV is intended, or
# carve a separate validation split out of `training_df` instead.
validation_ds = Dataset.from_pandas(testing_df)

In [None]:
# Ask cuDNN to pick deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

In [None]:
def tokenizer_funct(input_sample, tokenizer):
    """Tokenize the feature text of one example and terminate it with '<eos>'.

    The end-of-sequence marker is appended as its own token AFTER tokenizing.
    Concatenating "<eos>" onto the raw string first (as the original lambda
    did) glues it to the last word and lets the tokenizer split it like
    ordinary text, so the marker never shows up as a single token.
    """
    return {'tokens': list(tokenizer(input_sample[FEATURE_COL])) + ['<eos>']}


def label_tokenizer_funct(input_sample, tokenizer):
    """Tokenize the label of one example and terminate it with '<eos>'.

    The label is passed through `str()` first so a non-string label does not
    raise on tokenization (assumes the label column may be numeric/boolean;
    TODO confirm). Not used by any cell shown in this notebook.
    """
    return {'label_tokens': list(tokenizer(str(input_sample[LABEL_COL]))) + ['<eos>']}


# Add a 'tokens' column to every training example.
training_ds = training_ds.map(tokenizer_funct, fn_kwargs={'tokenizer': tokenizer})

In [None]:
# Spot-check the tokenization of a single training example (row 88).
training_ds[88]['tokens']

In [None]:
# Build the vocabulary from the training tokens, dropping tokens seen fewer
# than 3 times. The factory function is `build_vocab_from_iterator` (the
# original `buildvocab_from_iterator` does not exist in torchtext.vocab).
# Passing the special tokens through `specials` puts '<unk>' at index 0 and
# '<eos>' at index 1, the same positions the insert_token calls targeted, and
# unlike `insert_token` it does not fail when a special token already occurs
# in the training tokens.
vocab = torchtext.vocab.build_vocab_from_iterator(
    training_ds['tokens'],
    min_freq=3,
    specials=['<unk>', '<eos>'],
)
# Out-of-vocabulary tokens map to '<unk>' instead of raising.
vocab.set_default_index(vocab['<unk>'])
print(vocab.get_itos()[:10])

In [None]:
class QIClassifier(nn.Module):
    """LSTM classifier that scores a sequence of token indices.

    The input is embedded, run through a 2-layer LSTM, and the LSTM output at
    the last time step is projected to `output_dim` logits.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # nn.Sigmoid is element-wise and takes no constructor arguments; the
        # previous `nn.Sigmoid(dim=1)` raised TypeError on instantiation.
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Return `(logits, probabilities)` for a batch of token-index sequences.

        Args:
            text: Integer tensor of shape (batch, seq_len); the LSTM is built
                with `batch_first=True`.

        Returns:
            Tuple of the raw logits, shape (batch, output_dim), and the same
            values passed through a sigmoid.
        """
        embedded_text = self.embedding(text)
        # The second value returned by the LSTM is the (h_n, c_n) state pair,
        # which is not needed here.
        sequence_output, _ = self.rnn(embedded_text)
        # Classify from the output at the final time step only.
        logits = self.fc(sequence_output[:, -1, :])
        return logits, self.sigmoid(logits)
        

In [None]:
def get_data(ds, vocab, batch_size):
    """Map every token to its vocabulary index and fold the stream into rows.

    The tokens of all examples are concatenated in order into one flat
    sequence of indices, trailing indices that do not fill a complete column
    are dropped, and the result is reshaped to `batch_size` rows.

    Args:
        ds: Iterable of examples, each a mapping with a 'tokens' list.
            Examples whose token list is empty are skipped.
        vocab: Mapping from token to integer index (subscriptable by token).
        batch_size: Number of rows in the returned tensor; must be >= 1.

    Returns:
        A `torch.long` tensor of shape (batch_size, num_batches), where
        num_batches = total_tokens // batch_size.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1 but got " + str(batch_size))

    token_ids = []
    for example in ds:
        tokens = example['tokens']
        if tokens:
            token_ids.extend(vocab[token] for token in tokens)

    data = torch.tensor(token_ids, dtype=torch.long)
    # Columns per row. The original used the full token count here, so the
    # trim below was a no-op and `view` failed for any batch_size other than 1.
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    return data.view(batch_size, num_batches)

In [None]:
# Index every token and fold the stream into 8 rows. The original call wrapped
# all three arguments in one subscript (`training_ds['tokens', vocab, 8]`), so
# get_data received a single argument. get_data iterates the examples and
# reads each example's 'tokens' itself, so the dataset is passed whole.
# NOTE(review): BATCH_SIZE (128) is defined earlier but 8 is hard-coded here;
# confirm which is intended.
training_ds = get_data(training_ds, vocab, 8)

In [None]:
# Display the value assigned from get_data in the previous cell.
training_ds

In [None]:
def get_batch(feat_data, seq_len, batch_size, batch_id):
    """Slice one window of `seq_len` columns out of `feat_data`.

    NOTE(review): this function was left unfinished in the notebook. The
    label expression was missing (`label = `, a SyntaxError) and the slice
    used an undefined `idx`. It now parses and fails loudly when called
    instead of breaking the cell or returning wrong labels.

    Args:
        feat_data: 2-D tensor sliced along its second dimension.
        seq_len: Number of columns per window.
        batch_size: Currently unused; kept so the signature is unchanged.
        batch_id: Zero-based index of the window to extract.

    Raises:
        NotImplementedError: Always, until the label source is defined.
    """
    # Column offset of the requested window.
    # ASSUMPTION: windows are consecutive and non-overlapping, so window
    # `batch_id` starts at batch_id * seq_len. TODO confirm.
    idx = batch_id * seq_len
    feature = feat_data[:, idx:idx + seq_len]
    # Where the labels come from is not defined anywhere in this notebook, so
    # the (feature, label) pair cannot be completed yet.
    raise NotImplementedError(
        "get_batch: label extraction is not implemented yet "
        "(feature window shape: " + str(tuple(feature.shape)) + ")"
    )

In [None]:
def train(model, training_ds, epochs=10, device="cpu", epoch_timestamp=1, lr=0.001):
    """Train `model` with SGD on batches of token indices and binary labels.

    Fixes over the original loop: it iterated an undefined `ds` through
    `enumerate` (yielding (index, item) rather than (X, y)), called an
    undefined `loss`, cast token indices to float64 before the embedding
    lookup, never moved the model to `device`, and only validated
    `epoch_timestamp` after `i % epoch_timestamp` could already have raised
    ZeroDivisionError.

    Args:
        model: Module whose forward pass returns `(logits, probabilities)`
            with one logit per label, as QIClassifier does.
        training_ds: Re-iterable collection of `(X_batch, y_batch)` pairs.
            `X_batch` holds integer token indices and `y_batch` holds 0/1
            labels with as many elements as the model emits logits.
        epochs: Number of passes over `training_ds`.
        device: Device the model and the batches are moved to.
        epoch_timestamp: Every `epoch_timestamp`-th epoch keeps its progress
            line on screen; the lines in between are overwritten in place.
            Must be at least 1.
        lr: SGD learning rate.

    Raises:
        ValueError: If `epoch_timestamp` is smaller than 1.
    """
    if epoch_timestamp < 1:
        raise ValueError("Expected epoch_timestamp parameter to be a positive number but got " + str(epoch_timestamp))

    # The model emits raw logits for a yes/no decision, so binary
    # cross-entropy on logits is used (assumes binary 0/1 labels; TODO
    # confirm against the label column).
    loss_fn = nn.BCEWithLogitsLoss()
    model.to(device)
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        running_loss = 0.0
        batch_count = 0
        for X_batch, y_batch in training_ds:
            # Embedding lookups need integer indices.
            X_tensor = torch.as_tensor(X_batch, dtype=torch.long, device=device)
            optimizer.zero_grad()
            logits, _ = model(X_tensor)
            # BCE targets must be floating point and shaped like the logits.
            Y_tensor = torch.as_tensor(y_batch, device=device).to(logits.dtype)
            cost = loss_fn(logits, Y_tensor.reshape(logits.shape))
            cost.backward()
            optimizer.step()
            running_loss += cost.item()
            batch_count += 1

        if batch_count == 0:
            # Nothing was trained this epoch, so there is no loss to report.
            continue

        # Report once per epoch (mean batch loss) instead of once per batch.
        if (epoch + 1) % epoch_timestamp == 0:
            end_line = "\n"
        else:
            # Carriage return so the next progress line overwrites this one.
            end_line = "                        \r"
        mean_loss = running_loss / batch_count
        print("Epoch " + str(epoch + 1) + "/" + str(epochs) + " loss: " + str(mean_loss), end=end_line)

In [None]:
# Serialize the whole model object (not only its state_dict) into the models
# folder configured in `const`.
# NOTE(review): no cell shown in this notebook export assigns `model`; a
# QIClassifier has to be instantiated and trained before this cell can run.
torch.save(model, const.MODELS_FOLDER + "Aletheianomous-AI_QI_classifier.pt")