# Train Queryable Input Classifier
This notebook trains an LSTM model that classifies whether the user's input can be queryable, in that the model should extract keywords to look online via the citation fetcher class.

In [1]:
!pip install torchtext==0.5

Collecting torchtext==0.5
  Downloading torchtext-0.5.0-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.17.2
    Uninstalling torchtext-0.17.2:
      Successfully uninstalled torchtext-0.17.2
Successfully installed sentencepiece-0.2.0 torchtext-0.5.0


In [3]:
from datasets import Dataset, DatasetDict
from torchtext import data, datasets

import const
import datasets
import pandas as pd
import torchtext
import torch
import torch.nn as nn
import random
import spacy
import subprocess

In [4]:
TEXT = data.Field(tokenize="spacy", tokenizer_language="en_core_web_sm")

In [5]:
LABEL = data.LabelField(dtype= torch.float)

In [6]:
TRAINING_DS_PATH = const.DATASETS_FOLDER + "QI_training.csv"
TESTING_DS_PATH = const.DATASETS_FOLDER + "QI_testing.csv"

FEATURE_COL = "question"
LABEL_COL = "is_searchable"

In [7]:
CORPUS_SIZE = 25000
LEARNING_RATE = 1e-4
BATCH_SIZE = 128
EPOCHS = 5

EMBEDDING_DIM = 256
HIDDEN_DIM = 512
OUTPUT_DIM = 1

In [8]:
training_df = pd.read_csv(TRAINING_DS_PATH)
testing_df = pd.read_csv(TESTING_DS_PATH)

training_df = training_df.sample(frac=1)

  training_df = pd.read_csv(TRAINING_DS_PATH)


In [9]:
training_ds = Dataset.from_pandas(training_df)
validation_ds = Dataset.from_pandas(testing_df)

In [10]:
torch.backends.cudnn.deterministic=True

In [8]:
tokenizer_funct = lambda input_sample, tokenizer: {'tokens': tokenizer(input_sample[FEATURE_COL] + "<eos>")}
#label_tokenizer_funct = lambda input_sample, tokenizer: {'label_tokens': tokenizer(input_sample[LABEL_COL] + "<eos>")}
training_ds = training_ds.map(tokenizer_funct, fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/204726 [00:00<?, ? examples/s]

In [16]:
training_ds[204000]['tokens']

['I',
 'know',
 ',',
 'but',
 'I',
 'always',
 'get',
 'my',
 'hopes',
 'up',
 'for',
 'no',
 'reason',
 '.',
 '<',
 'eos',
 '>']

In [19]:
positive_cl_count = len(training_df[training_df['is_searchable'] == True])
negative_cl_count = len(training_df[training_df['is_searchable'] == False])

In [20]:
print(positive_cl_count)
print(negative_cl_count)

165319
39407


In [24]:
training_len = len(training_df)
positive_weight = positive_cl_count / training_len
negative_weight = negative_cl_count / training_len
weights = [positive_weight, negative_weight]

In [25]:
print(weights)

[0.807513457010834, 0.192486542989166]


In [27]:
vocab = torchtext.vocab.build_vocab_from_iterator(training_ds['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])
print(vocab.get_itos()[:10])

['<unk>', '<eos>', '>', 'the', 'What', 'of', 'in', 'to', 'and', 'is']


In [28]:
class QIClassifier(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
              batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid(dim=1)

    def forward(self, text):
        embedded_text = self.embedding(text)
        output, hidden = self.rnn(embedded_text)
        logits = self.fc(output[:, -1, :])
        output = self.sigmoid(logits)
        return logits, output
        

In [42]:
def get_data(ds, vocab, batch_size):
    data = []
    for example in ds:
        tokens = [vocab[token] for token in example]
        data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0]
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)
    return data

In [43]:
out_data = get_data(training_ds['tokens'], vocab, 8)

RuntimeError: shape '[8, 2529594]' is invalid for input of size 2529594

In [41]:
training_data

NameError: name 'training_data' is not defined

In [None]:
def get_batch(feat_data, seq_len, batch_size, batch_id):
    feature = feat_data[:, idx:idx+seq_len]
    label = 

In [None]:
def train(model, training_ds, epochs=10, device="cpu", epoch_timestamp=1, lr=0.001):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    epoch_modulus: int
    for i in range(epochs):
        epoch_modulus = i % epoch_timestamp
        for X_batch, y_batch in enumerate(ds):
            X_tensor = torch.tensor(X_batch, dtype=torch.float64, device=device)
            Y_tensor = torch.tensor(y_batch, dtype=torch.float64, device=device)
            logits, out = model(X_tensor)
            optimizer.zero_grad()
            cost = loss(logits, Y_tensor.to(torch.long))
            cost.backward()
            optimizer.step()
            if epoch_timestamp == 1:
                print("Epoch " + str(i + 1) + "/" + str(epochs) + " loss: " + str(cost))
            elif epoch_timestamp > 1:
                if epoch_modulus == epoch_timestamp - 1:
                    end_line = "\n"
                else:
                    end_line = "                        \r"
                print("Epoch " + str(i + 1) + "/" + str(epochs) + " loss: " + str(cost), end=end_line)
            else:
                raise ValueError("Expected epoch_timestamp parameter to be a non-negative number but got " + str(epoch_timestamp))

In [None]:
torch.save(model, const.MODELS_FOLDER + "Aletheianomous-AI_QI_classifier.pt")