## Instructions for reproduction

I expect you'd want to run this on a cluster like MAMBA. You'll need two scripts to submit your job:

`do.sh` - https://paste.rs/1a4

```
#!/bin/bash
# Clone the project (with its submodules) and submit the fine-tuning job
# defined in submit.sh to the cluster queue. The numeric job id is echoed and
# also written to ./lastjob.

# Uncomment to discard a previous checkout and start from a clean clone.
#rm -rf semisupervised_nlu

# git refuses to clone into an existing non-empty directory, so only clone
# when there is no checkout yet; an existing checkout is reused as-is.
if [ ! -d semisupervised_nlu ]; then
    git clone --recurse-submodules https://github.com/DhruvDh/semisupervised_nlu.git || exit 1
fi

# All qsub options are placed before the script name. Requests 1 node with
# 16 cores and 1 GPU on the "mamba" queue, 11h30m walltime, with the current
# directory as the job's working directory.
job=$(qsub -d "$(pwd)" -q mamba -l nodes=1:ppn=16:gpus=1 -l walltime=11:30:00 submit.sh) || exit 1

# Keep everything before the first dot instead of a fixed 5 characters, so ids
# of any length survive intact.
# NOTE(review): assumes qsub prints "<number>.<server>" — confirm on the cluster.
num=${job%%.*}

echo "Job ID: $num"
echo "$num" > lastjob

`submit.sh` - https://paste.rs/xxK

```
#!/bin/bash
# Job script submitted by do.sh: prepares the Python environment, then
# fine-tunes roberta-base with the masked-LM objective (--mlm) on
# ./data/roberta/train.txt and evaluates on ./data/roberta/test.txt.

# Make user-level pip installs (~/.local/bin) visible on PATH.
export PATH=~/.local/bin:$PATH

# Alternative module set (disabled).
#module load cuda/8.0 cudnn/6.0-cuda8 anaconda3/5.0.1-cuda8

module load pytorch/1.2.0-anaconda3-cuda10.0
# NOTE(review): transformers is installed unpinned while the training script
# is taken from the repo's ./transformers directory (presumably a submodule) —
# confirm the two versions are compatible.
python3 -m pip install --user transformers tensorboardx

source ~/.bashrc
conda init
source ~/.bashrc

# Abort if the checkout is missing; otherwise the relative paths below would
# be resolved against the wrong directory.
cd semisupervised_nlu || exit 1

# Checkpoints are written to ./output every 659 steps, keeping only the two
# most recent (--save_total_limit=2).
python3 ./transformers/examples/run_lm_finetuning.py \
    --output_dir=output \
    --model_type=roberta \
    --model_name_or_path=roberta-base \
    --do_train \
    --train_data_file="./data/roberta/train.txt" \
    --do_eval \
    --eval_data_file="./data/roberta/test.txt" \
    --num_train_epochs=15 \
    --save_steps=659 \
    --save_total_limit=2 \
    --mlm
```

Then run `do.sh` to submit the training job. This will produce a model in `semisupervised_nlu/output`, which is where this notebook will later try to load the model from (it loads `./output/`, so run the notebook from inside `semisupervised_nlu`).

In [1]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
import torch
from tqdm import tqdm

## Baseline performance

In [2]:
# Baseline: the stock pretrained checkpoint, with no task-specific fine-tuning.
BASELINE_CHECKPOINT = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(BASELINE_CHECKPOINT)
tokenizer = RobertaTokenizer.from_pretrained(BASELINE_CHECKPOINT)

In [3]:
import os
import pandas as pd
from random import choice

import nltk

In [4]:
# Every entry under data/raw is treated as one intent.
path_to_intents = os.path.join('data', 'raw')

# os.listdir returns entries in arbitrary order. `intents` is later paired
# positionally (via zip) with `questions` and `entities`, so the order is fixed
# by sorting to keep that pairing identical on every machine.
# NOTE(review): assumes `questions` / `entities` are listed in sorted-intent
# order — confirm in helpers/analyze.
intents = sorted(os.listdir(path_to_intents))


def get_path(x):
    """Return the CSV path for intent `x`: data/raw/<x>/<x>.csv."""
    return os.path.join(path_to_intents, x, x + ".csv")

In [5]:
# `analyze` is imported, and get_data() is run, with helpers/ as the working
# directory.
# NOTE(review): presumably analyze resolves its files relative to the cwd — confirm.
_notebook_dir = os.getcwd()
os.chdir('helpers')
try:
    from analyze import questions, entities, get_data
    data = get_data()
finally:
    # Always restore the working directory, even when the import or get_data()
    # raises; otherwise every later relative path (and a re-run of this cell)
    # would be resolved from inside helpers/.
    os.chdir(_notebook_dir)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from collections import defaultdict
import random

# Seed the module-level RNG used by `choice`, so the question picked for each
# intent — and with it every prompt and score below — is the same on each run.
random.seed(0)

# All containers are keyed by intent name.
text = defaultdict(list)          # prompt strings fed to the model
encoded_text = defaultdict(list)  # tokenized prompts (filled in a later cell)

ans = defaultdict(list)           # reference answers, whitespace-split
response = defaultdict(list)      # model predictions, whitespace-split

scores = defaultdict(list)        # one BLEU score per prompt

# questions / intents / entities are paired by position.
for (_questions, intent, entity) in zip(questions, intents, entities):
    # One question template is used for every example of this intent.
    question = choice(_questions).strip()
    frame = data[intent]['df']

    for (r, e) in zip(frame['text'], frame[entity]):
        answer_tokens = str(e).split()
        # One <mask> per whitespace-separated word of the expected answer.
        masks = " ".join(["<mask>"] * len(answer_tokens))
        # NOTE(review): there is no space between the question and the first
        # <mask>; kept as-is so the prompts are unchanged — confirm intended.
        text[intent].append("<s> " + r.strip() + '. ' + question + masks + " </s>")
        ans[intent].append(answer_tokens)

In [7]:
# Tokenize every prompt into a tensor holding a single sequence (batch of 1).
# The list for each intent is rebuilt from scratch rather than appended to, so
# re-running this cell does not duplicate the encoded prompts.
for intent in intents:
    encoded_text[intent] = [
        # The prompts already contain literal <s> ... </s>, so the tokenizer
        # must not add its own special tokens.
        torch.tensor([tokenizer.encode(row, add_special_tokens=False)])
        for row in text[intent]
    ]

In [None]:
# Score the currently loaded `model` on every prompt with sentence-level BLEU.
smoothing = nltk.translate.bleu_score.SmoothingFunction().method4

with torch.no_grad():
    for intent in intents:
        # Start from empty lists so that re-running this cell never appends to,
        # or indexes into, results left over from a previous evaluation pass.
        response[intent] = []
        scores[intent] = []

        # tqdm wraps the list itself (not the enumerate object) so the
        # progress bar knows the total.
        for i, row in enumerate(tqdm(encoded_text[intent], desc=intent)):
            out = model(row)
            # Most likely token at every position of the single sequence in the
            # batch, decoded back to text and split on whitespace.
            # NOTE(review): this decodes all positions, not only the <mask>
            # slots, so the hypothesis is the whole predicted sequence while
            # the reference is just the answer — confirm this is intended.
            predicted = tokenizer.decode(torch.argmax(out[0][0], dim=1).tolist()).split()
            response[intent].append(predicted)

            score = nltk.translate.bleu_score.sentence_bleu(
                [ans[intent][i]],
                predicted,
                smoothing_function=smoothing,
                auto_reweigh=True,
            )
            scores[intent].append(score)
            

In [None]:
# Report the mean per-prompt BLEU score for each intent.
for intent in intents:
    intent_scores = scores[intent]
    print(f"For {intent}, avg. BLEU score is {sum(intent_scores) / len(intent_scores)}")

## Finetuned performance (only 15 epochs)

In [8]:
# Load the fine-tuned checkpoint from ./output/. This rebinds `model` and
# `tokenizer`, so the cells below evaluate the fine-tuned weights.
FINETUNED_CHECKPOINT = './output/'
model = RobertaForMaskedLM.from_pretrained(FINETUNED_CHECKPOINT)
tokenizer = RobertaTokenizer.from_pretrained(FINETUNED_CHECKPOINT)

In [None]:
# Score the currently loaded (fine-tuned) `model` on every prompt with
# sentence-level BLEU.
smoothing = nltk.translate.bleu_score.SmoothingFunction().method4

with torch.no_grad():
    for intent in intents:
        # Reset the per-intent results first. Without this, an earlier
        # evaluation pass in the same kernel would leave its entries in place:
        # response[intent][i] would then return that older prediction, and the
        # averages would mix scores from both passes.
        response[intent] = []
        scores[intent] = []

        # tqdm wraps the list itself (not the enumerate object) so the
        # progress bar knows the total.
        for i, row in enumerate(tqdm(encoded_text[intent], desc=intent)):
            out = model(row)
            # Most likely token at every position of the single sequence in the
            # batch, decoded back to text and split on whitespace.
            # NOTE(review): this decodes all positions, not only the <mask>
            # slots, so the hypothesis is the whole predicted sequence while
            # the reference is just the answer — confirm this is intended.
            predicted = tokenizer.decode(torch.argmax(out[0][0], dim=1).tolist()).split()
            response[intent].append(predicted)

            score = nltk.translate.bleu_score.sentence_bleu(
                [ans[intent][i]],
                predicted,
                smoothing_function=smoothing,
                auto_reweigh=True,
            )
            scores[intent].append(score)
            

2042it [03:15, 10.45it/s]
2073it [03:24, 10.16it/s]
2100it [04:10,  8.37it/s]
2100it [03:47,  9.23it/s]
2056it [02:42, 12.65it/s]
2054it [02:39, 12.89it/s]
1192it [01:45, 12.91it/s]

In [None]:
# Report the mean per-prompt BLEU score for each intent.
for intent in intents:
    per_prompt = scores[intent]
    print(f"For {intent}, avg. BLEU score is {sum(per_prompt) / len(per_prompt)}")