In [1]:
%%shell
# Download the CANARD release, install spaCy with its English model, and create
# the output directory used by the preprocessing cell.
# Every step is written so the cell can be re-run on a warm runtime:
#   unzip -o   overwrite without the interactive prompt (which would hang the cell)
#   rm -f/-rf  do not fail if the path is already gone
#   mkdir -p   do not fail if the directory already exists
wget https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip
unzip -o CANARD_Release.zip
rm -f CANARD_Release.zip
rm -rf __MACOSX

# NOTE(review): the `en` shortcut (and `spacy.load('en')` further down) is a
# spaCy 2.x feature; consider pinning the spaCy version used for this notebook.
pip install spacy
python -m spacy download en

mkdir -p data/seq2seq

--2020-12-08 16:01:01--  https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip
Resolving obj.umiacs.umd.edu (obj.umiacs.umd.edu)... 128.8.122.11
Connecting to obj.umiacs.umd.edu (obj.umiacs.umd.edu)|128.8.122.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3258983 (3.1M) [application/zip]
Saving to: ‘CANARD_Release.zip’


2020-12-08 16:01:02 (4.97 MB/s) - ‘CANARD_Release.zip’ saved [3258983/3258983]

Archive:  CANARD_Release.zip
   creating: CANARD_Release/
  inflating: __MACOSX/._CANARD_Release  
  inflating: CANARD_Release/multiple_refs.json  
  inflating: __MACOSX/CANARD_Release/._multiple_refs.json  
  inflating: CANARD_Release/test.json  
  inflating: __MACOSX/CANARD_Release/._test.json  
  inflating: CANARD_Release/dev.json  
  inflating: __MACOSX/CANARD_Release/._dev.json  
  inflating: CANARD_Release/train.json  
  inflating: __MACOSX/CANARD_Release/._train.json  
  inflating: CANARD_Release/readme.txt  
  inflating: __MACOSX/CANARD_Release/._readm



In [2]:
import json
import argparse
from os.path import join
from spacy.lang.en import English
import csv

def preprocess(dataset_files, output_dir, split):
  """Flatten CANARD JSON files into one tokenised TSV of (source, target) rows.

  For every sample the source is the dialogue history followed by the current
  question, joined with ' ||| '; the target is the sample's rewrite. Both are
  tokenised with spaCy's blank English pipeline and re-joined with single
  spaces before being written.

  Args:
    dataset_files: iterable of paths to JSON files. Each file must contain a
      list of dicts with the keys 'History' (list of str), 'Question' and
      'Rewrite'.
    output_dir: directory to write into; it must already exist.
    split: stem of the output file, i.e. rows go to '<output_dir>/<split>.tsv'.
      Samples from all `dataset_files` are concatenated in the given order.
  """
  nlp = English()

  def tokenize(text):
    # Space-separated spaCy tokens.
    return ' '.join(tok.text for tok in nlp(text))

  # Format only the file name: formatting the joined path would misinterpret
  # any '{' or '}' that happens to be part of output_dir.
  out_path = join(output_dir, '{}.tsv'.format(split))

  # newline='' is what the csv module asks for (avoids doubled line endings on
  # some platforms); the encoding is explicit so non-ASCII text round-trips
  # regardless of the runtime's locale.
  with open(out_path, 'w', newline='', encoding='utf-8') as outfile:
    tsv_writer = csv.writer(outfile, delimiter='\t')

    for file in dataset_files:
      with open(file, encoding='utf-8') as inh:
        samples = json.load(inh)

      for sample in samples:
        src = ' ||| '.join(sample['History'] + [sample['Question']])
        tgt = sample['Rewrite']
        tsv_writer.writerow([tokenize(src), tokenize(tgt)])

In [3]:
preprocess(['CANARD_Release/train.json', 'CANARD_Release/dev.json'], 'data/seq2seq', 'train-dev')
preprocess(['CANARD_Release/train.json'], 'data/seq2seq', 'train')
preprocess(['CANARD_Release/dev.json'], 'data/seq2seq', 'dev')
preprocess(['CANARD_Release/test.json'], 'data/seq2seq', 'test')

!wc -l /content/data/seq2seq/train-dev.tsv
!wc -l /content/data/seq2seq/train.tsv
!wc -l /content/data/seq2seq/dev.tsv
!wc -l /content/data/seq2seq/test.tsv

34956 /content/data/seq2seq/train-dev.tsv
31526 /content/data/seq2seq/train.tsv
3430 /content/data/seq2seq/dev.tsv
5571 /content/data/seq2seq/test.tsv


In [4]:
import torch
import torchtext
from torchtext import data
import spacy

import random

SEED = 1234
# Seed every RNG the notebook draws from. Python's `random` was previously left
# unseeded even though Seq2Seq.forward uses random.random() for the
# teacher-forcing decision, so runs were not repeatable.
# NOTE(review): torchtext's iterators presumably take their shuffle order from
# `random` as well — confirm against the installed torchtext version.
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
init_token = '<sos>'
eos_token = '<eos>'

# A single Field is shared by source and target, so both sides use the same
# vocabulary. include_lengths=True makes each batch attribute a
# (token ids, lengths) pair.
TEXT = data.Field(sequential=True, tokenize="spacy", init_token=init_token, eos_token=eos_token, lower=True, batch_first=True, include_lengths=True)

# Alternative three-way split, kept for reference:
# train, val, test = data.TabularDataset.splits(
#         path='data/seq2seq', train='train.tsv',
#         validation='dev.tsv', test='test.tsv', format='tsv',
#         fields=[('Source', TEXT), ('Target', TEXT)])

# Active configuration: train on train+dev and use the test file as the
# validation split.
train, val = data.TabularDataset.splits(
        path='data/seq2seq', train='train-dev.tsv',
        validation='test.tsv', format='tsv',
        fields=[('Source', TEXT), ('Target', TEXT)])


print(vars(train[0]))

{'Source': ['johnny', 'unitas', '|||', '1964', 'mvp', 'season', '|||', 'what', 'team', 'did', 'unitas', 'play', 'for'], 'Target': ['what', 'team', 'did', 'johnny', 'unitas', 'play', 'for', '?']}


In [5]:
# Build the shared vocabulary (tokens seen at least twice) and attach GloVe
# vectors to it.
# NOTE(review): `val` is loaded from test.tsv, so the vocabulary is built with
# the held-out data included.
TEXT.build_vocab(train, val, vectors="glove.840B.300d", min_freq=2)

first_example = train[0]

print()
print(len(TEXT.vocab.stoi))

print(vars(first_example).keys())
print(first_example.Source)

.vector_cache/glove.840B.300d.zip: 2.18GB [16:58, 2.14MB/s]                            
100%|█████████▉| 2195023/2196017 [03:50<00:00, 9887.82it/s]


33069
dict_keys(['Source', 'Target'])
['johnny', 'unitas', '|||', '1964', 'mvp', 'season', '|||', 'what', 'team', 'did', 'unitas', 'play', 'for']


In [8]:
BATCH_SIZE = 16


def _source_length(example):
  # Bucketing / sorting key: number of tokens on the source side.
  return len(example.Source)


# sort_within_batch orders each batch by the sort key; the encoder packs its
# input with pack_padded_sequence, which expects that ordering.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True,
    sort_key=_source_length,
    sort_within_batch=True)

In [9]:
# Pull one batch to confirm the tensor layout; batch.Target is a
# (token ids, lengths) pair, so index 0 is the id tensor.
batch = next(iter(train_iterator))
target_ids = batch.Target[0]
print(target_ids.shape)

n_train_batches = len(train_iterator)
n_valid_batches = len(valid_iterator)
print('Training data has {} batches'.format(n_train_batches))
print('Validation data has {} batches'.format(n_valid_batches))

torch.Size([16, 21])
Training data has 2185 batches
Validation data has 349 batches


In [10]:
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
import random
import time

from google.colab import drive
# Mount Google Drive at /content/drive; the training and checkpoint-loading
# cells below save to and read from /content/drive/MyDrive/gru.
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
class EncoderBiRNN(nn.Module):
  """Bidirectional GRU encoder on top of pretrained word embeddings.

  Args:
    pretrained_embed: float tensor [vocab size, emb dim] of embedding weights.
    padding_idx: vocabulary index of the padding token.
    enc_hid_dim: hidden size of each GRU direction.
    dec_hid_dim: size of the initial decoder state produced by `forward`.
    fix: if True (default) the embedding weights are frozen; if False they are
      trained with the rest of the model.
    dropout: dropout probability applied to the embedded input.
  """
  def __init__(self, pretrained_embed, padding_idx, enc_hid_dim, dec_hid_dim, fix = True, dropout=0.0):
    super(EncoderBiRNN, self).__init__()
    self.vocab_size, self.embedding_dim = pretrained_embed.size()
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim

    # Embedding.from_pretrained freezes the weights unless told otherwise, so
    # `fix` has to be forwarded as `freeze`; otherwise fix=False is silently
    # ignored. A trainable embedding gets its own copy of the weights so that
    # two modules built from the same tensor do not update shared storage.
    weights = pretrained_embed if fix else pretrained_embed.clone()
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=fix, padding_idx=padding_idx)

    self.gru = nn.GRU(self.embedding_dim, enc_hid_dim, batch_first=True, bidirectional=True)
    self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
    self.dropout = nn.Dropout(dropout)


  def forward(self, encoder_input, src_len, hidden):
    """Encode a padded batch of token ids.

    Args:
      encoder_input: [batch size, src sent len] token ids.
      src_len: [batch size] true (unpadded) lengths.
      hidden: [2, batch size, enc hid dim] initial GRU state.

    Returns:
      outputs: [batch size, src sent len, enc hid dim * 2]; padded positions
        are zero.
      hidden: [batch size, dec hid dim] initial decoder state, computed from
        the final forward and backward GRU states.
    """
    # The dropout argument used to be accepted but never applied; with the
    # default p=0.0 this is the identity.
    embedded = self.dropout(self.embedding(encoder_input))

    # enforce_sorted=False lets callers pass batches in any length order;
    # pre-sorted batches behave as before. Lengths must live on the CPU.
    packed_embedded = nn.utils.rnn.pack_padded_sequence(
        embedded, src_len.cpu(), batch_first=True, enforce_sorted=False)

    packed_outputs, hidden = self.gru(packed_embedded, hidden)

    outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)

    # hidden[-2] / hidden[-1] are the last layer's forward / backward states.
    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

    return outputs, hidden


In [12]:
class Attention(nn.Module):
  """Additive attention over encoder outputs, conditioned on the decoder state.

  `attn_dim` is accepted for interface compatibility but is not used: the
  scoring layer projects to `dec_hid_dim`.
  """
  def __init__(self, enc_hid_dim, dec_hid_dim, attn_dim):
    super(Attention, self).__init__()

    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    # Width of [decoder state ; encoder output]; the decoder reads this too.
    self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

    self.attn = nn.Linear(self.attn_in, dec_hid_dim)
    self.v = nn.Linear(dec_hid_dim, 1, bias = False)

  def forward(self, decoder_hidden, encoder_outputs, mask):
    """Return attention weights of shape [batch size, src len].

    Args:
      decoder_hidden: [batch size, dec hid dim] current decoder state.
      encoder_outputs: [batch size, src len, enc hid dim * 2].
      mask: [batch size, src len]; positions equal to 0 get (numerically)
        zero weight.
    """
    src_len = encoder_outputs.shape[1]

    # Pair the decoder state with every source position.
    tiled_hidden = decoder_hidden.unsqueeze(1).expand(-1, src_len, -1)
    pairs = torch.cat((tiled_hidden, encoder_outputs), dim = 2)
    #pairs = [batch size, src len, enc hid dim * 2 + dec hid dim]

    energy = torch.tanh(self.attn(pairs))
    #energy = [batch size, src len, dec hid dim]

    scores = self.v(energy).squeeze(2)
    #scores = [batch size, src len]

    # A very negative score drives masked positions to ~0 after the softmax.
    scores = scores.masked_fill(mask == 0, -1e10)

    return F.softmax(scores, dim=1)
        


In [13]:
class Decoder(nn.Module):
  """Single-step GRU decoder with attention over the encoder outputs.

  Args:
    pretrained_embed: float tensor [vocab size, emb dim] of embedding weights.
    padding_idx: vocabulary index of the padding token.
    enc_hid_dim: hidden size of each encoder direction.
    dec_hid_dim: hidden size of the decoder GRU.
    attention: module called as attention(decoder_hidden, encoder_outputs, mask)
      returning [batch size, src len] weights; must expose `attn_in`.
    fix: if True (default) the embedding weights are frozen; if False they are
      trained with the rest of the model.
    dropout: dropout probability applied to the embedded input token.
  """
  def __init__(self, pretrained_embed, padding_idx, enc_hid_dim, dec_hid_dim, attention, fix=True, dropout=0.0):
    super(Decoder, self).__init__()
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.attention = attention

    self.vocab_size, self.embedding_dim = pretrained_embed.size()

    # Embedding.from_pretrained freezes the weights unless told otherwise, so
    # `fix` has to be forwarded as `freeze`; otherwise fix=False is silently
    # ignored. A trainable embedding gets its own copy of the weights so that
    # two modules built from the same tensor do not update shared storage.
    weights = pretrained_embed if fix else pretrained_embed.clone()
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=fix, padding_idx=padding_idx)

    self.gru = nn.GRU((enc_hid_dim * 2) + self.embedding_dim, dec_hid_dim, batch_first=True)

    # Input of the output layer: [GRU output ; attention context ; embedding].
    self.out = nn.Linear(self.attention.attn_in + self.embedding_dim, self.vocab_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, decoder_hidden, encoder_outputs, mask):
    """Decode one time step.

    Args:
      x: [batch size] ids of the previous target token.
      decoder_hidden: [batch size, dec hid dim] previous decoder state.
      encoder_outputs: [batch size, src len, enc hid dim * 2].
      mask: [batch size, src len] source mask handed to the attention module.

    Returns:
      output: [batch size, vocab size] unnormalised scores for the next token.
      decoder_hidden: [batch size, dec hid dim] updated decoder state.
      a: [batch size, src len] attention weights used at this step.
    """
    # The dropout argument used to be accepted but never applied; with the
    # default p=0.0 this is the identity.
    embedded = self.dropout(self.embedding(x.unsqueeze(1)))
    #embedded = [batch size, 1, emb dim]

    a = self.attention(decoder_hidden, encoder_outputs, mask).unsqueeze(1)
    #a = [batch size, 1, src len]

    # Attention-weighted sum of the encoder outputs.
    weighted_encoder_rep = torch.bmm(a, encoder_outputs)
    #weighted_encoder_rep = [batch size, 1, enc hid dim * 2]

    rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)
    #rnn_input = [batch size, 1, (enc hid dim * 2) + emb dim]

    output, decoder_hidden = self.gru(rnn_input, decoder_hidden.unsqueeze(0))
    #output = [batch size, 1, dec hid dim]
    #decoder_hidden = [1, batch size, dec hid dim]

    output = self.out(torch.cat((
        output.squeeze(1),
        weighted_encoder_rep.squeeze(1),
        embedded.squeeze(1)),
        dim = 1))
    #output = [batch size, vocab size]
    return output, decoder_hidden.squeeze(0), a.squeeze(1)


In [14]:
class Seq2Seq(nn.Module):
    """Wires an encoder and an attentional decoder into one trainable model."""

    def __init__(self, encoder, decoder, src_pad_idx, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.enc_hid_dim = encoder.enc_hid_dim
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        """Boolean mask that is True wherever `src` is not the padding index."""
        return src != self.src_pad_idx

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        """Decode `trg.shape[1] - 1` steps and return the per-step scores.

        Args:
            src: [batch size, src len] source token ids.
            src_len: [batch size] source lengths, forwarded to the encoder.
            trg: [batch size, trg len] target token ids; column 0 is the
                first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own prediction.

        Returns:
            Tensor [trg len, batch size, vocab size]; slot 0 is left at zero.
        """
        n_batch = src.shape[0]
        n_steps = trg.shape[1]
        vocab_size = self.decoder.vocab_size

        scores = torch.zeros(n_steps, n_batch, vocab_size).to(self.device)

        # Two rows: one per direction of the encoder.
        enc_init = torch.zeros((2, n_batch, self.enc_hid_dim)).to(self.device)
        enc_output, state = self.encoder(src, src_len, enc_init)

        pad_mask = self.create_mask(src)
        next_input = trg[:, 0]

        for step in range(1, n_steps):
            step_scores, state, _ = self.decoder(next_input, state, enc_output, pad_mask)
            scores[step] = step_scores

            use_gold = random.random() < teacher_forcing_ratio
            greedy = step_scores.max(dim=1).indices
            next_input = trg[:, step] if use_gold else greedy

        return scores


In [15]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [16]:
def train_epoch(epoch, model, data_iterator, optimizer, criterion):
  """Run one optimisation pass over `data_iterator` and return the mean batch loss.

  Each batch must expose `Source` and `Target` as (token ids, lengths) pairs.
  Gradients are clipped to a total norm of 1 before every optimizer step, and
  the batch loss is printed every 100 batches.
  """
  model.train()
  loss_total = 0.0

  for step, batch in enumerate(data_iterator):
    source, source_lengths = batch.Source
    target, _target_lengths = batch.Target

    optimizer.zero_grad()

    scores = model(source, source_lengths, target)

    # The model returns [trg len, batch, vocab] and never fills slot 0, so the
    # first step is dropped on both sides before flattening for the loss.
    flat_scores = scores[1:].view(-1, scores.shape[-1])
    flat_target = target.permute(1,0)[1:].reshape(-1)

    loss = criterion(flat_scores, flat_target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_total += loss.item()

    if step % 100 == 0:
      print('Batch ', step)
      print('train_loss: {:.8f}'.format(loss.item()))

  epoch_loss = loss_total / len(data_iterator)
  print("Epoch: {} | train_loss:{:.8f}".format(epoch, epoch_loss))

  return epoch_loss


def eval_epoch(epoch, model, data_iterator, scheduler, criterion):
  """Evaluate `model` on `data_iterator` and return the mean batch loss.

  Decoding runs without teacher forcing and without gradient tracking; the
  batch loss is printed every 100 batches.

  Args:
    epoch: epoch number, used only in the printed summary.
    model: called as model(src, src_len, trg, 0); must return
      [trg len, batch size, vocab size] scores.
    data_iterator: sized iterable of batches exposing `Source` and `Target` as
      (token ids, lengths) pairs.
    scheduler: a scheduler stepped with the epoch loss (e.g.
      ReduceLROnPlateau), or None to evaluate without touching any learning
      rate.
    criterion: loss taking (flattened scores, flattened targets).

  Returns:
    The epoch loss as a float.
  """
  model.eval()
  running_loss = 0.0

  with torch.no_grad():
    for i, batch in enumerate(data_iterator):
      src, src_len = batch.Source
      trg, _ = batch.Target

      output = model(src, src_len, trg, 0) #turn off teacher forcing
      # Slot 0 of the output is never filled; drop it on both sides.
      output = output[1:].reshape(-1, output.shape[-1])

      trg = trg.permute(1,0)
      trg = trg[1:].reshape(-1)

      loss = criterion(output, trg)
      running_loss += loss.item()

      if i % 100 == 0:
        print('Batch ', i)
        print('val_loss: {:.8f}'.format(loss.item()))

  epoch_loss = running_loss / len(data_iterator)
  print("Epoch: {} | val_loss:{:.8f}".format(epoch, epoch_loss))

  if scheduler is not None:
    scheduler.step(epoch_loss)
    # Read the rate from the scheduler's own optimizer rather than from a
    # module-level `optimizer`, which this function was never handed.
    print('scheduler adjust learning rate to ', scheduler.optimizer.param_groups[0]['lr'])

  return epoch_loss


In [17]:
import os

# Model sizes. attn_dim is handed to Attention, which does not use it.
enc_hid_dim = 512
dec_hid_dim = 512
attn_dim = 64
PAD_idx = TEXT.vocab.stoi['<pad>']
attn = Attention(enc_hid_dim, dec_hid_dim, attn_dim)
enc = EncoderBiRNN(pretrained_embed=TEXT.vocab.vectors, padding_idx=PAD_idx, enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim)
dec = Decoder(pretrained_embed=TEXT.vocab.vectors, padding_idx=PAD_idx, enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim, attention=attn)
model = Seq2Seq(enc, dec, PAD_idx, device).to(device)


learning_rate = 1e-3
#optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience = 50, min_lr=1e-9)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_idx)

epoches = 2000
# Start from +inf so the first epoch always becomes the best-so-far; a fixed
# starting threshold would skip "best" checkpoints whenever the validation
# loss never drops below it.
best_loss = float('inf')
plot_losses = []

# Create the checkpoint directory up front so the first torch.save cannot fail
# after a full epoch of training.
checkpoint_dir = '/content/drive/MyDrive/gru'
os.makedirs(checkpoint_dir, exist_ok=True)

print('total parameters.',  sum(p.numel() for p in model.parameters() if p.requires_grad))
print("training batches: ", len(train_iterator))
print("val batches: ", len(valid_iterator))


for epoch in range(epoches):
  start_time = time.time()
  train_epoch_loss = train_epoch(epoch,model, train_iterator, optimizer, criterion)
  eval_epoch_loss = eval_epoch(epoch, model, valid_iterator, scheduler, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  plot_losses.append(eval_epoch_loss)

  # Checkpoint on every improvement, and unconditionally every 10th epoch.
  if (eval_epoch_loss < best_loss) or epoch % 10 == 0:
    if eval_epoch_loss < best_loss:
      best_loss = eval_epoch_loss
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model_{:.8f}_{}.pt'.format(eval_epoch_loss, epoch)))

  print(f'*************Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s ***********')



total parameters. 67383785
training batches:  2185
val batches:  349
Batch  0
train_loss: 10.40270042
Batch  100
train_loss: 5.31844807
Batch  200
train_loss: 5.75977659
Batch  300
train_loss: 5.20387268
Batch  400
train_loss: 5.25387621
Batch  500
train_loss: 4.57339430
Batch  600
train_loss: 4.37727928
Batch  700
train_loss: 4.60602236
Batch  800
train_loss: 4.56505442
Batch  900
train_loss: 3.21207333
Batch  1000
train_loss: 3.07398844
Batch  1100
train_loss: 4.25958443
Batch  1200
train_loss: 4.19718838
Batch  1300
train_loss: 4.22239733
Batch  1400
train_loss: 2.62138152
Batch  1500
train_loss: 2.99777508
Batch  1600
train_loss: 3.41293740
Batch  1700
train_loss: 3.65928960
Batch  1800
train_loss: 3.04116321
Batch  1900
train_loss: 3.44119787
Batch  2000
train_loss: 2.27388930
Batch  2100
train_loss: 2.86063862
Epoch: 0 | train_loss:3.81118659
Batch  0
val_loss: 3.51596832
Batch  100
val_loss: 4.15853119
Batch  200
val_loss: 4.43392754
Batch  300
val_loss: 6.12960243
Epoch: 0 | va

KeyboardInterrupt: ignored

In [18]:
# Rebuild the architecture exactly as it was trained, then load saved weights.
enc_hid_dim = 512
dec_hid_dim = 512
attn_dim = 64
PAD_idx = TEXT.vocab.stoi['<pad>']
attn = Attention(enc_hid_dim, dec_hid_dim, attn_dim)
enc = EncoderBiRNN(pretrained_embed=TEXT.vocab.vectors, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim)
dec = Decoder(pretrained_embed=TEXT.vocab.vectors, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim, attention=attn)
model = Seq2Seq(enc, dec, PAD_idx, device).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only runtime.
state_dict = torch.load('/content/drive/MyDrive/gru/model_9.13775595_80.pt', map_location=device)
model.load_state_dict(state_dict)
model.eval()

Seq2Seq(
  (encoder): EncoderBiRNN(
    (embedding): Embedding(33069, 300, padding_idx=1)
    (gru): GRU(300, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(33069, 300, padding_idx=1)
    (gru): GRU(1324, 512, batch_first=True)
    (out): Linear(in_features=1836, out_features=33069, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
)

In [54]:
# eval_epoch requires a scheduler argument, so a throwaway optimizer/scheduler
# pair is built here; no training happens in this cell.
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate,)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience = 5, min_lr=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index = TEXT.vocab.stoi['<pad>'])

# No cell in this notebook creates `test_iterator`. The `val` dataset was
# loaded from test.tsv, so `valid_iterator` already iterates over the test set.
# NOTE(review): confirm this is the iterator the recorded numbers came from.
test_iterator = valid_iterator

eval_epoch(0, model, test_iterator, scheduler, criterion)

Batch  0
val_loss: 2.54023647
Batch  100
val_loss: 3.70880771
Batch  200
val_loss: 3.79370332
Batch  300
val_loss: 5.61560869
Epoch: 0 | val_loss:4.34996969
scheduler adjust learning rate to  0.001


4.349969694129375

In [19]:
def translate_sentence(sentence, src_field, model, device, max_len = 100):
  """Greedily decode a rewrite for one raw input string.

  Args:
    sentence: raw text; tokenised with spaCy and lower-cased here.
    src_field: field providing `init_token`, `eos_token` and `vocab`
      (`stoi` / `itos`); used for both the input and the output side.
    model: Seq2Seq-style model exposing `encoder`, `decoder`, `create_mask`
      and `enc_hid_dim`.
    device: device on which the input tensors are created.
    max_len: maximum number of decoding steps.

  Returns:
    (tokens, attentions): the decoded tokens without the start token (the
    end token is included when it was produced), and a tensor
    [len(tokens), 1, source length] with the attention weights of each step.
  """
  model.eval()

  # Load the spaCy model once and keep it on the function object: loading it
  # on every call dominates the runtime when this is called in a loop.
  nlp = getattr(translate_sentence, '_nlp', None)
  if nlp is None:
    nlp = translate_sentence._nlp = spacy.load('en')

  # Only the tokenizer is needed; skipping the rest of the pipeline avoids
  # tagging/parsing every query.
  words = [token.text.lower() for token in nlp.tokenizer(sentence)]
  tokens = [src_field.init_token] + words + [src_field.eos_token]

  src_indexes = [src_field.vocab.stoi[token] for token in tokens]

  src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
  src_len = torch.LongTensor([len(src_indexes)]).to(device)

  with torch.no_grad():
    # Use the model's own encoder size rather than a notebook-level global.
    initHidden = torch.zeros((2, 1, model.enc_hid_dim)).to(device)
    encoder_outputs, hidden = model.encoder(src_tensor, src_len, initHidden)

  mask = model.create_mask(src_tensor)

  eos_index = src_field.vocab.stoi[src_field.eos_token]
  trg_indexes = [src_field.vocab.stoi[src_field.init_token]]
  attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)

  for i in range(max_len):
    # Feed back the most recent prediction (the start token on step 0).
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
    with torch.no_grad():
      output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)

    attentions[i] = attention
    pred_token = output.argmax(1).item()
    trg_indexes.append(pred_token)
    if pred_token == eos_index:
      break

  trg_tokens = [src_field.vocab.itos[i] for i in trg_indexes]

  return trg_tokens[1:], attentions[:len(trg_tokens)-1]


# Smoke test on a single sentence; displays the decoded tokens together with
# the per-step attention weights returned by translate_sentence.
translate_sentence("how are you Nicole", TEXT, model,device)

(['how', 'are', 'you', 'nicole', '<eos>'],
 tensor([[[8.7455e-02, 6.7911e-01, 8.9767e-02, 3.5069e-02, 3.5552e-02,
           7.3048e-02]],
 
         [[2.3584e-02, 1.0861e-01, 7.9780e-01, 6.0897e-02, 8.3767e-03,
           7.3422e-04]],
 
         [[5.2186e-02, 1.8997e-02, 5.2403e-02, 6.4394e-01, 2.2600e-01,
           6.4728e-03]],
 
         [[7.7604e-03, 3.0207e-03, 2.0119e-03, 3.6106e-02, 8.7945e-01,
           7.1655e-02]],
 
         [[1.2719e-02, 7.2070e-03, 1.3895e-03, 1.2865e-02, 2.4963e-01,
           7.1619e-01]]], device='cuda:0'))

In [22]:
import json
import csv

# Flatten the TREC CAsT 2019 evaluation topics into eval_query.tsv: one row per
# turn, holding '<topic number>_<turn number>' and the conversation so far
# (all earlier utterances plus the current one) joined with ' ||| '.
#
# The parsed JSON is bound to `topics` rather than `data`, because `data` is
# the torchtext module imported earlier and rebinding it breaks re-running the
# dataset and iterator cells. `turn_id` likewise avoids shadowing the builtin
# `id`.
with open('/content/treccastweb/2019/data/evaluation/evaluation_topics_v1.0.json', 'r') as f:
  topics = json.load(f)

# newline='' is what the csv module asks for when writing.
with open('eval_query.tsv', 'w', newline='') as f:
  tsv_writer = csv.writer(f, delimiter='\t')

  for topic in topics:
    history = []
    for turn in topic['turn']:
      utterance = turn['raw_utterance']
      turn_id = '{}_{}'.format(topic['number'], turn['number'])
      tsv_writer.writerow([turn_id, ' ||| '.join(history + [utterance])])
      history.append(utterance)
   

In [23]:
import csv

# Rewrite every evaluation query with the trained model, echo each rewrite and
# store it in query_rewritten_gru.tsv as (query id, rewritten query).
# The input file is opened first so that a missing eval_query.tsv does not
# leave a freshly truncated, empty output file behind.
with open('eval_query.tsv', 'r', newline='') as f, \
     open('query_rewritten_gru.tsv', 'w', newline='') as outfile:
  tsv_writer = csv.writer(outfile, delimiter='\t')

  for qid, query in csv.reader(f, delimiter="\t"):
    rewritten, _ = translate_sentence(query, TEXT, model, device)
    # Strip the end marker only when it is actually there: decoding that stops
    # at max_len has no end marker, and a blind [:-1] would drop a real token.
    if rewritten and rewritten[-1] == TEXT.eos_token:
      rewritten = rewritten[:-1]
    rewritten = ' '.join(rewritten)
    print(rewritten)
    # Previously commented out, which left the output file empty.
    tsv_writer.writerow([qid, rewritten])

what is amitabh cancer ?
is is cancer cancer in the 1976 ?
tell me cancer cancer cancer .
what are the lung cancer ?
can the <unk> can can the the hattie mcdaniel ?
what causes lung cancer in relation to cancer ?
what is the first sign of the cancer cancer ?
is the first of the cancer as same cancer , cancer ?
what 's the in in the cancer bats and cancer cancer ?
what are the different types of of of the fantastic of of the ?
if the song , is the eagles " is ?
tell me more about the death of the article ?
what is the largest ever to have living on earth ?
what 's the biggest ever was in the article on the ?
what does for great the sweet , the fight ?
tell does for the cult , are there about
what are the show of <unk> about ?
where do the naga people ?
what do the band members on the cult ?
how do the price of the live on the philadelphia phillies being promoted for ?
tell me about the film story film .
what is the film story about ?
how is the film story about ?
did the film story film

KeyboardInterrupt: ignored

In [21]:
# Fetch the TREC CAsT repository. The cell that reads
# /content/treccastweb/2019/data/evaluation/evaluation_topics_v1.0.json needs
# this clone to exist, so run this cell before it.
!git clone https://github.com/daltonj/treccastweb.git

Cloning into 'treccastweb'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 430 (delta 0), reused 0 (delta 0), pack-reused 425[K
Receiving objects: 100% (430/430), 13.04 MiB | 11.95 MiB/s, done.
Resolving deltas: 100% (186/186), done.
