#installing Dependencies 

In [None]:
# Install Trax; -q keeps pip's output short.
!pip -q install trax

# Install termcolor (imported below as `colored`) and NumPy (imported below as `np`).
!pip install termcolor
!pip install numpy

[K     |████████████████████████████████| 637 kB 5.4 MB/s 
[K     |████████████████████████████████| 4.9 MB 33.4 MB/s 


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import random
import numpy as np

import trax
from trax import layers as tl
from trax.supervised import training
from termcolor import colored

#Accessing the DATA path and vocab file

In [None]:
# File name of the MultiWOZ dialogue dataset (a JSON file inside DATA_DIR).
DATA_FILE = 'data.json'
# Directory on the mounted Google Drive that holds the dataset files.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/MULTIWOZ2 2'
# Empty dictionary intended to hold the dialogue dataset.
# NOTE(review): not referenced by any other cell visible in this notebook.
DIALOGUE_DB = {}
# Directory on the mounted Google Drive for vocabulary files.
# NOTE(review): the visible tokenizer calls pass only `vocab_file`, never this
# directory - confirm whether it is still needed.
VOCAB_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/vocab'

In [None]:
# helper function to load a JSON file
def load_json(directory, file):
    """Load and parse a JSON file.

    Args:
        directory: path of the directory that contains the file.
        file: name of the JSON file inside ``directory``.

    Returns:
        The parsed JSON content (for example a dict or a list).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The handle gets its own name so that it no longer shadows the `file`
    # parameter, and the file is read explicitly as UTF-8 instead of relying
    # on the platform's default encoding.
    with open(f'{directory}/{file}', encoding='utf-8') as json_file:
        return json.load(json_file)

# read the MultiWOZ dialogues from DATA_DIR/DATA_FILE into memory
multiwoz_json = load_json(DATA_DIR, DATA_FILE)

# report how many dialogues were loaded
print(f"Dataset loaded. Number of dialogues: {len(multiwoz_json)}")

Dataset loaded. Number of dialogues: 10438


In [None]:
print(multiwoz_json['SNG01856.json'].keys())

dict_keys(['goal', 'log'])


In [None]:
## Display one sample dialogue, alternating colors between consecutive turns

sample_dialogue = multiwoz_json['SNG0129.json']['log']

# even-indexed turns are printed in blue, odd-indexed turns in red
for turn_index, turn in enumerate(sample_dialogue):
    turn_color = 'red' if turn_index % 2 else 'blue'
    print(colored(turn['text'], turn_color))

[34mHello, I have been robbed.  Can you please help me get in touch with the police?[0m
[31mParkside Police Station is in Parkside, Cambridge. Their number is 01223358966. Anything else I can do for you?[0m
[34mCan I please have the postcode as well?[0m
[31mThe postcode for the Parkside Police Station is CB11JG. Can I help you with anything else?[0m
[34mWas Parkside the address of the police station? If not, can I have the address please?[0m
[31mYes, Parkside is the address.[0m
[34mThank you that will be all for now.[0m
[31mGreat. Thank you for contacting Cambridge Towninfo Centre.[0m
[34mYou were great. Goodbye.[0m
[31mWe are happy to help. Have a good day![0m


In [None]:
# Print the README shipped with the dataset. The path is built from DATA_DIR
# (rather than repeating the hard-coded folder) so the dataset location is
# configured in a single place; the file is read explicitly as UTF-8.
with open(f'{DATA_DIR}/README.json', encoding='utf-8') as readme_file:
    print(readme_file.read())

#####################################################
#####################################################
#  Copyright Cambridge Dialogue Systems Group, 2018 #
#####################################################
#####################################################

Dataset contains the following json files:
1. data.json: the woz dialogue dataset, which contains the conversation  users and wizards, as well as a set of coarse labels for each user turn. Files with multi-domain dialogues have "MUL" in their names. Single domain dialogues have either "SNG" or "WOZ" in their names.
2. restaurant_db.json: the Cambridge restaurant database file, containing restaurants in the Cambridge UK area and a set of attributes.
3. attraction_db.json: the Cambridge attraction database file, contining attractions in the Cambridge UK area and a set of attributes.
4. hotel_db.json: the Cambridge hotel database file, containing hotels in the Cambridge UK area and a set of attributes.
5. train_db.json: th

##Data Pre-Processing

----

---

In [None]:
# Flatten every dialogue into a single string in which each turn is prefixed
# with a speaker tag: even-indexed turns get " Person 1: " and odd-indexed
# turns get " Person 2: ".
speaker_tags = (" Person 1: ", " Person 2: ")

dialogue_sentences_list = [
    "".join(speaker_tags[turn_index % 2] + turn['text']
            for turn_index, turn in enumerate(dialogue_entry['log']))
    for dialogue_entry in multiwoz_json.values()
]

In [None]:
print(len(dialogue_sentences_list))

10438


In [None]:
## Shuffle with a fixed seed so that the train/test split is identical on every run.
## An unseeded shuffle produces a different split each session, so dialogues held
## out for testing in one run could end up in the training set of another.
SPLIT_SEED = 42
N_TEST_DIALOGUES = 500

# A dedicated Random instance leaves the global random state untouched, and
# sample() returns a new list instead of shuffling dialogue_sentences_list in
# place, so re-running this cell always yields the same split.
shuffled_dialogues = random.Random(SPLIT_SEED).sample(dialogue_sentences_list,
                                                      k = len(dialogue_sentences_list))

## Hold out the last N_TEST_DIALOGUES dialogues as the test dataset
train_data = shuffled_dialogues[:-N_TEST_DIALOGUES]
test_data = shuffled_dialogues[-N_TEST_DIALOGUES:]

print("Number of train_data: {}".format(len(train_data)))
print("Number of test_data: {}".format(len(test_data)))

Number of train_data: 9938
Number of test_data: 500


In [None]:
print(train_data[0])
for i in train_data[0:5]:
  print(i)

 Person 1: i need a place to stay Person 2: Sure, do you have an area of town you want to stay in? Person 1: I'd like to stay in the east.  I'm looking for a 4 star hotel.  I don't need any parking. Person 2: Okay, I recommend the Autumn House. It is a guest house in the cheap price range. Would you like me to book a reservation? Person 1: No, thank you.  Person 2: Can I help you with anything else? Person 1: Do you have any places that are hotels, not guest houses? Person 2: There aren't any 4 star hotels available in the east. Person 1: I guess I'll book the guesthouse for 4 people, 4 nights starting wednesday.  Person 2: I'm sorry, after taking a second look, the Autumn House is not located in the east area.  Would A&B Guest House be okay?  It's also 4-star. Person 1: Yes. Need it in the east. Star of 4 and do not care about parking, but do need it to be a hotel not guesthouse.  Person 2: I am sorry but there are no 4 star hotels in the east. Is there another area you would like to 

In [None]:
print(test_data[0])

 Person 1: Can you help me find some entertainment in Cambridge? Person 2: There are many things to do in Cambridge; clubs, museums, churches, boating etc. What would you like to do?  Person 1: I don't care but I need the area, entrance fee, and postcode of entertainment available in town. Person 2: I'd recommend All Saints Church in the centre. Their postcode is cb58bs and they have free entrance. Anything else I can help you with? Person 1: Yes, I am looking for a place to dine.  Can you recommend a restaurant in the expensive price range that serves Swiss food? Person 2: I'm sorry, there are no Swiss restaurants in the area.  Do you have another kind in mind that I could assist with? Person 1: How about British food? Person 2: I have several restaurants which serve British food in different areas of the town. Is there an area you prefer? Person 1: I'm sorry I did not want British food.  Are there any chinese restaurants instead located in the centre area? Person 2: Yes, I have found

In [None]:
# remove leading/trailing whitespace from every dialogue, updating both lists in place
for dataset in (train_data, test_data):
    dataset[:] = [dialogue_text.strip() for dialogue_text in dataset]

#Building DATA Pipeline

In [None]:
def stream_generator(data):
    """Endlessly yield ``(item, item)`` pairs drawn at random from ``data``.

    Each pair holds the same randomly chosen element twice, once as the
    input and once as the target. The generator never terminates.
    """
    while True:
        # one random draw, duplicated into a 2-tuple
        yield (random.choice(data),) * 2

VOCAB_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/vocab'

In [None]:
# Stages applied, in order, to the (input, target) text pairs coming from
# stream_generator. The printed batches further down show the resulting
# (input, target, weights) triples, with zero weights on the zero padding.
pipeline_stages = [
    trax.data.Shuffle(),
    trax.data.Tokenize(vocab_file = 'en_32k.subword'),
    trax.data.FilterByLength(2048),
    trax.data.BucketByLength(boundaries = [128, 256, 512, 1024],
                             batch_sizes = [16, 8, 4, 2, 1]),
    trax.data.AddLossWeights(id_to_mask = 0),
]
data_pipeline = trax.data.Serial(*pipeline_stages)

# one batched stream per dataset split
train_stream = data_pipeline(stream_generator(train_data))
test_stream = data_pipeline(stream_generator(test_data))

In [None]:
# (input, target, weights)
print("train_stream")
print(next(train_stream))
print("\ntest_stream")
print(next(test_stream))

train_stream
(array([[8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0]]), array([[8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0]]), array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32))

test_stream
(array([[8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0]]), array([[8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0],
       [8745,    3,   54, ...,    0,    0,    0

In [None]:
## Check: take one training batch and decode its first example back to text.
## (The original also pulled a batch from test_stream into `y` and overwrote it
## without using it; that wasted fetch is removed.)
check_inputs = next(train_stream)[0]

print(check_inputs.shape)
print(trax.data.detokenize(check_inputs[0], vocab_file = 'en_32k.subword'))

# free the temporary batch
del check_inputs

(4, 512)
Person 1: Let's find me a cheap eatery in the north part of the town. Thank you. Person 2: There are 2 places in the north that are in the cheap price range.  Royal Spice which serves Indian food and Da Vinci Pizzeria.  Do either sound good to you? Person 1: Royal Spice sounds good.  Can you reserve a table for 2 at 17:45 on Sunday? Person 2: Sure reference number is CH9ZLEFO Person 1: I also need a place to stay Person 2: Could you tell me what area you would like to stay in, and if you require parking or wifi? Person 1: I want somewhere with 0 stars and free wifi that is cheap and in the north, just like my restaurant Person 2: I have found the City Centre North B and B guesthouse.  It is cheap and has 0 stars. Person 1: Can you tell me the postcode? Person 2: Sure. The postcode is cb43ht. Can I help you with anything else today? Person 1: I'm going to need a taxi also. Person 2: Okay, please tell me where you will depart from, your destination, and a time that you either wi

#Create The Re-Former Model

##Initiate Training

In [None]:
def ReformerLM(vocab_size = 33000, n_layers = 6, mode = 'train', attention_type = tl.SelfAttention):
    """Build a Trax Reformer language model.

    Thin wrapper that forwards every argument unchanged to
    ``trax.models.reformer.ReformerLM``.

    Args:
        vocab_size: vocabulary size passed to the model.
        n_layers: number of layers passed to the model.
        mode: mode string passed to the model (e.g. 'train').
        attention_type: attention layer class passed to the model.

    Returns:
        The constructed Trax model.
    """
    reformer_kwargs = dict(vocab_size = vocab_size,
                           n_layers = n_layers,
                           mode = mode,
                           attention_type = attention_type)
    return trax.models.reformer.ReformerLM(**reformer_kwargs)

## Check
model = ReformerLM(mode = 'train')
print(str(model))
del model 

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_33000_512
  Dropout
  Serial[
    PositionalEncoding
  ]
  Dup_out2
  ReversibleSerial_in2_out2[
    ReversibleHalfResidualDecoderAttn_in2_out2[
      Serial[
        LayerNorm
      ]
      SelfAttention
    ]
    ReversibleSwap_in2_out2
    ReversibleHalfResidualDecoderFF_in2_out2[
      Serial[
        LayerNorm
        Dense_2048
        Dropout
        Serial[
          FastGelu
        ]
        Dense_512
        Dropout
      ]
    ]
    ReversibleSwap_in2_out2
    ReversibleHalfResidualDecoderAttn_in2_out2[
      Serial[
        LayerNorm
      ]
      SelfAttention
    ]
    ReversibleSwap_in2_out2
    ReversibleHalfResidualDecoderFF_in2_out2[
      Serial[
        LayerNorm
        Dense_2048
        Dropout
        Serial[
          FastGelu
        ]
        Dense_512
        Dropout
      ]
    ]
    ReversibleSwap_in2_out2
    ReversibleHalfResidualDecoderAttn_in2_out2[
      Serial[
        LayerNorm
      ]
      SelfA

In [None]:
def training_loop(ReformerLM, train_generator, eval_generator,
                  output_dir = '/content/drive/MyDrive/ReF-MODEL',
                  n_steps_per_checkpoint = 50):
    """Create the Trax training loop for the Reformer language model.

    Args:
        ReformerLM: callable that builds the model; called as ``ReformerLM(mode = 'train')``.
        train_generator: stream of training batches.
        eval_generator: stream of evaluation batches.
        output_dir: directory handed to ``training.Loop`` for its outputs.
        n_steps_per_checkpoint: checkpoint interval handed to ``training.TrainTask``.

    Returns:
        The configured ``training.Loop`` (not yet run).
    """
    # schedule of the learning rate
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps = 4000, max_value = 0.001)

    # The deprecated tl.CrossEntropyLoss / tl.Accuracy are replaced by their
    # weighted-category successors. The training log recorded with the old
    # loss shows a negative, steadily decreasing "cross-entropy" together with
    # falling accuracy, which a real cross-entropy cannot produce; this
    # indicates the old loss was being fed unnormalized model outputs.
    # NOTE(review): a checkpoint already stored in output_dir from a run with
    # the old loss will presumably be resumed - clear it before retraining.

    # the training task
    train_task = training.TrainTask(labeled_data = train_generator,
                                    loss_layer = tl.WeightedCategoryCrossEntropy(),
                                    optimizer = trax.optimizers.Adam(0.001),
                                    lr_schedule = lr_schedule,
                                    n_steps_per_checkpoint = n_steps_per_checkpoint)

    # the evaluation task
    eval_task = training.EvalTask(labeled_data = eval_generator,
                                  metrics = [tl.WeightedCategoryCrossEntropy(),
                                             tl.WeightedCategoryAccuracy()])

    # create the loop object
    loop = training.Loop(model = ReformerLM(mode = 'train'),
                         tasks = [train_task],
                         eval_tasks = [eval_task],
                         output_dir = output_dir)

    return loop

#SEE Training 

In [None]:
## If training_loop has been run before, uncomment and run the lines below to remove the previous run's saved model, config and train/eval folders
#!rm model.pkl.gz
#!rm config.gin
#!rm -r train
#!rm -r eval

In [None]:
loop = training_loop(ReformerLM, train_stream, test_stream)
loop.run(20000) # taking almost 75 minutes to run 200   

  "jax.host_count has been renamed to jax.process_count. This alias "



Step    900: Ran 25 train steps in 805.37 secs
Step    900: train CrossEntropyLoss | -122.80056000
Step    900: eval  CrossEntropyLoss | -129.02351379
Step    900: eval          Accuracy |  0.07477289

Step    925: Ran 25 train steps in 665.25 secs
Step    925: train CrossEntropyLoss | -132.05345154
Step    925: eval  CrossEntropyLoss | -135.43891907
Step    925: eval          Accuracy |  0.07008670

Step    950: Ran 25 train steps in 541.31 secs
Step    950: train CrossEntropyLoss | -141.64566040
Step    950: eval  CrossEntropyLoss | -147.44853210
Step    950: eval          Accuracy |  0.06678082

Step    975: Ran 25 train steps in 537.52 secs
Step    975: train CrossEntropyLoss | -152.55732727
Step    975: eval  CrossEntropyLoss | -151.58013916
Step    975: eval          Accuracy |  0.06196999

Step   1000: Ran 25 train steps in 543.35 secs
Step   1000: train CrossEntropyLoss | -163.88581848
Step   1000: eval  CrossEntropyLoss | -166.85350037
Step   1000: eval          Accuracy |  0

#Model EVALUATION 

####Helper Function to De-tokenize 
####To print Coloured Dialogue

In [None]:
#####################################################################################3
#!rm -f model/model.pkl.gz  #Location:/content/model/model.weights.npy.gz #/content/model/model.pkl.gz
#/content/model.pkl.gz
#loop = training_loop(ReformerLM, train_stream, test_stream)
#loop.run(20000)

In [None]:
def tokenize(sentence):
    """Tokenize one sentence with the 'en_32k.subword' vocabulary file.

    Returns the first (and only) item produced by ``trax.data.tokenize``
    for the single sentence fed in.
    """
    token_stream = trax.data.tokenize(iter([sentence]), vocab_file = 'en_32k.subword')
    # exactly one sentence goes in, so the first item is the whole result
    return next(iter(token_stream))

def detokenize(tokens):
    """Convert tokens back to text with the 'en_32k.subword' vocabulary file."""
    decoded_text = trax.data.detokenize(tokens, vocab_file = 'en_32k.subword')
    return decoded_text

In [None]:
## The helper function to print out the dialogues in colors

def _split_dialogue_turns(dialogues):
    """Split a transcript into a list of ``(speaker, text)`` turns using the speaker markers."""
    speaker_of_marker = {"Person 1: ": "Person 1", "Person 2: ": "Person 2"}
    # accepts a single string or an iterable of string pieces
    transcript = "".join(dialogues)

    turns = []
    speaker = None   # unknown until the first marker has been seen
    start = 0        # index at which the current turn's text begins
    while True:
        # locate the nearest upcoming speaker marker, if there is one
        hits = [(transcript.find(marker, start), marker) for marker in speaker_of_marker]
        hits = [hit for hit in hits if hit[0] != -1]
        if not hits:
            break
        marker_index, marker = min(hits)
        text = transcript[start:marker_index].strip()
        next_speaker = speaker_of_marker[marker]

        if speaker is not None:
            turns.append((speaker, text))
        elif text:
            # Text ahead of the very first marker: turns alternate, so it is
            # attributed to the speaker other than the one about to talk.
            turns.append(("Person 2" if next_speaker == "Person 1" else "Person 1", text))

        speaker = next_speaker
        start = marker_index + len(marker)

    # whatever follows the last marker is the (possibly unfinished) final turn;
    # with no marker at all the text is shown as Person 1, as before
    turns.append((speaker or "Person 1", transcript[start:].strip()))
    return turns


def print_colored_dialogue(dialogues):
    """Print a dialogue turn by turn: Person 1 in blue, Person 2 in red.

    Args:
        dialogues: the transcript, as a string (or an iterable of string
            pieces) in which turns are introduced by the markers
            "Person 1: " and "Person 2: ".

    Fixes over the previous version: the first turn is no longer printed
    with a doubled "Person 1: Person 1: " label (the buffer was not cleared
    after the first marker), turns are attributed correctly when the text
    starts with a "Person 2: " marker, and the unused ``result`` list is gone.
    """
    speaker_colors = {"Person 1": "blue", "Person 2": "red"}
    for speaker, text in _split_dialogue_turns(dialogues):
        print(colored(f"{speaker}: {text}", speaker_colors[speaker]))

In [None]:
# pull one (inputs, targets, weights) batch from test_stream
test_batch = next(test_stream)
test_x, test_y, test_w = test_batch
print(f"Batch_size = {test_x.shape[0]}")

# keep only the first example, preserving a leading batch axis of size 1
sample_x = test_x[:1]

print("\nInput dialogue:")
input_text = detokenize(sample_x[0])
print_colored_dialogue(input_text)

# run the evaluation model and take the highest-scoring token at each position
pred = loop.eval_model(sample_x)
pred_token = pred.argmax(axis = -1)

print("\nOutput dialogue:")
output_text = detokenize(pred_token[0])
print_colored_dialogue(output_text)

#Real TEST
#Real Test!!!!!!!!

---

In [None]:
def generate_next_token(current_tokens, model):

    """
    Generate the next token

    Inputs
            current_tokens: <list, tuple or 1-D array of int> tokens generated so far
            model: <trax model> callable that takes an (inputs, targets) pair and
                   returns a pair whose first element is indexed as
                   [batch, position, vocabulary]

    Output
            next_token: <int> the next token generated by the model
    """

    # Accept plain Python sequences as well as NumPy arrays; the previous
    # version called .tolist() and therefore failed on the lists its own
    # docstring promised to support.
    tokens = np.asarray(current_tokens, dtype = np.int64).reshape(-1)

    # number of tokens generated so far
    current_tokens_length = len(tokens)

    # Smallest power of 2 strictly greater than the current length, so the
    # padded sequence always has a slot for the token being predicted.
    # Integer bit arithmetic avoids the float rounding of ceil(log2(...)).
    final_padded_length = 1 << current_tokens_length.bit_length()

    # calculate the number of zeros to pad
    to_pad_length = final_padded_length - current_tokens_length

    # pad with zeros on the right and add a leading batch axis of size 1
    padded_current_tokens = np.pad(tokens, (0, to_pad_length), mode = 'constant')[None, :]

    # run the model; the same padded sequence is passed as input and as target
    model_output, _ = model((padded_current_tokens, padded_current_tokens))

    # model_output is indexed as (batch, position, vocabulary); the scores at
    # position `current_tokens_length` are the prediction for the next token
    next_token_scores = model_output[0, current_tokens_length, :]

    # select the token with the largest score
    return int(np.argmax(next_token_scores))

In [None]:
def extend_dialogue(current_dialogue, model, maximum_number_extension = 100):
    """Extend a dialogue by generating tokens one at a time.

    Args:
        current_dialogue: <str> the dialogue text used as the seed.
        model: model handed to ``generate_next_token`` for each prediction.
        maximum_number_extension: <int> number of tokens to append.

    Returns:
        <str> the detokenized seed plus the generated continuation.
    """
    # keep the tokens in a plain list so that appending is cheap
    generated_tokens = np.asarray(tokenize(current_dialogue)).tolist()

    # Generate exactly `maximum_number_extension` tokens. The previous
    # `while count <= maximum` loop produced one token more than requested.
    for _ in range(maximum_number_extension):
        # generate_next_token is given an array view of the tokens so far
        next_output_token = generate_next_token(np.array(generated_tokens, dtype = np.int64), model)
        generated_tokens.append(next_output_token)

    # requested number of tokens reached, output the detokenized dialogue
    complete_dialogue = trax.data.detokenize(np.array(generated_tokens, dtype = np.int64),
                                             vocab_file = 'en_32k.subword')

    return complete_dialogue

In [None]:
## Example where Person 1 asks for an avocado for no reason

dialogue_seed = "Person 1: Um... Can I have some avocado? Person 2: "

complete_dialogue = extend_dialogue(dialogue_seed, loop.eval_model, maximum_number_extension = 100)

print_colored_dialogue(complete_dialogue)