# Imports

pip install torch torchtext --index-url [https://download.pytorch.org/whl/cu124](https:\download.pytorch.org\whl\cu124)

pip install torchtext 

pip install seaborn

pip install matplotlib 

pip install nltk

pip install numpy

pip install pandas

<span style="color: var(--vscode-foreground);">https://pytorch.org/tutorials/beginner/chatbot_tutorial.html</span>  

https://colab.research.google.com/drive/1B16-YpENJl1QLenYdu5OHUb\_WSYKIs89#scrollTo=7D4bTGub71za

In [6]:
# General imports
import os
import re
import string
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from collections import Counter
import random
import itertools
import unicodedata

# Dataframe processing imports
import pandas
import numpy

# Pytorch related imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [7]:
# Loadall Data
# you would need a sizable GPU for this to be feasible
corpus_loadall = False

# With, or without attention
with_attention = True

# Set save locations
model_save_dir = 'models'
corpus_name = 'ubuntu-dialogue-corpus'

# Folder to store the visuals
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)

# Set device to either use CPU or CUDA, if installes
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


# Dataset Creation

- Download dataset if not existing
- Load the data into a dataframe
    - Specify the corpus\_loadall paramter to define how much data to load
- Extract sentence pairs

In [8]:
SourceURL = 'https://www.kaggle.com/api/v1/datasets/download/rtatman/ubuntu-dialogue-corpus'

LocalFiles = {'toc': 'Data/toc.csv',
              'dialogueText': 'Data/Ubuntu-dialogue-corpus/dialogueText.csv',
              'dialogueText_196': 'Data/Ubuntu-dialogue-corpus/dialogueText_196.csv',
              'dialogueText_301': 'Data/Ubuntu-dialogue-corpus/dialogueText_301.csv',}

# Check if data folder exists, if not create it
if not os.path.exists('Data'):
    os.makedirs('Data')

# Check if data is already downloaded, if not download it
for file in LocalFiles:
    if os.path.exists(LocalFiles[file]):
        print(f'File {LocalFiles[file]} already downloaded')
    else:
        print(f'Downloading {SourceURL}')
        resp = urlopen(SourceURL)
        DocZip = ZipFile(BytesIO(resp.read()))
        DocZip.extractall('Data')

Downloading https://www.kaggle.com/api/v1/datasets/download/rtatman/ubuntu-dialogue-corpus


File Data/Ubuntu-dialogue-corpus/dialogueText.csv already downloaded
File Data/Ubuntu-dialogue-corpus/dialogueText_196.csv already downloaded
File Data/Ubuntu-dialogue-corpus/dialogueText_301.csv already downloaded


In [9]:
# If corpus_loadall is false, only load dialogueText.csv, otherwise load all files
corpus_df = pandas.DataFrame()

if corpus_loadall == True:
    print("Loading all corpus files")
    for f in LocalFiles:
        data = pandas.read_csv(LocalFiles[f], encoding='utf-8')
        corpus_df = pandas.concat([corpus_df, data])
elif corpus_loadall == False:
    print("Loading dialogueText.csv")
    data = pandas.read_csv(LocalFiles['dialogueText'], encoding='utf-8')
    corpus_df = pandas.concat([corpus_df, data])

Loading dialogueText.csv


In [10]:
# order the columns by dialogueID and date
corpus_df.sort_values(by=['dialogueID', 'date'], inplace=True)
corpus_df.reset_index(drop=True, inplace=True)

# Drop unwanted columns and rows, save memory!
corpus_df.drop(['folder', 'date', 'from', 'to'], axis=1, inplace=True)
corpus_df.dropna(subset=['text'], inplace=True)

print(corpus_df.shape)
corpus_df.head()

(1038235, 2)


Unnamed: 0,dialogueID,text
0,1.tsv,"Also guys, I'm trying to get into my FIrefox p..."
1,1.tsv,are you logged in as 'root' ?
2,1.tsv,no.
3,10.tsv,ugh ;( http://planet.ubuntulinux.org seems to...
4,10.tsv,"perhaps if you define *broken* a little, we ca..."


In [19]:
# Extract pairs of sentences from the conversations
# The output is a list of pairs of sentences, one pair per row

# Group sentences by dialogueID using dictionary comprehension
sentence_pairs = {
    dialogue_id: sentences.tolist()
    for dialogue_id, sentences in corpus_df.groupby('dialogueID')['text']
}

# Generate sentence pairs from the dictionary
pairs = []
for dialogue_id, sentences in sentence_pairs.items():
    if len(sentences) >= 2:  # Ensure at least two sentences for a dialogue
        for i in range(len(sentences) - 1):
            pairs.append((sentences[i], sentences[i + 1]))


# Write sentence pairs to a dataframe
pairs_df = pandas.DataFrame(pairs, columns=['question', 'answer'])
print(pairs_df.shape)
pairs_df.head()

(692127, 2)


Unnamed: 0,question,answer
0,"Also guys, I'm trying to get into my FIrefox p...",are you logged in as 'root' ?
1,are you logged in as 'root' ?,no.
2,ugh ;( http://planet.ubuntulinux.org seems to...,"perhaps if you define *broken* a little, we ca..."
3,"perhaps if you define *broken* a little, we ca...",hypa7ia: Every single entry is by Tollef Fog H...
4,ohh to late,http://www.ubuntulinux.org/ubuntu/login.png IS...


# Preprocessing

Let's now create a vocabulary and load query/response sentence pairs into memory.

Remember that we are dealing with sequences of words, which do not have an implicit mapping to a discrete numerical space. A mapping must be created by mapping each unique word in the dataset to an index value.

The Voc class keeps a mapping from words to indexes, a reverse mapping of indexes to words, a count of each word and a total word count. The class provides methods for adding a word to the vocabulary (addWord), adding all words in a sentence (addSentence) and trimming infrequently seen words (trim).

In [20]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)
            

Let's also write some functions to clean the text.

1. Convert Unicode strings to ASCII using unicodeToAscii
2. Convert all letters to lowercase and trim all non-letter characters except for basic punctuation (normalizeString)
3. Filter out sentences with length greater than the MAX\_LENGTH threshold (filterPairs) to facilitate training convergence.

In [21]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(dataframe, corpus_name):
    # Create pairs from the DataFrame
    pairs = dataframe.values.tolist()
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(dataframe, corpus_name):
    print("Start preparing training data ...")
    voc, pairs = readVocs(dataframe, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs

In [23]:
voc, pairs = loadPrepareData(pairs_df, corpus_name)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Read 692127 sentence pairs


Trimmed to 263288 sentence pairs
Counting words...


Counted words: 155430

pairs:
["are you logged in as 'root' ?", ' no.']
['ohh to late', 'http://www.ubuntulinux.org/ubuntu/login.png IS SO G-A-Y']
['http://www.ubuntulinux.org/ubuntu/login.png IS SO G-A-Y', 'easy, now']
['see bug 67085', 'sorry, typo, ignore that... try this one bug 767085']
['sorry, typo, ignore that... try this one bug 767085', 'so update libc6?']
['HAI THAR', 'go awai']
['go awai', 'hello']
['it would install anotehr bootloader', 'like what?']
['Does anyone use anything like that?', "grep 'what you are looking for' file"]
['make videos of what? your screen?', 'oo... no idea then, sorry']


Another tactic that is beneficial to achieving faster convergence during training is trimming rarely used words out of our vocabulary.

Decreasing the feature space will also soften the difficulty of the function that the model must learn to approximate. We will do this as a two-step process:

Trim words used under MIN\_COUNT threshold using the voc.trim function. Filter out pairs with trimmed words.

In [24]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 35965 / 155427 = 0.2314


Trimmed from 263288 pairs to 154913, 0.5884 of total


In [25]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    #print(l)
    #for sentence in l:
    #    print(indexesFromSentence(voc, sentence))
    #print()
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    #print(padVar)
    #print(lengths)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    #print()
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)


input_variable: tensor([[  277,   321,   126,   638,   214],
        [  627,    37,   344,  3421,     2],
        [  132,   794,  6333,     2,     0],
        [ 1488,    13,    87,     0,     0],
        [    6,   141,  1236,     0,     0],
        [ 1635,  1973,   196,     0,     0],
        [  200,   200,     2,     0,     0],
        [10643,  1228,     0,     0,     0],
        [    2,     2,     0,     0,     0]])
lengths: tensor([9, 9, 7, 3, 2])
target_variable: tensor([[  139,    85,   409,   387,   214],
        [  865,   271,   132,     2,   105],
        [ 1435,   171,   292,     0,     2],
        [    3,    13, 30296,     0,     0],
        [    4,  3102,  7696,     0,     0],
        [ 2367, 17122,  1143,     0,     0],
        [    2,     2,  6767,     0,     0],
        [    0,     0,   196,     0,     0],
        [    0,     0,     2,     0,     0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  Tr

# Model Training

In [27]:
# Encoder
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [28]:
# Decoder, without attention mechanism, for the seq2seq model
class DecoderRNN(nn.Module):
    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Convert input word index to embedding
        # embedded = self.embedding(input_step).unsqueeze(0)
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
       #embedded = self.embedding_dropout(embedded)

        # Forward pass through GRU
        output, hidden = self.gru(embedded, last_hidden)

        # Output layer (next word prediction)
        output = self.out(output.squeeze(0))
        # Softmax for probability distribution
        output = F.softmax(output, dim=1)

        return output, hidden

In [29]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [30]:
# Decoder, with attention mechanism, for the seq2seq model
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        # Softmax for probability distribution
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [31]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(DEVICE)
    return loss, nTotal.item()

In [32]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(DEVICE)
    target_variable = target_variable.to(DEVICE)
    mask = mask.to(DEVICE)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(DEVICE)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # print('max_target_len: ', max_target_len)

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(DEVICE)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal


    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [33]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name):

    # Load batches for each  iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    #if loadFilename:
    #    start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory): 
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [35]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, dtype=torch.long) * SOS_token
        decoder_input = decoder_input.to(DEVICE)
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], dtype=torch.long)
        all_tokens = all_tokens.to(DEVICE)
        all_scores = torch.zeros([0])
        all_scores = all_scores.to(DEVICE)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [36]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(DEVICE)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

In [38]:
def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [39]:
# Configure models
if with_attention:
    model_name = 'cb_model_attn'
    #attn_model = 'dot'
    attn_model = 'general'
    #attn_model = 'concat'
else:
    model_name = 'cb_model'
    attn_model = 'None'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Print
print(f'Model will be {model_name} with attention {attn_model} and hidden size {hidden_size}')

Model will be cb_model_attn with attention general and hidden size 500


In [40]:
print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
#if loadFilename:
#    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
# With/without attention
if with_attention:
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
else:
    decoder = DecoderRNN(embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)

#if loadFilename:
#    encoder.load_state_dict(encoder_sd)
#    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(DEVICE)
decoder = decoder.to(DEVICE)
print('Models built and ready to go!')

# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 250
save_every = 1000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

#if loadFilename:
#    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
#    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

Building encoder and decoder ...


Models built and ready to go!
Building optimizers ...


In [41]:
# Run training iterations
print(f"Starting Training {model_name} for attention method {attn_model}!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, model_save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name)

Starting Training cb_model_attn for attention method general!


Initializing ...
Training...


Iteration: 250; Percent complete: 6.2%; Average loss: 6.5348


Iteration: 500; Percent complete: 12.5%; Average loss: 5.9308


Iteration: 750; Percent complete: 18.8%; Average loss: 5.6548


Iteration: 1000; Percent complete: 25.0%; Average loss: 5.5109


Iteration: 1250; Percent complete: 31.2%; Average loss: 5.3727


Iteration: 1500; Percent complete: 37.5%; Average loss: 5.2546


Iteration: 1750; Percent complete: 43.8%; Average loss: 5.1722


Iteration: 2000; Percent complete: 50.0%; Average loss: 5.0899


Iteration: 2250; Percent complete: 56.2%; Average loss: 5.0177


Iteration: 2500; Percent complete: 62.5%; Average loss: 4.9515


Iteration: 2750; Percent complete: 68.8%; Average loss: 4.8702


Iteration: 3000; Percent complete: 75.0%; Average loss: 4.8169


Iteration: 3250; Percent complete: 81.2%; Average loss: 4.7789


Iteration: 3500; Percent complete: 87.5%; Average loss: 4.7092


Iteration: 3750; Percent complete: 93.8%; Average loss: 4.6394


Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5909


In [89]:
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = True
checkpoint_iter = 4000
loadFilename = os.path.join(model_save_dir, model_name, corpus_name,
                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                            '{}_checkpoint.tar'.format(checkpoint_iter))

# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']

if loadFilename:
    embedding.load_state_dict(embedding_sd)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

  checkpoint = torch.load(loadFilename)


In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(encoder, decoder, searcher, voc)

Bot:          


Bot: can anyone help me with ubuntu


Bot: sudo apt-get install kubuntu-desktop
