In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mustafacicek/south-park-scripts-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/south-park-scripts-dataset


In [2]:
import pandas as pd
from __future__ import absolute_import, division, print_function, unicode_literals
import torch
import torch.nn as nn
from torch import optim
from torch.jit import script, trace
import torch.nn.functional as F
import csv
import random

import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tovarischsukhov/southparklines")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/southparklines


In [4]:
# View dataframe
southpark_df = pd.read_csv("/kaggle/input/southparklines/All-seasons.csv")
southpark_df.head(10)

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."
5,10,1,Chef,Wow!\n
6,10,1,Mrs. Garrison,Chef?? What kind of questions do you think adv...
7,10,1,Chef,What's the meaning of life? Why are we here?\n
8,10,1,Mrs. Garrison,I hope you're making the right choice.\n
9,10,1,Cartman,I'm gonna miss him. I'm gonna miss Chef and I...


In [5]:
# View first few lines
print("South Park lines:")
for i in range(0,5):
    print("Line #",i+1)
    print(southpark_df.Line[i])

South Park lines:
Line # 1
You guys, you guys! Chef is going away. 

Line # 2
Going away? For how long?

Line # 3
Forever.

Line # 4
I'm sorry boys.

Line # 5
Chef said he's been bored, so he joining a group called the Super Adventure Club. 



In [6]:
# Find number of seasons
seasons_list = southpark_df["Season"].tolist()
s_list = []
for season in seasons_list:
  try:
    s_int = int(season)
    s_list.append(s_int)
  except:
    pass
season_set = set(s_list)
print(season_set)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}


In [7]:
# Find where ep_index end per season
ep_per_season = [13,18,17,17,14,17,15,14,14,14,14,14,14,14,14,14,10,10]
seasons_dictionary = {"season_1":{}, 
                      "season_2":{}, 
                      "season_3":{}, 
                      "season_4":{}, 
                      "season_5":{}, 
                      "season_6":{}, 
                      "season_7":{}, 
                      "season_8":{}, 
                      "season_9":{}, 
                      "season_10":{}, 
                      "season_11":{}, 
                      "season_12":{}, 
                      "season_13":{}, 
                      "season_14":{}, 
                      "season_15":{}, 
                      "season_16":{}, 
                      "season_17":{}, 
                      "season_18":{}}

count_index = 0
prev_season = 0
for a_season in range(1, 19):
    curr_season = "season_" + str(a_season)
    total_eps_for_season = ep_per_season[a_season - 1]
    
    for an_ep in range(1, total_eps_for_season + 1):  # Adjust loop to include the last episode
        for i in range(len(southpark_df["Season"])):
            # Ensure that we are only processing valid integers for the season
            try:
                season_value = int(southpark_df["Season"][i])
            except ValueError:
                continue  # Skip invalid season values
            
            if season_value > prev_season:
                if season_value == a_season and int(southpark_df["Episode"][i]) == an_ep:
                    count_index += 1

        sub_dictionary = seasons_dictionary[curr_season]
        sub_dictionary[an_ep] = count_index
    prev_season = a_season


In [8]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering 
    the format of words. We will do force expansion on some words and
    transform some of them to root form'''

    # Lowercase
    text = text.lower()
    
    # Substitute text
    text = re.sub(r"\n", "",  text)
    text = re.sub(r"[-()]", "", text)
    text = re.sub(r"\.", " .", text)
    text = re.sub(r"\!", " !", text)
    text = re.sub(r"\?", " ?", text)
    text = re.sub(r"\,", " ,", text)

    # Force expansion
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)

    # Root word transformation
    text = re.sub(r"ohh", "oh", text)
    text = re.sub(r"ohhh", "oh", text)
    text = re.sub(r"ohhhh", "oh", text)
    text = re.sub(r"ohhhhh", "oh", text)
    text = re.sub(r"ohhhhhh", "oh", text)
    text = re.sub(r"ahh", "ah", text)
    
    return text

def extractSentencePairs(conversations):
  """
  1 conversation = 1 episode of 1 season
  conversations = ALL eps in that season


  Iterate over all the lines of the conversation
  The final line of the conversation would be unable to find a pair hence it 
  would be ignored.
  """
  qa_pairs = []
  for conversation in conversations:
      for i in range(len(conversation)-1):
          inputLine = conversation[i].strip()
          targetLine = conversation[i+1].strip() + " \r"
          if inputLine and targetLine:
              qa_pairs.append([inputLine, targetLine])
  return qa_pairs

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

In [9]:
# Get ALL seasons of data 
conversations = []
start_index = 0
for a_key in seasons_dictionary:
  index_dictionary = seasons_dictionary[a_key]

  for ep_key in index_dictionary:
    ep_end_index = index_dictionary[ep_key]
    conversation = []
    for i in range(start_index,ep_end_index):
      line = southpark_df["Line"][i]
      # Level 1 preprocessing - basic data processing
      line = clean_text(line)
      conversation.append(line)
    start_index = ep_end_index
    conversations.append(conversation)

In [10]:
import os
import codecs
import csv

corpus_name = "southpark_corpus"
new_path = os.path.join(corpus_name, "formatted_southpark_lines_v2.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Ensure the directory exists
os.makedirs(corpus_name, exist_ok=True)

print("\nWriting conversation pairs to new file")
with open(new_path, 'w', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

print("\nSample lines from file:")
printLines(new_path)


Writing conversation pairs to new file

Sample lines from file:
b'you guys , you guys ! chef is going away .\tgoing away ? for how long ? \r\n'
b'going away ? for how long ?\tforever . \r\n'
b'forever .\ti am sorry boys . \r\n'
b'i am sorry boys .\tchef said he is been bored , so he joining a group called the super adventure club . \r\n'
b'chef said he is been bored , so he joining a group called the super adventure club .\twow ! \r\n'
b'wow !\tchef ? ? what kind of questions do you think adventuring around the world is gonna answer ? ! \r\n'
b'chef ? ? what kind of questions do you think adventuring around the world is gonna answer ? !\tthat is the meaning of life ? why are we here ? \r\n'
b'that is the meaning of life ? why are we here ?\ti hope you are making the right choice . \r\n'
b'i hope you are making the right choice .\ti am gonna miss him .  i am gonna miss chef and i . . .and i do not know how to tell him ! \r\n'
b'i am gonna miss him .  i am gonna miss chef and i . . .and

In [11]:
PAD_token = 0 # pad short sentences
SOS_token = 1 # sentence start token
EOS_token = 2 # sentence end token

class Voc:
  def __init__(self, name):
    self.name = name
    self.trimmed = False
    self.word2index  = {}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.word2count = {}
    self.num_words = 3 
  
  def addSentences(self, sentence):
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words +=1
    else:
      # Increase word count if word appears before
      self.word2count[word] += 1
    
  # remove words below count threshold
  def trim(self, min_count):
    if self.trimmed:
      self.trimmed = True
    
    keep_words = []
    
    for k, v in self.word2count.items():
      if v>=min_count:
        keep_words.append(k)
    
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.num_words = 3
    
    for word in keep_words:
      self.addWord(word)

In [12]:
MAX_LENGTH = 15

def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def normalizeString(s):
  s = unicode_to_ascii(s.lower().strip())
  s  = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  s = re.sub(r"\s+", r" ", s).strip()
  return s

def readVocs(datafile, corpus_name):
  print("Reading Lines ...")
  
  lines = open(datafile, encoding = 'utf-8').\
      read().strip().split('\n')
  
  pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
  voc = Voc(corpus_name)
  return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
  # Input sequences need to preserve the last word for EOS token
  if len(p) == 2:
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
  return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus_name, datafile):
  print("Start preparing training data ...")
  voc, pairs = readVocs(datafile, corpus_name)
  print("Read {!s} sentence pairs".format(len(pairs)))

  
  pairs = filterPairs(pairs)

  print("Trimmed to {!s} sentence pairs".format(len(pairs)))
  print("Counting words...")
  for pair in pairs:
      voc.addSentences(pair[0])
      voc.addSentences(pair[1])
  print("Counted words:", voc.num_words)
  return voc, pairs

In [13]:
# Load/Assemble voc and pairs
corpus_name = "southpark_corpus"
new_path = os.path.join(corpus_name, "formatted_southpark_lines_v2.txt")
datafile = new_path
voc, pairs = loadPrepareData(corpus_name, datafile)

# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading Lines ...
Read 72138 sentence pairs
Trimmed to 31510 sentence pairs
Counting words...
Counted words: 13729

pairs:
['you guys you guys ! chef is going away .', 'going away ? for how long ?']
['going away ? for how long ?', 'forever .']
['forever .', 'i am sorry boys .']
['that is the meaning of life ? why are we here ?', 'i hope you are making the right choice .']
['byebye !', 'goodbye !']
['goodbye !', 'so long !']
['so long !', 'so long chef !']
['so long chef !', 'goodbye chef !']
['goodbye chef !', 'goodbye chef ! have a great time with the super adventure club !']
['goodbye chef ! have a great time with the super adventure club !', 'goodbye ! . .']


In [14]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

Trimmed from 31510 pairs to 22843, 0.7249 of total


In [15]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
  m = []
  for i, seq in enumerate(l):
    m.append([])
    for token in seq:
      if token == PAD_token:
        m[i].append(0)
      else:
        m[i].append(1)
  return m

# Input Sequence Padding 
def inputVar(l, voc):
  indx_b = [indexesFromSentence(voc, sentence) for sentence in l]
  lengths = torch.tensor([len(indexes) for indexes in indx_b])
  padList = zeroPadding(indx_b)
  padVar = torch.LongTensor(padList)
  return padVar, lengths

# Output Sequence Padding 
def outputVar(l, voc):
  """"
  This function will output:
  1. Padded target 
  2. Padding mask
  3. Max target length
  """
  indx_b = [indexesFromSentence(voc, sentence) for sentence in l]
  max_target_len = max([len(indexes) for indexes in indx_b])
  padList = zeroPadding(indx_b)
  mask = binaryMatrix(padList)
  mask = torch.ByteTensor(mask)
  padVar = torch.LongTensor(padList)
  return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
  pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
  input_batch, output_batch = [], []
  for pair in pair_batch:
    input_batch.append(pair[0])
    output_batch.append(pair[1])
  inp, lengths = inputVar(input_batch, voc)
  output, mask, max_target_len = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Observe output and check to make sure it's correct
# DO NOT MOVE ON if tensor lengths are not matching
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[  65,   54,  115,  302,  138],
        [  16,   10,   10,   10, 1527],
        [ 133,   13,  115,   27, 1067],
        [  36,  149,  153,  413,   49],
        [  37,    3,  140,   49,  242],
        [ 253,  103,    3,  680, 1527],
        [ 765,   49,    4,   10,    2],
        [   5,  308,   10,    2,    0],
        [  16,   21,    2,    0,    0],
        [ 308, 1053,    0,    0,    0],
        [  16,   11,    0,    0,    0],
        [ 133,    2,    0,    0,    0],
        [   5,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([14, 12,  9,  8,  7])
target_variable: tensor([[  55,   10, 1606,  115, 1527],
        [  69,   10,  118,   10, 1527],
        [ 778,  984,  188,    2, 1527],
        [ 253, 1102,  450,    0, 1527],
        [ 107,   26,    3,    0, 1527],
        [   3,    3,    4,    0,    2],
        [ 124,  532, 1498,    0,    0],
        [  10,  107,  422,    0,    0],
        [  52,  388,   12,    0,    0],
        

In [16]:
class EncoderRNN(nn.Module):
  def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
    super(EncoderRNN, self).__init__()
    self.n_layers = n_layers
    self.hidden_size = hidden_size
    self.embedding = embedding
    
    # Initialize GRU
    self.gru = nn.GRU(hidden_size, 
                      hidden_size, 
                      n_layers,
                      dropout=(0 if n_layers == 1 else dropout), 
                      bidirectional=True)

  def forward(self, input_seq, input_lengths, hidden=None):
    # Convert word indexes to embeddings
    embedded = self.embedding(input_seq)
    
    # Pack padded batch of sequences for RNN module
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())  # Ensure lengths are on CPU
    # Forward pass through GRU
    outputs, hidden = self.gru(packed, hidden)
    # Unpack padding
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
    # Sum bidirectional GRU outputs
    outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
    # Return output and final hidden state
    return outputs, hidden

In [17]:
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [18]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1) 
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [19]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)))
    # Ensure mask is a BoolTensor
    mask = mask.bool()  # Convert mask to BoolTensor
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()


In [20]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals
     

In [21]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, 
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, 
               print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [41]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be the first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.LongTensor([[SOS_token]]).to(device)  # Use the global `device` variable
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)  # Use the global `device` variable
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores


In [42]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)
    lengths = lengths.to(device)
    tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words
def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))
    
        except KeyError:
            # print("Error: Encountered unknown word.")
            print("I don't know!")

In [27]:
model_name = 'cb_model_2'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
# loadFilename = os.path.join(corpus_name, 
#                             '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size) +'{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']
    

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [28]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 10000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

save_dir = "new_training"
# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, 
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, 
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.7292
Iteration: 2; Percent complete: 0.0%; Average loss: 8.4705
Iteration: 3; Percent complete: 0.0%; Average loss: 8.1237
Iteration: 4; Percent complete: 0.0%; Average loss: 7.6646
Iteration: 5; Percent complete: 0.1%; Average loss: 6.9429
Iteration: 6; Percent complete: 0.1%; Average loss: 6.3988
Iteration: 7; Percent complete: 0.1%; Average loss: 6.4572
Iteration: 8; Percent complete: 0.1%; Average loss: 6.2491
Iteration: 9; Percent complete: 0.1%; Average loss: 6.1648
Iteration: 10; Percent complete: 0.1%; Average loss: 6.2857
Iteration: 11; Percent complete: 0.1%; Average loss: 5.7436
Iteration: 12; Percent complete: 0.1%; Average loss: 6.0297
Iteration: 13; Percent complete: 0.1%; Average loss: 5.7042
Iteration: 14; Percent complete: 0.1%; Average loss: 5.4005
Iteration: 15; Percent complete: 0.1%; Average loss: 5.2832
Iteration: 16; Percent complete: 0.2%

In [43]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Evaluates a single input sentence and returns the response."""
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)
    lengths = lengths.to(device)
    
    encoder.eval()
    decoder.eval()
    
    tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive loop to chat with the trained bot."""
    print("Chat with the bot! Type 'q' or 'quit' to end the conversation.")
    
    encoder.eval()
    decoder.eval()

    while True:
        try:
            input_sentence = input("> ")
            if input_sentence.lower() in ['q', 'quit']:
                print("Goodbye!")
                break
                
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            
            output_words = [word for word in output_words if word not in ['EOS', 'PAD']]
            print("Bot:", ' '.join(output_words))
            
        except KeyError as e:
            print("I didn't understand that. Could you rephrase?")

# Setup for evaluation (ensure you define or import GreedySearchDecoder)
print("Setting up Greedy Search Decoder...")
searcher = GreedySearchDecoder(encoder, decoder)



Setting up Greedy Search Decoder...


In [44]:
# Begin chatbot interaction
evaluateInput(encoder, decoder, searcher, voc)

Chat with the bot! Type 'q' or 'quit' to end the conversation.


>  Hi


Bot: hi .


>  How Are you ?


Bot: bad .


>  why ?


Bot: cause just that that is just the way it is .


>  no you need to be good ?


Bot: i am not seeing anybody .


>  why ?


Bot: cause just that that is just the way it is .


>  what is your name ?


Bot: jakov ?


>  what ?


Bot: yeah they left .


>  i am bilal !


I didn't understand that. Could you rephrase?


>  my name is bilal


I didn't understand that. Could you rephrase?


>  q


Goodbye!


In [45]:
save_path = '/kaggle/working'
torch.save(encoder.state_dict(), f"{save_path}/encoder.pth")
torch.save(decoder.state_dict(), f"{save_path}/decoder.pth")
torch.save(embedding.state_dict(), f"{save_path}/embedding.pth")


In [46]:
import pickle

with open(f"{save_path}/voc.pkl", "wb") as f:
    pickle.dump(voc, f)
