In [2]:
import torch as t
import csv
import ast
import re

  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

## 1. Visualise the first few lines of the tsv files

In [3]:
movie_lines_filename = "cornell_movie_corpus/movie_lines.tsv"
lines_to_visualise = 10

# Open the TSV file in read mode
with open(movie_lines_filename, "r", encoding="utf-8") as file:
    # Iterate over each line in the file
    for i, line in enumerate(file):
        # Remove any leading/trailing whitespace and split the line by tabs
        row = line.strip().split("\t")
        
        # Process the row as desired
        print(row)
        
        # Check if the desired number of lines has been reached
        if i + 1 >= lines_to_visualise:
            break

    print("\n")
    print(type(row))

['L1045', 'u0', 'm0', 'BIANCA', 'They do not!']
['L1044', 'u2', 'm0', 'CAMERON', 'They do to!']
['L985', 'u0', 'm0', 'BIANCA', 'I hope so.']
['L984', 'u2', 'm0', 'CAMERON', 'She okay?']
['L925', 'u0', 'm0', 'BIANCA', "Let's go."]
['L924', 'u2', 'm0', 'CAMERON', 'Wow']
['L872', 'u0', 'm0', 'BIANCA', "Okay -- you're gonna need to learn how to lie."]
['L871', 'u2', 'm0', 'CAMERON', 'No']
['"L870', 'u0', 'm0', 'BIANCA', 'I\'m kidding.  You know how sometimes you just become this ""persona""?  And you don\'t know how to quit?"']
['L869', 'u0', 'm0', 'BIANCA', 'Like my fear of wearing pastels?']


<class 'list'>


In [4]:
movie_conv_filename = "cornell_movie_corpus/movie_conversations.tsv"
lines_to_visualise = 10

# Open the TSV file in read mode
with open(movie_conv_filename, "r", encoding="utf-8") as file:
    # Iterate over each line in the file
    for i, line in enumerate(file):
        # Remove any leading/trailing whitespace and split the line by tabs
        row = line.strip().split("\t")
        
        # Process the row as desired
        print(row)
        
        # Check if the desired number of lines has been reached
        if i + 1 >= lines_to_visualise:
            break
     

    print("\n")
    print(type(row))

['u0', 'u2', 'm0', "['L194' 'L195' 'L196' 'L197']"]
['u0', 'u2', 'm0', "['L198' 'L199']"]
['u0', 'u2', 'm0', "['L200' 'L201' 'L202' 'L203']"]
['u0', 'u2', 'm0', "['L204' 'L205' 'L206']"]
['u0', 'u2', 'm0', "['L207' 'L208']"]
['u0', 'u2', 'm0', "['L271' 'L272' 'L273' 'L274' 'L275']"]
['u0', 'u2', 'm0', "['L276' 'L277']"]
['u0', 'u2', 'm0', "['L280' 'L281']"]
['u0', 'u2', 'm0', "['L363' 'L364']"]
['u0', 'u2', 'm0', "['L365' 'L366']"]


<class 'list'>


## 2. Create relevant dictionaries based on the tsv file datas 

In [5]:
movie_lines = {}
movie_lines_fields = ['line ID','user ID','movie ID', 'char name', 'text']

# Open the TSV file in read mode
with open(movie_lines_filename, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        
        # Remove any leading/trailing whitespace and split the line by tabs
        line_parts = line.strip().split("\t")
        
        if(len(line_parts) > 4):             # Handling the bad datas where there are empty texts
            
            # Extract the individual parts
            line_id = line_parts[0]
            user_id = line_parts[1]
            movie_id = line_parts[2]
            character_name = line_parts[3]
            text = line_parts[4]

        # Create a dictionary for the line
        line_dict = {
            'lineID': line_id,
            'userID': user_id,
            'movieID': movie_id,
            'charName': character_name,
            'text': text
        }

        # Add the line dictionary to the result dictionary
        movie_lines[line_id] = line_dict


In [6]:
# movie_lines is a dictionary of dictionaries
# Print the first 3 of them
movie_lines

count = 0
for key, value in movie_lines.items():
    print(key, value)
    count += 1
    if count == 3:
        break

L1045 {'lineID': 'L1045', 'userID': 'u0', 'movieID': 'm0', 'charName': 'BIANCA', 'text': 'They do not!'}
L1044 {'lineID': 'L1044', 'userID': 'u2', 'movieID': 'm0', 'charName': 'CAMERON', 'text': 'They do to!'}
L985 {'lineID': 'L985', 'userID': 'u0', 'movieID': 'm0', 'charName': 'BIANCA', 'text': 'I hope so.'}


In [7]:
# First element of the movie_lines dictionary
list(movie_lines.items())[0]

('L1045',
 {'lineID': 'L1045',
  'userID': 'u0',
  'movieID': 'm0',
  'charName': 'BIANCA',
  'text': 'They do not!'})

In [8]:
# Demonstrating the regex function used
import re

line_numbers_str = "['L194' 'L195' 'L196' 'L197']"

# Extract the line numbers using regular expressions
line_numbers_list = re.findall(r"'(\w+)'", line_numbers_str)

print(line_numbers_list)
print(type(line_numbers_list))

['L194', 'L195', 'L196', 'L197']
<class 'list'>


In [9]:
movie_conv_fields = ['charID1','charID2','movieID', 'lineIDs']
conversations = []

# Open the TSV file in read mode
with open(movie_conv_filename, "r", encoding="utf-8") as file:
    
    # Iterate over each line in the file
    for i, line in enumerate(file):
        conv_dict = {}                            # Declare an empty dictionary
        row = line.strip().split("\t")            #  ['u0', 'u2', 'm0', "['L194' 'L195' 'L196' 'L197']"]
        for j, conv_field in enumerate(movie_conv_fields):
            if(conv_field == 'lineIDs'):
                row[j] = re.findall(r"'(\w+)'", row[j])   # matches any alphanumeric characters (\w+) enclosed in single quotes
            conv_dict[conv_field] = row[j] 
        
        conversations.append(conv_dict)

In [10]:
# conversations is a list of dictionaries
# First element of the conversations list
conversations[0]

{'charID1': 'u0',
 'charID2': 'u2',
 'movieID': 'm0',
 'lineIDs': ['L194', 'L195', 'L196', 'L197']}

In [11]:
c = [
    {
        'charID1': 'u0',
        'charID2': 'u2',
        'movieID': 'm0',
        'lineIDs': ['L194', 'L195', 'L196', 'L197']
    },
    # other dictionaries
    ]

m_lines = {
    'L194': {'lineID': 'L194', 'userID': 'u0', 'movieID': 'm0', 'charName': 'BIANCA', 'text': 'They do not!'},
    'L195': {'lineID': 'L195', 'userID': 'u2', 'movieID': 'm0', 'charName': 'CAMERON', 'text': 'They do to!'},
    'L500': {'lineID': 'L500', 'userID': 'u2', 'movieID': 'm0', 'charName': 'CAMERON', 'text': 'They do to!'},
    # other dictionaries
}

for item in c:
    line_ids = item['lineIDs']
    lines = []
    for line_id in line_ids:
        line = m_lines.get(line_id)
        if line:
            lines.append(line)
    item['lines'] = lines

print(c)

[{'charID1': 'u0', 'charID2': 'u2', 'movieID': 'm0', 'lineIDs': ['L194', 'L195', 'L196', 'L197'], 'lines': [{'lineID': 'L194', 'userID': 'u0', 'movieID': 'm0', 'charName': 'BIANCA', 'text': 'They do not!'}, {'lineID': 'L195', 'userID': 'u2', 'movieID': 'm0', 'charName': 'CAMERON', 'text': 'They do to!'}]}]


## 3. Merging the processed conversations list and movie_lines dictionary 
- Add a new key 'lines' in each dictionary of the conversations list of dictionaries

In [12]:
for conv in conversations:
    line_ids = conv['lineIDs']
    lines = []
    for line_id in line_ids:
        line = movie_lines.get(line_id)
        if line:
            lines.append(line)
    conv['lines'] = lines

In [13]:
conversations[0]

{'charID1': 'u0',
 'charID2': 'u2',
 'movieID': 'm0',
 'lineIDs': ['L194', 'L195', 'L196', 'L197'],
 'lines': [{'lineID': 'L194',
   'userID': 'u0',
   'movieID': 'm0',
   'charName': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'},
  {'lineID': 'L195',
   'userID': 'u2',
   'movieID': 'm0',
   'charName': 'CAMERON',
   'text': "Well I thought we'd start with pronunciation if that's okay with you."},
  {'lineID': 'L196',
   'userID': 'u0',
   'movieID': 'm0',
   'charName': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.'},
  {'lineID': 'L197',
   'userID': 'u2',
   'movieID': 'm0',
   'charName': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"}]}

## 4. Extract q/a pairs

In [14]:
qa_pairs = []

for conv in conversations:
    lines = conv['lines']
    for i in range(len(lines) - 1):
        q = lines[i]['text'].strip()       # remove trailing and leading whitespace characters 
        a = lines[i+1]['text'].strip()
        if len(q) > 0 and len(a) > 0:       # filter empty lists
            qa_pairs.append([q, a])

        
len(qa_pairs)

217150

In [15]:
# qa_pairs being a list of lists, we first visualise the first 3

qa_pairs[:3]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well I thought we'd start with pronunciation if that's okay with you."],
 ["Well I thought we'd start with pronunciation if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]]

## 5. Writing a new tsv file using the qa_pairs

In [16]:
filename = "cornell_movie_corpus/formatted_qa_pairs.tsv"

print("Writting a newly formatted file .....")
with open(filename, "w", encoding="utf-8") as file:
    for pair in qa_pairs:
        q, a = pair
        file.write(f"{q}\t{a}\n")     # format the QnA with a TAB character b/w and a newline b/w every Qna pair 

print("File written successfully.")

Writting a newly formatted file .....
File written successfully.


In [17]:
with open(filename, "rb") as file:
    lines = file.readlines()
for line in lines[:5]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell I thought we'd start with pronunciation if that's okay with you.\r\n"
b"Well I thought we'd start with pronunciation if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No no it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"


## 6. Processing the words
- Defining a WordIndexer class
- Define helper functions to preprocess the text such as - convert unicode to ASCII, normalise the strings
- Read from the formatted qa pairs tsv file and store the preprocessed qa pairs in pair 

In [18]:
# Defining tokens
PAD_token = 0             # for padding short sentences
SOS_token = 1             # start of sentence token
EOS_token = 2             # End of sentence

class WordIndexer:
    def __init__(self, corpus_name):
        """
        Initializes a WordIndexer object.

        Args:
        - corpus_name: A string representing the name of the corpus or dataset.
                                                                                        """
        self.corpus_name = corpus_name
        self.special_tokens = ['PAD', 'SOS', 'EOS']
        self.word_to_index = {'PAD' : PAD_token, 'SOS': SOS_token, 'EOS': EOS_token}
        self.word_counts = {}           # Stores the count of each word in the corpus
        self.index_to_word = {PAD_token:'PAD', SOS_token:'SOS', EOS_token:'EOS'}
        self.num_words = 3             # to include the 3 special tokens: PAD, SOS, EOS

    def add_word(self, word):
        """
        Adds a word to the vocabulary/WordIndexer object

        Args:
        - word: A string representing the word to be added.
                                                                    """
        if word not in self.word_to_index:
            index = len(self.word_to_index) + 1
            self.word_to_index[word] = index
            self.index_to_word[index] = word
            self.word_counts[word] = 1
            self.num_words += 1
        else:
            self.word_counts[word] += 1

    def add_sentence(self, sentence):
        """
            Adds all words in a sentence to the vocabulary/WordIndexer object

        Args:
        - sentence: A string representing the sentence.
                                                                """
        words = sentence.split()
        for word in words:
            self.add_word(word)
    

    def trim_less_freq_words(self, threshold):
        """
        Remove words below a certain count threshold and update the word-index mapping.

        Args:
        - threshold: int : the minimum count for a word to be retained.
        """
        words_to_remove = []
        for word, count in self.word_counts.items():
            if count < threshold:
                words_to_remove.append(word)

        new_word_to_index = {'PAD': PAD_token, 'SOS': SOS_token, 'EOS': EOS_token}
        new_index_to_word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        new_word_counts = {}
        new_num_words = 3                    # to include the 3 special tokens: PAD, SOS, EOS

        next_index = 3
        for word, index in self.word_to_index.items():
            if word not in words_to_remove:
                if word not in self.special_tokens:
                    new_word_to_index[word] = next_index
                    new_index_to_word[next_index] = word
                    new_word_counts[word] = self.word_counts[word]
                    next_index += 1
                    new_num_words += 1

        self.word_to_index = new_word_to_index
        self.index_to_word = new_index_to_word
        self.word_counts = new_word_counts
        self.num_words = new_num_words


    # Retrieve the index of a word from the vocabulary
    def get_word_index(self, word):  
        return self.word_to_index.get(word)

    # Retruns the word corresponding to an index from the vocabulary
    def get_index_word(self, index):
        return self.index_to_word.get(index)


### A sample Run to test/show the working of the WordIndexer class

In [19]:
W = WordIndexer("sample")
W.add_sentence("My name is Donal. The name is Sherlock Holmes")
W.word_to_index

{'PAD': 0,
 'SOS': 1,
 'EOS': 2,
 'My': 4,
 'name': 5,
 'is': 6,
 'Donal.': 7,
 'The': 8,
 'Sherlock': 9,
 'Holmes': 10}

In [20]:
W.word_counts

{'My': 1,
 'name': 2,
 'is': 2,
 'Donal.': 1,
 'The': 1,
 'Sherlock': 1,
 'Holmes': 1}

In [21]:
W.index_to_word

{0: 'PAD',
 1: 'SOS',
 2: 'EOS',
 4: 'My',
 5: 'name',
 6: 'is',
 7: 'Donal.',
 8: 'The',
 9: 'Sherlock',
 10: 'Holmes'}

In [22]:
W.num_words

10

In [23]:
W.trim_less_freq_words(2)

In [24]:
W.word_to_index

{'PAD': 0, 'SOS': 1, 'EOS': 2, 'name': 3, 'is': 4}

In [25]:
!pip install unidecode



In [26]:
from unidecode import unidecode

# Thanks to stackoverflow "https://stackoverflow.com/a/518232/2809427"
def unicode_to_ascii(text):
    """
    Convert Unicode text to ASCII by transliterating non-ASCII characters to their closest ASCII equivalents.
    
    Args:
        text (str): The Unicode text to convert.
    Returns:
        str: The converted ASCII text.
                                                                                                                """
    return unidecode(text)

In [27]:
unicode_to_ascii('北亰')

'Bei Jing '

In [28]:
unicode_to_ascii('François')

'Francois'

In [29]:
unicode_to_ascii('kožušček')

'kozuscek'

In [30]:
def normalize_string(text):
    """
    Normalize a string by converting it to lowercase, adding space before punctuation marks,
    removing non-letter characters, and removing sequences of whitespace.
    
    Args:
        text (str): The input string to normalize.
    
    Returns:
        str: The normalized string.
    """
    # Convert the text to lowercase
    normalized_text = unicode_to_ascii(text.lower())         # no need to strip()

    # Add space before punctuation marks
    normalized_text = re.sub(r"([.,!?])", r" \1", normalized_text)

    # Remove non-letter characters
    normalized_text = re.sub(r"[^a-zA-Z.,!? ]", "", normalized_text)

    # Remove sequences of whitespace
    normalized_text = re.sub(r"\s+", " ", normalized_text)

    return normalized_text.strip()

In [31]:
normalize_string("    AbC123aa!s's    dd?    ")

'abcaa !ss dd ?'

In [32]:
def read_tsv_file(file_path):
    data = []
    
    print("Reading and processing file ...")
    with open(file_path, 'r', encoding='utf-8') as tsv_file:
        reader = csv.reader(tsv_file, delimiter='\t')
        for row in reader:
            if len(row) == 2:
                q = normalize_string(row[0])
                a = normalize_string(row[1])
                data.append([q, a])
    print("Done reading.")
    return data

In [33]:
data_filename = "cornell_movie_corpus/formatted_qa_pairs.tsv"
pairs = read_tsv_file(data_filename)

Reading and processing file ...


Done reading.


In [34]:
# Visualise the first 2 pairs
pairs[0:2]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought wed start with pronunciation if thats okay with you .'],
 ['well i thought wed start with pronunciation if thats okay with you .',
  'not the hacking and gagging and spitting part . please .']]

In [35]:
len(pairs)

217150

## Filtering the text 

In [36]:
def filter_qa_pairs(data, q_threshold=12, a_threshold=12):
    """
    Filters the question-answer pairs beyond a threshold length of words,
    Args:
    data = (the list of lists containing the normalized question-answer pairs) 
    q_threshold, a_threshold = (the maximum number of words allowed for the question and answer).
                                                                                                        """
    filtered_data = []
    for pair in data:
        q = pair[0]
        a = pair[1]
        if len(q.split()) <= q_threshold and len(a.split()) <= a_threshold:
            filtered_data.append(pair)
    
    return filtered_data


In [37]:
print(f"There are {len(pairs)} pairs/conversations in the dataset")
filtered_pairs = filter_qa_pairs(pairs)
print(f"After filtering (by threshold) , there are {len(filtered_pairs)} pairs/conversations")

There are 217150 pairs/conversations in the dataset


After filtering (by threshold) , there are 100847 pairs/conversations


### Instantiate an object of the class WordIndexer

In [38]:
voc = WordIndexer("cornell_movie_corpus")

In [39]:
for pair in filtered_pairs:
    q, a = pair[0], pair[1]
    voc.add_sentence(q)
    voc.add_sentence(a)

print(f"Count of words in the voc = {voc.num_words}")
for pair in filtered_pairs[:5]:
    print(pair)
    

Count of words in the voc = 28151
['no no its my fault we didnt have a proper introduction', 'cameron .']
['gosh if only we could find kat a boyfriend . . .', 'let me see what i can do .']
['cesc ma tete . this is my head', 'right . see ? youre ready for the quiz .']
['thats because its such a nice one .', 'forget french .']
['how is our little find the wench a date plan progressing ?', 'well theres someone i think might be']


## Remove those qa pairs if any word of  'q' or 'a' occurs less than a threshold value

In [40]:
def filter_by_word_frequency(voc, qa_pairs, threshold):

    # Remove words below the threshold from the class instance
    voc.trim_less_freq_words(threshold)

    # Filter QA pairs based on word frequency
    keep_pairs = []
    for pair in qa_pairs:
        q, a = pair[0], pair[1]
        q_words = q.split()
        a_words = a.split()

        # Check if any word in 'q' or 'a' is below the threshold
        if any(voc.word_counts.get(word, 0) < threshold for word in q_words + a_words):
            continue                                    # Skip this QA pair if any word is below the threshold

        # Append the QA pair to the filtered list
        keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(qa_pairs), len(keep_pairs), len(keep_pairs) / len(qa_pairs)))
    
    return keep_pairs



In [41]:
threshold = 3    # trial and error
pairs = filter_by_word_frequency(voc, filtered_pairs, threshold)

Trimmed from 100847 pairs to 83359, 0.8266 of total


In [42]:
pairs[0:5]

[['no no its my fault we didnt have a proper introduction', 'cameron .'],
 ['gosh if only we could find kat a boyfriend . . .',
  'let me see what i can do .'],
 ['thats because its such a nice one .', 'forget french .'],
 ['there .', 'where ?'],
 ['you have my word . as a gentleman', 'youre sweet .']]

# Data Preparation
- voc = The WordIndexer Object instantiated, it contains the words of the dataset and corresponding indexes  
- pairs = The question-answer pairs after all data preprocessing, in the form of [['q1', 'a1'], [  ], ......,[  ]]

In [43]:
def sentence2indexes(voc, sentence):
    """ Given a sentence as input and returns a list of indexes 
        corresponding to the words in the sentence,
        followed by the EOS token.                               
    Args: 
        voc : The WordIndexer class instantiated.
        sentence : string : The sentence
    returns:
        list: the list of indices
                                                                    """
    words = sentence.split()
    indexes = [voc.get_word_index(word) for word in words]
    indexes.append(EOS_token)
    
    return indexes

In [44]:
pairs[0][0]

'no no its my fault we didnt have a proper introduction'

In [45]:
sentence2indexes(voc, pairs[0][0])

[3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2]

In [46]:
# TEST ON A BATCH_SIZE = 6
batch_size = 8
inp = []
op = []
for pair in pairs[:batch_size]:
    inp.append(pair[0])
    op.append(pair[1])
    
indexes = [sentence2indexes(voc, sentence) for sentence in inp]
indexes

[[3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2],
 [15, 16, 17, 7, 18, 19, 20, 10, 21, 14, 14, 14, 2],
 [40, 41, 4, 42, 10, 43, 44, 14, 2],
 [59, 14, 2],
 [61, 9, 5, 62, 14, 63, 10, 64, 2],
 [66, 14, 2],
 [9, 80, 72, 34, 2],
 [26, 82, 34, 2]]

In [47]:
def zero_pad_rows(index_list, pad_token=PAD_token):
    """
    Zero-pads the rows in the index_list so that all rows have the same length.
    Transposes the resulting list of lists.

    Args:
        index_list (list[list[int]]): List of index lists to be zero-padded and transposed.
        pad_token (int, optional): The padding token to use. Defaults to PAD_token = 0

    Returns:
        list[tuples]: Transposed and zero-padded list of tuples.
                    Shape = ()

    """
    max_length = max(len(row) for row in index_list)
    
    # Zero pad the rows
    padded_list = [row + [pad_token] * (max_length - len(row)) for row in index_list]
    
    # Transpose the list[list]
    transposed = list(zip(*padded_list))
    
    return transposed

### Testing the function - zero_pad_rows( ) and sentence2index()

In [48]:
padded_list = zero_pad_rows(indexes)
padded_list

[(3, 15, 40, 59, 61, 66, 9, 26),
 (3, 16, 41, 14, 9, 14, 80, 82),
 (4, 17, 4, 2, 5, 2, 72, 34),
 (5, 7, 42, 0, 62, 0, 34, 2),
 (6, 18, 10, 0, 14, 0, 2, 0),
 (7, 19, 43, 0, 63, 0, 0, 0),
 (8, 20, 44, 0, 10, 0, 0, 0),
 (9, 10, 14, 0, 64, 0, 0, 0),
 (10, 21, 2, 0, 2, 0, 0, 0),
 (11, 14, 0, 0, 0, 0, 0, 0),
 (12, 14, 0, 0, 0, 0, 0, 0),
 (2, 14, 0, 0, 0, 0, 0, 0),
 (0, 2, 0, 0, 0, 0, 0, 0)]

In [49]:
rows = len(padded_list)
columns = len(padded_list[0])

print("Shape is (max_length, batch_size) = ", (rows, columns))

Shape is (max_length, batch_size) =  (13, 8)


In [50]:
# This will later help us save space and time during training as it can be stored in 1 bit also
def binaryMatrix(padded_list):
    """ Given a padded matrix, converts it into a binary matrix
    by replacing non-zero elements with 1, else 0
    Args:
        padded_list (list of tuples): A list of tuples representing a matrix.
    Returns:
        list[list]: The binary matrix .
                                                                        """
    # convert non-zero elements to 1 and 0s to 0
    binary_matrix = [[1 if element > 0 else 0 for element in row] for row in padded_list]
    
    return binary_matrix

In [51]:
## Test the function
binaryMatrix(padded_list)

[[1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 0, 1, 0, 1, 1],
 [1, 1, 1, 0, 1, 0, 1, 0],
 [1, 1, 1, 0, 1, 0, 0, 0],
 [1, 1, 1, 0, 1, 0, 0, 0],
 [1, 1, 1, 0, 1, 0, 0, 0],
 [1, 1, 1, 0, 1, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0]]

In [52]:
def generateInputTensor(word_indexer, sentence_list):
    # Convert sentences to indexes
    indexes_batch = [sentence2indexes(voc, sentence) for sentence in sentence_list]    
    
    # Get the lengths of each sentences + 1 (EOS_token)
    lengths = torch.tensor([len(index) for index in indexes_batch])
    
    # Zero-pad the index list and transpose it, so as to be able to pass as batches
    padded_batches = zero_pad_rows(indexes_batch)
        
    # Convert the transposed matrix to a LongTensor
    input_tensor = torch.LongTensor(padded_batches)
    
    return input_tensor, lengths

In [53]:
def generateOutputTensor(word_indexer, sentence_list):
    # Convert sentences to indexes
    indexes_batch = [sentence2indexes(voc, sentence) for sentence in sentence_list]    
    
    # Get the maximum target/output length in the batch
    max_target_len = max([len(index) for index in indexes_batch])
    
    # Zero-pad the index list and transpose it, so as to be able to pass as batches
    padded_batches = zero_pad_rows(indexes_batch)
    
    # Get the binary mask
    binary_mask = binaryMatrix(padded_batches)
    binary_mask = torch.ByteTensor(binary_mask)
        
    # Convert the transposed matrix to a LongTensor
    output_tensor = torch.LongTensor(padded_batches)
    
    return output_tensor, binary_mask, max_target_len

In [54]:
def batch2Train(word_indexer, qa_batches):
    """
    Convert question-answer batches into input and output tensors for training.

    Arguments:
        word_indexer (WordIndexer): An instance of the WordIndexer class.
        qa_batches (list of lists): A list of question-answer batches, where each batch is a list of two elements: question and answer.

    Returns:
        input_tensor (torch.LongTensor): The input tensor containing the indexes of the questions after padding.
        input_lengths (torch.Tensor): The tensor containing the lengths of each input sequence.
        output_tensor (torch.LongTensor): The output tensor containing the indexes of the answers after padding.
        output_mask (torch.ByteTensor): The binary mask indicating the positions with non-zero elements in the output tensor.
        max_target_len (int): The maximum length of the target/output sequence.

    """
    # Sort the batches in descending order of question length as in number of words
    sorted_batches = sorted(qa_batches, key=lambda x: len(x[0].split()), reverse=True)
    
    question_batch, answer_batch = [], []
    for qa_batch in sorted_batches:
        question_batch.append(qa_batch[0])
        answer_batch.append(qa_batch[1])
        
    # generate input tensor and input lengths
    input_tensor, input_lengths = generateInputTensor(word_indexer, question_batch)
    
    # get output tensor, binary mask, and max target length
    output_tensor, output_mask, max_target_len = generateOutputTensor(word_indexer, answer_batch)
    
    return input_tensor, input_lengths, output_tensor, output_mask, max_target_len

In [55]:
import random
import torch 

In [56]:
# Small Example run
example_batch_size = 6
print("--------------- Visualising pairs[0:2] ------------")
print(pairs[:2])
print("\n")
qa_batches = [random.choice(pairs) for _ in range(example_batch_size)]
print("------------------ Printing qa_batches --------------- \n", qa_batches)
print("\n")

batches = batch2Train(voc, qa_batches)
input_tensor, input_lengths, output_tensor, output_mask, max_target_len = batches  # unpack the tuple

print("---------- VALIDATING THE ABOVE FUNCTIONS DEFINED ---------------")
print("Input tensor")
print(input_tensor)
print("Input lengths:", input_lengths)

print("Output tensor")
print(output_tensor)

print("Binary output/target mask")
print(output_mask)

print("Max target length: ", max_target_len)


--------------- Visualising pairs[0:2] ------------
[['no no its my fault we didnt have a proper introduction', 'cameron .'], ['gosh if only we could find kat a boyfriend . . .', 'let me see what i can do .']]


------------------ Printing qa_batches --------------- 
 [['i know .', 'how do you know ?'], ['home .', 'someone waiting for you ?'], ['thank god .', 'first time i ever heard that .'], ['how about someone who really really liked shostakovich ?', 'are you asking me to marry you ?'], ['you dont like the tune find another station .', 'what are you hiding danny ?'], ['i am arthur king of the britons .', 'my liege . . . forgive me . . .']]


---------- VALIDATING THE ABOVE FUNCTIONS DEFINED ---------------
Input tensor
tensor([[   47,    61,    26,    26,   605,   137],
        [  125,   141,   613,    74,   225,    14],
        [   55,    68,  4481,    14,    14,     2],
        [  130,    38,  2391,     2,     2,     0],
        [  409,  2285,   106,     0,     0,     0],
        

# Building the Model

The model is built using these 2 blocks, which is discussed briefly below
### Encoder
-  aedewfwefw

### Decoder
- wefewfwefw

In [57]:
import torch 
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [58]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

In [59]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [60]:
CUDA

False

In [61]:
device

device(type='cpu')

## Encoder
- edkjewbfowen

In [62]:
class EncoderRNN(nn.Module):
    def __init__(self, voc , hidden_size, n_layers=1, dropout=0):
        """
        Encoder RNN module for sequence encoding using GRU.

        Args:
            voc (Vocabulary): The Vocabulary object containing relevant information like num_words
            hidden_size (int): Size of GRU's hidden state 
            n_layers (int): Number of GRU layers. Defaults to 1.
            dropout (float): Dropout probability between GRU layers.set to be 0, if n_layers = 1
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.input_size = voc.num_words

        # Initialize word embeddings 
        self.embedding = nn.Embedding(self.input_size, hidden_size)
        
        # Input size of the GRU is set to hidden size because our 
        # input word is a word embedding with num of features = hidden_size (it can be anything though)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=n_layers, bidirectional=True, 
                          dropout=(0 if n_layers == 1 else dropout))

        
    def forward(self, input_seq, input_lengths, hidden_state=None):
        """
        Forward pass of the EncoderRNN.

        Args:
            input_seq (torch.Tensor): Input sequence of word indexes. Shape: (max_length, batch_size).
            input_lengths (list): List of sequence lengths in the batch.
            hidden (torch.Tensor, optional): Initial hidden state of the GRU. Defaults to None.

        Returns:
            Outputs : hidden states at each time steps of the GRU. Shape: (max_len, batch_size, hidden_size * 2).
            hidden_state: GRU's Final hidden state. Shape: (num_layers * num_directions, batch_size, hidden_size).
        """
        
        # Word indexes to word embeddings
        embedded = self.embedding(input_seq)  # embedded shape: (sequence_length, batch_size, hidden_size)
        # Pack padded batch of sequences for RNN module, used to handle variable input lengths
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)

        outputs, hidden_state = self.gru(packed, hidden_state)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)  # outputs shape: (sequence_length, batch_size, hidden_size * 2)

        # Sum the forward and backward hidden states of the bidirectional GRU
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        
        # outputs: the output features h_t from the last layer of the GRU, at each timestep
        return outputs, hidden_state          # hidden_state from the last tine step(hn)

    

In [63]:
encoder=EncoderRNN(voc , 300, n_layers=1, dropout=0)
ans=encoder.forward(input_tensor, input_lengths=input_lengths, hidden_state=None)

In [64]:
ans[0].size()

torch.Size([10, 6, 300])

## Decoder


- This code for the attention

In [65]:
# Attention:
# encoder outputs 
# Method 1, using score as dot product between encoder's outputs and decoder's hidden state
class Attention(nn.Module):
    
    def forward(self,encoderOutputs,hidden):
        return torch.sum(hidden * encoderOutputs,dim=2)

**Decoder RNN**

In [68]:
# Decoder => uses attention b/w encoder's outputs and decoder's hidden state
class decoderRNN(nn.Module):
    # INFO:
    # 1. outputSize : It contains value as voc.num_words
    # 2. hidden : It contains value as last hidden state of encoder (2 x 6 x 300)
    # 3. hidden_size : It is equal to 300 => size of the vector that we are making
    # 4. voc : It is the complete vocabulary
    def __init__(self,voc,hidden_size,n_layers,batch_size, dropout = 0.1):
        super(decoderRNN,self).__init__()
        self.input_size =voc.num_words
        self.outputSize= voc.num_words
        self.hidden_size=hidden_size
        self.dropout=dropout
        self.n_layers=n_layers
        self.batch_size=batch_size
        # 1.
        # Initialize word embeddings 
        # This function comverts the input (1,batch_size)[1st time step] to a vector of dimetion 300 based on voc indexes
        self.embedding = nn.Embedding(self.input_size, hidden_size) # embedded shape: (batch_size, hidden_size)
        # after embedding, we get size as 6 x 300
        self.embedding_dropout = nn.Dropout(dropout)
        # 2.
        #  Now we do the GRU stuff
        #  We output both the output and the hidden_state, hidden state we further use for finding attention
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=n_layers, 
                          dropout=(0 if n_layers == 1 else dropout))
        # 3.
        # Now the attention
        # A  simple attention based on dot product between encoder's outputs and hidden state of decoder
        self.attn = Attention()
        
        # 
        self.FFNN1=nn.Linear(16,self.input_size)

        self.FFNN2=nn.Linear(self.input_size,self.outputSize)
    # variables:

    # 1.inputData : this a single time step , dim of 1 x 6
    # 2. encoderLastHidden : this is the encoder's last hidden state , dim of 2 x 6 x 300
    # encoderOutputs
    # encoderOutputs
    def forward(self,inputData,encoderLastHidden,encoderOutputs):
        # step 1 -  here u embed the 1 x batch_size [1,213,23,4,44,432] to a batch_size x hidden(300) vectors
        embedding = self.embedding(inputData)
        embedding = self.embedding_dropout(embedding)
        # step 2  - GRU thing
        decoderOutput, hiddenState = self.gru(embedding, encoderLastHidden[0])

        # step 3 - Now apply the attention to the encoder outputs and hidden state
        attentionWeights=self.attn.forward(decoderOutput,encoderOutputs)
        #print(attentionWeights.size(),encoderOutputs.size())
        # step 4 - multply the wiegts and encoder outputs, this gives context
        context=attentionWeights.unsqueeze(1).bmm(encoderOutputs)
        context=context.transpose(0,1)[0].transpose(0,1)
        # step 5 - here we have the decoderOutput of dim 300 and also we concatenate contex to this amking a 2* self.input_size to make a vector cantaining the context and output
        # Now, make a FFNN between 2* input_size and inputsize
      
        ConcatedVector_input = torch.cat((decoderOutput.transpose(0,1), context), 1)
        ConcatedVector_output = torch.tanh(self.FFNN1(ConcatedVector_input))
        
        # step 6 - Finally, make a FFNN between 2* input_size and inputsize
        # finalOutput is of dimension voc.num_words
        finalOutput=self.FFNN2(ConcatedVector_output)
        
        # step 7 - apply softmax 
        finalOutput=F.softmax(finalOutput, dim=1)

        # now return the hidden stata and output
        return finalOutput,hiddenState


*decoder run*

In [69]:

# create an instance of decoder
mydecoder= decoderRNN(voc,hidden_size=300,dropout=0,n_layers=6,batch_size=6)
decoderOutput=mydecoder.forward(input_tensor[0],encoderLastHidden=ans[1],encoderOutputs=ans[0])
decoderOutput


(tensor([[4.0916e-05, 6.1009e-05, 1.4920e-04,  ..., 1.0211e-04, 7.0448e-05,
          3.9080e-05],
         [5.5520e-05, 6.0984e-05, 1.0417e-04,  ..., 7.4741e-05, 5.6895e-05,
          4.1878e-05],
         [1.0117e-04, 6.9762e-05, 7.2131e-05,  ..., 7.4872e-05, 4.9256e-05,
          9.1082e-05],
         ...,
         [4.9443e-05, 7.6325e-05, 1.0631e-04,  ..., 8.6037e-05, 8.0963e-05,
          5.2117e-05],
         [4.3179e-05, 7.5870e-05, 1.0020e-04,  ..., 8.7553e-05, 7.2238e-05,
          5.9592e-05],
         [1.0162e-04, 6.6477e-05, 7.9408e-05,  ..., 4.2883e-05, 5.7729e-05,
          8.8069e-05]], grad_fn=<SoftmaxBackward0>),
 tensor([[-0.0577, -0.5455,  0.5709,  ..., -0.1032, -0.0665, -0.2428],
         [ 0.0377,  0.2740,  0.0081,  ..., -0.0994,  0.0461,  0.0799],
         [-0.1138, -0.0840, -0.0678,  ...,  0.1221,  0.0186, -0.0135],
         [-0.0096, -0.0528,  0.0030,  ...,  0.0310,  0.0209, -0.0273],
         [-0.1412,  0.0401,  0.0836,  ..., -0.0111,  0.0143, -0.0152],
       

# Training the Model