In [41]:
import re

In [42]:
with open('intermedium/loaded_movie_lines.txt') as f:
    read_in = f.read().strip().split('\n')

In [43]:
print(f'Total number of lines: {len(read_in)}')
read_in[:5]

Total number of lines: 221282


["Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.",
 "Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.",
 "Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 "You're asking me out.  That's so cute. What's your name again?\tForget it.",
 "No, no, it's my fault -- we didn't have a proper introduction ---\tCameron."]

In [44]:
def formalize(s):
    """
    Formalize the given string by removing all non-alphabet characters

    Args:
        s(<str>): the given string
    Return:
        (<str>): A processed string
    """
    # Turn a Unicode string to plain ASCII
    temp_str = (s.encode('ascii', 'ignore')).decode('utf-8')

    # Trim and remove all non-letter characters using regular expression
    temp_str = re.sub(r"([.!?])", r" \1", s)
    temp_str = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  

    # Lowercase for final return
    return temp_str.lower().strip()

In [45]:
# Split each line and formalize each part to build the conversation pairs for later processing
pairs = [[formalize(s) for s in l.split('\t')] for l in read_in]

In [46]:
pairs[0:5]

[['can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.',
  'well i thought we d start with pronunciation if that s okay with you.'],
 ['well i thought we d start with pronunciation if that s okay with you.',
  'not the hacking and gagging and spitting part. please.'],
 ['not the hacking and gagging and spitting part. please.',
  'okay... then how bout we try out some french cuisine. saturday? night?'],
 ['you re asking me out. that s so cute. what s your name again?',
  'forget it.'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron.']]

In [47]:
def filter(pairs, max_length=20):
    """
    Filter pairs that under max_length threshold

    Args:
        pairs (list<list<str>>): The given list of pairs
        max_length (int): The threshold for filtering pairs, default in 10
    Return:
        (list<list<str>>): The processed list of pairs that both length of sentences are under given max_length
    """
    return [p for p in pairs 
            if len(p[0].split(' ')) < max_length and len(p[1].split(' ')) < max_length]

In [48]:
filtered_pairs = filter(pairs, max_length=20)
print(f'After filtering: {len(filtered_pairs)}') 

After filtering: 160465


In [49]:
PAD = 0
SOS = 1
EOS = 2

class IndexMapping:
    """
    Map each unique word that encounter in the pairs to an index value
    Then represent and store the discrete space by a dictionary
    """
    def __init__(self) -> None:
        # self.name = name
        self.word2index = {} # encode the word into an integer
        self.index2word = {PAD: '<P>', SOS: '<S>', EOS: '<E>'} # decode the integer into a word
        self.word2count = {} # count the occurence time of words
        self.n_words = 3 # Count the SOS and EOS, then accumulate when new words come
        
    def add_word(self, word):
        # add the word into the dictionary and record its occurence time
        if word not in self.word2index:
            # If the word is new, then add it in the dictionary and count its number as 1
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word 
            self.word2count[word] = 1            
            self.n_words += 1                    
        else:
            # If the word existed, just change its count number
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

In [50]:
num_map = IndexMapping()

In [51]:
for p in filtered_pairs:
    num_map.add_sentence(p[0])
    num_map.add_sentence(p[1])

print(f'Counted words: {num_map.n_words}')

Counted words: 70811


In [52]:
def trim_mapping(mapping, min_count):
    """
    Trim the infrequently seen words in the given mapping 
    decided by the given minimum counts

    Args:
        mapping (dict): The given dict for triming
        min_count (int): The threshold for the minimum count for triming, 
                         the word count in the original dict below the min_count will be removed
    """
    keep_words = [] # Store all remaining words

    for k, v in mapping.word2count.items():
        if v > min_count:
            keep_words.append(k) # Remove all words that the count is less than the threshold 
    
    new_mapping = NumericalMapping() # Create a new mapping
    
    for w in keep_words:
        new_mapping.add_word(w)
    
    return new_mapping

In [53]:
trim_num_map = trim_mapping(num_map, min_count=2)

In [54]:
print(f'After triming, counted words: {trim_num_map.n_words}')

After triming, counted words: 29545


In [55]:
def trim_pairs(pairs, mapping):
    """ 
    Trim the infrequently seen words in the given list of pairs 
    decided by the given mapping
    
    Args:
        mapping (dict): the trim mapping that remove all infrequency seen words
        pairs (list<list<str>>): the pairs for triming based on the mapping
    Return:
        (list<list<str>>): the trimed pairs
    """
    keep_pairs = []

    for pair in pairs:
        keep_input, keep_output = True, True # Set flag for checking 

        # Check for the input sentence
        for word in pair[0].split(' '):
            if word not in mapping.word2index:
                keep_input = False
                break
        
        # Check for the output sentence
        for word in pair[1].split(' '):
            if word not in mapping.word2index:
                keep_output = False
                break
        
        # Only keep the pair if the input and output sentence pass both checking
        if keep_input and keep_output:
            keep_pairs.append(pair)
        
    # for pair in pairs:
    #     is_keep = True

    #     for index in range(2):
    #         for word in pair[index].split(' '):
    #             if word not in mapping.word2index:
    #                 is_keep = False
    #                 break

    #     if is_keep:
    #         keep_pairs.append(pairs)     
    
    return keep_pairs


In [56]:
selected_pairs = trim_pairs(filtered_pairs, trim_num_map)

In [57]:
len(selected_pairs)

117117