In [304]:
import random

## Dataset Loading

### Show first five lines in the moviequotes.scripts.txt file

In [305]:
first_five_lines = []

with open('data/movie_lines.txt', 'r', encoding='iso-8859-1') as f:
    counter = 0
    for i, line in enumerate(f):
        if counter > 4:
            break
        first_five_lines.append(line)
        counter += 1

In [306]:
first_five_lines

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"]

In [307]:
# Split each line with the chars string '+++$+++'
values_in_first_five_lines = []
for line in first_five_lines:
    values_in_first_five_lines.append(line.split(" +++$+++ "))

In [308]:
values_in_first_five_lines

[['L1045', 'u0', 'm0', 'BIANCA', 'They do not!\n'],
 ['L1044', 'u2', 'm0', 'CAMERON', 'They do to!\n'],
 ['L985', 'u0', 'm0', 'BIANCA', 'I hope so.\n'],
 ['L984', 'u2', 'm0', 'CAMERON', 'She okay?\n'],
 ['L925', 'u0', 'm0', 'BIANCA', "Let's go.\n"]]

In [309]:
# Define a fields in a line for fields extraction
MOVIE_LINES_FIELDS = ["LINE_ID", "CHARACTER_ID", "MOVIE_ID", "CHARACTER", "TEXT"]

In [310]:
# Extract fileds in each line
for field in MOVIE_LINES_FIELDS:
    print(f'{field:<15}', end='')
print()

for values in values_in_first_five_lines:
    for v in values:
        print(f'{v:<15}', end='')        

LINE_ID        CHARACTER_ID   MOVIE_ID       CHARACTER      TEXT           
L1045          u0             m0             BIANCA         They do not!
  L1044          u2             m0             CAMERON        They do to!
   L985           u0             m0             BIANCA         I hope so.
    L984           u2             m0             CAMERON        She okay?
     L925           u0             m0             BIANCA         Let's go.
     

In [311]:
# Create a dictionary to save fields in the first line
line_obj = {}

for i, field in enumerate(MOVIE_LINES_FIELDS):
    line_obj[field] = values_in_first_five_lines[0][i]

In [312]:
line_obj

{'LINE_ID': 'L1045',
 'CHARACTER_ID': 'u0',
 'MOVIE_ID': 'm0',
 'CHARACTER': 'BIANCA',
 'TEXT': 'They do not!\n'}

In [313]:
# Create a dictionary to contain all dictionaries for each line
line = {}

for values in values_in_first_five_lines:
    line_obj = {}
    for i, field in enumerate(MOVIE_LINES_FIELDS):
        line_obj[field] = values[i]
    
    line[line_obj['LINE_ID']] = line_obj # Use LINE_ID to mark each line of script fields

In [314]:
line

{'L1045': {'LINE_ID': 'L1045',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'They do not!\n'},
 'L1044': {'LINE_ID': 'L1044',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'They do to!\n'},
 'L985': {'LINE_ID': 'L985',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'I hope so.\n'},
 'L984': {'LINE_ID': 'L984',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'She okay?\n'},
 'L925': {'LINE_ID': 'L925',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': "Let's go.\n"}}

In [315]:
def load_lines(file_name, fields):
    """
    A function to create a dictionary in the above shown format covering all lines in the corpus

    Args:
        file_name(str): file to read
        fields(list<str>): fileds to extract
    Return:
        dict<dict<str>>: the extracted fileds for each line
    """
    lines = {}

    # Check the LINE_ID is included in the fields set
    assert 'LINE_ID' in fields, "The given fields set does not contain 'LINE_ID'"

    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Convert the line into a values list according to 
            # the spliting result 
            values = line.split(" +++$+++ ")

            line_obj = {}
            for i, field in enumerate(fields):
                line_obj[field] = values[i]
            
            lines[line_obj['LINE_ID']] = line_obj
    
    return lines
            

In [316]:
lines = load_lines('data/movie_lines.txt', MOVIE_LINES_FIELDS)

In [317]:
lines

{'L1045': {'LINE_ID': 'L1045',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'They do not!\n'},
 'L1044': {'LINE_ID': 'L1044',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'They do to!\n'},
 'L985': {'LINE_ID': 'L985',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'I hope so.\n'},
 'L984': {'LINE_ID': 'L984',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'She okay?\n'},
 'L925': {'LINE_ID': 'L925',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': "Let's go.\n"},
 'L924': {'LINE_ID': 'L924',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'Wow\n'},
 'L872': {'LINE_ID': 'L872',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': "Okay -- you're gonna need to learn how to lie.\n"},
 'L871': {'LINE_ID': 'L871',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 

In [318]:
len(lines)

304713

In [319]:
first_five_lines = []

with open('data/movie_conversations.txt', 'r', encoding='iso-8859-1') as f:
    counter = 0
    for i, line in enumerate(f):
        if counter > 4:
            break
        first_five_lines.append(line)
        counter += 1

In [320]:
first_five_lines

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']\n"]

In [321]:
MOVIE_CONVERSATIONS_FIELDS = ["CHARACTER1_ID", "CHARACTER2_ID", "MOVIE_ID", "UTTERANCE_IDS"]

In [322]:
for line in first_five_lines:
    values = line.split(' +++$+++ ')

    conv_obj = {}
    for i, field in enumerate(MOVIE_CONVERSATIONS_FIELDS):
        conv_obj[field] = values[i]

    line_ids = eval(conv_obj['UTTERANCE_IDS'])
    # print(line_ids)

    conv_obj['LINEs'] = []
    for line_id in line_ids:
        conv_obj['LINEs'].append(lines[line_id])
    
conv_obj['LINEs']

[{'LINE_ID': 'L207',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'Gosh, if only we could find Kat a boyfriend...\n'},
 {'LINE_ID': 'L208',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': 'Let me see what I can do.\n'}]

In [323]:
# Define a function to match all sentence as a conversation
def match_conversation(file_name, lines, fileds):
    """
    According to loaded lines, match lines as a conversation

    Args:
        file_name (str): file to load
        lines (dict<dict<str>>): lines dictionary read previously
        fields(list<str>): the defined fields for each line
    Return:
        list<dict>:
    """
    conversation = []

    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            conv_obj = {}
            for i, field in enumerate(fileds):
                conv_obj[field] = values[i]
            
            line_ids = eval(conv_obj['UTTERANCE_IDS'])

            conv_obj['LINES'] = []
            for line_id in line_ids:
                conv_obj['LINES'].append(lines[line_id])
            
            conversation.append(conv_obj)
    
    return conversation

In [324]:
conversations = match_conversation('data/movie_conversations.txt', lines, MOVIE_CONVERSATIONS_FIELDS)

In [356]:
conversations[0]

{'CHARACTER1_ID': 'u0',
 'CHARACTER2_ID': 'u2',
 'MOVIE_ID': 'm0',
 'UTTERANCE_IDS': "['L194', 'L195', 'L196', 'L197']\n",
 'LINES': [{'LINE_ID': 'L194',
   'CHARACTER_ID': 'u0',
   'MOVIE_ID': 'm0',
   'CHARACTER': 'BIANCA',
   'TEXT': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'LINE_ID': 'L195',
   'CHARACTER_ID': 'u2',
   'MOVIE_ID': 'm0',
   'CHARACTER': 'CAMERON',
   'TEXT': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'LINE_ID': 'L196',
   'CHARACTER_ID': 'u0',
   'MOVIE_ID': 'm0',
   'CHARACTER': 'BIANCA',
   'TEXT': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'LINE_ID': 'L197',
   'CHARACTER_ID': 'u2',
   'MOVIE_ID': 'm0',
   'CHARACTER': 'CAMERON',
   'TEXT': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [325]:
conversations[0]["LINES"]

[{'LINE_ID': 'L194',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
 {'LINE_ID': 'L195',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
 {'LINE_ID': 'L196',
  'CHARACTER_ID': 'u0',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'BIANCA',
  'TEXT': 'Not the hacking and gagging and spitting part.  Please.\n'},
 {'LINE_ID': 'L197',
  'CHARACTER_ID': 'u2',
  'MOVIE_ID': 'm0',
  'CHARACTER': 'CAMERON',
  'TEXT': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]

In [326]:
len(conversations)

83097

In [344]:
pairs = []
counter = 0
while counter < len(conversations[0]["LINES"]) - 1:
    # print(counter)
    initial_line = conversations[0]['LINES'][counter]['TEXT'].strip()
    response_line = conversations[0]['LINES'][counter+1]['TEXT'].strip()

    if initial_line and response_line:
        pairs.append([initial_line, response_line])

    counter += 1

In [345]:
pairs

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]]

In [352]:
def extract_conversation_sentence_pairs(converations):
    """
    Extract sentences pair from the conversation dictionary

    Arg:
        conversations(list<dict>)
    Return:
        list<list<str>>: a list of conversational sentence pairs
    """
    pairs = []

    for conv in converations:
        
        # for i in range(len(conv['LINES']) - 1):
        #     initial_line = conv['LINES'][i]['TEXT'].strip()
        #     response_line = conv['LINES'][i+1]['TEXT'].strip()
        
        #     if initial_line and response_line:
        #         pairs.append([initial_line, response_line])
        
        # Use every two sentences in the LINES to form a converstional pair
        counter = 0
        while counter < len(conv['LINES']) - 1:
            initial_line = conv['LINES'][counter]['TEXT'].strip()
            response_line = conv['LINES'][counter+1]['TEXT'].strip()

            if initial_line and response_line:
                pairs.append([initial_line, response_line])
            
            counter += 1
    
    return pairs

In [353]:
pairs = extract_conversation_sentence_pairs(conversations)

In [354]:
len(pairs)

221282

In [355]:
rand_idx = random.randint(0, len(pairs))
pairs[rand_idx]

['Change?  You want my change?  I fought the war and lost my eyes just so I could give you my change?',
 'All right, old man.  Christ.']