In [77]:
import os

## Dataset Loading

### Show the first five lines of the moviequotes.scripts.txt file

In [78]:
first_five_lines = []

# The file is decoded as iso-8859-1; only its first five lines are kept.
with open('data/moviequotes.scripts.txt', 'r', encoding='iso-8859-1') as f:
    for i, line in enumerate(f):
        # enumerate already counts the lines, so stop once five have been collected
        if i > 4:
            break
        first_five_lines.append(line)

In [79]:
first_five_lines

['0 +++$+++ "murderland" +++$+++ 1 +++$+++ announcer +++$+++  +++$+++ Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !\n',
 '1 +++$+++ "murderland" +++$+++ 2 +++$+++ announcer +++$+++  +++$+++ Choose the doorway that starts you on your magical journey into  MURDERLAND !\n',
 '2 +++$+++ "murderland" +++$+++ 3 +++$+++ johnny +++$+++  +++$+++ I didn\'t think he\'d make it past Scraps.\n',
 '3 +++$+++ "murderland" +++$+++ 4 +++$+++ bruce +++$+++ 2 +++$+++ Let\'s just see if he can make it into round two....\n',
 '4 +++$+++ "murderland" +++$+++ 5 +++$+++ bruce +++$+++  +++$+++ Don\'t.\n']

In [80]:
# Break every sampled line into its column values on the ' +++$+++ ' separator
values_in_first_five_lines = [
    raw_line.split(" +++$+++ ") for raw_line in first_five_lines
]

In [81]:
values_in_first_five_lines

[['0',
  '"murderland"',
  '1',
  'announcer',
  '',
  'Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !\n'],
 ['1',
  '"murderland"',
  '2',
  'announcer',
  '',
  'Choose the doorway that starts you on your magical journey into  MURDERLAND !\n'],
 ['2',
  '"murderland"',
  '3',
  'johnny',
  '',
  "I didn't think he'd make it past Scraps.\n"],
 ['3',
  '"murderland"',
  '4',
  'bruce',
  '2',
  "Let's just see if he can make it into round two....\n"],
 ['4', '"murderland"', '5', 'bruce', '', "Don't.\n"]]

In [82]:
# Field names for one script line, listed in the order the values appear
# after a line has been split on ' +++$+++ ' (values are matched by position)
MOVIE_LINES_FIELDS = ["LINE_ID", "MOVIE_TITLE", "MOVIE_LINE_NR", "CHARACTER", "REPLY_TO_LINE_ID", "TEXT"]

In [83]:
# Show the extracted fields as a fixed-width table (20 characters per column):
# one header row with the field names, then one row per script line
print(''.join(f'{field:<20}' for field in MOVIE_LINES_FIELDS))

for values in values_in_first_five_lines:
    # The last value of a line still ends with '\n'. Drop it before padding so
    # the padding spaces stay on the same row, and let print() end the row.
    cells = (v.rstrip('\n') for v in values)
    print(''.join(f'{cell:<20}' for cell in cells))

LINE_ID             MOVIE_TITLE         MOVIE_LINE_NR       CHARACTER           REPLY_TO_LINE_ID    TEXT                
0                   "murderland"        1                   announcer                               Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !
1                   "murderland"        2                   announcer                               Choose the doorway that starts you on your magical journey into  MURDERLAND !
2                   "murderland"        3                   johnny                                  I didn't think he'd make it past Scraps.
3                   "murderland"        4                   bruce               2                   Let's just see if he can make it into round two....
4                   "murderland"        5                   bruce                                   Don't.
             

In [84]:
# Map every field name to the value at the same position in the first line
line_obj = {
    field: values_in_first_five_lines[0][position]
    for position, field in enumerate(MOVIE_LINES_FIELDS)
}

In [85]:
line_obj

{'LINE_ID': '0',
 'MOVIE_TITLE': '"murderland"',
 'MOVIE_LINE_NR': '1',
 'CHARACTER': 'announcer',
 'REPLY_TO_LINE_ID': '',
 'TEXT': 'Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !\n'}

In [86]:
# Collect one field dictionary per sampled line, keyed by that line's LINE_ID
line = {}

for values in values_in_first_five_lines:
    # Pair each field name with the value found at the same position
    line_obj = {
        field: values[position]
        for position, field in enumerate(MOVIE_LINES_FIELDS)
    }
    line[line_obj['LINE_ID']] = line_obj

In [87]:
line

{'0': {'LINE_ID': '0',
  'MOVIE_TITLE': '"murderland"',
  'MOVIE_LINE_NR': '1',
  'CHARACTER': 'announcer',
  'REPLY_TO_LINE_ID': '',
  'TEXT': 'Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !\n'},
 '1': {'LINE_ID': '1',
  'MOVIE_TITLE': '"murderland"',
  'MOVIE_LINE_NR': '2',
  'CHARACTER': 'announcer',
  'REPLY_TO_LINE_ID': '',
  'TEXT': 'Choose the doorway that starts you on your magical journey into  MURDERLAND !\n'},
 '2': {'LINE_ID': '2',
  'MOVIE_TITLE': '"murderland"',
  'MOVIE_LINE_NR': '3',
  'CHARACTER': 'johnny',
  'REPLY_TO_LINE_ID': '',
  'TEXT': "I didn't think he'd make it past Scraps.\n"},
 '3': {'LINE_ID': '3',
  'MOVIE_TITLE': '"murderland"',
  'MOVIE_LINE_NR': '4',
  'CHARACTER': 'bruce',
  'REPLY_TO_LINE_ID': '2',
  'TEXT': "Let's just see if he can make it into round two....\n"},
 '4': {'LINE_ID': '4',
  'MOVIE_TITLE': '"murderland"',
  'MOVIE_LINE_NR': '5',
  'CHARACTER': 'bruce',
  'REPLY_TO_LINE_ID': '',
  'TEXT': "Don't.\n"}}

In [88]:
def load_lines(file_name, fields):
    """
    Parse a ' +++$+++ '-separated script file into a dictionary of per-line records.

    Every non-blank line of the file is split on ' +++$+++ ' and the resulting
    values are assigned to ``fields`` by position, so ``fields`` must list the
    field names in the same order as the values appear in the file.

    Args:
        file_name(str): path of the file to read (decoded as iso-8859-1)
        fields(list<str>): ordered field names to extract; must contain 'LINE_ID'
    Return:
        dict<str, dict<str, str>>: one record per line, keyed by the record's
            LINE_ID value. Values are stored exactly as split, so a value taken
            from the end of a line keeps its trailing newline.
    Raises:
        AssertionError: if 'LINE_ID' is not among the given fields
        IndexError: if a non-blank line holds fewer values than there are fields
    """
    # Materialize once so a one-shot iterable survives the membership check below
    fields = list(fields)

    # Check the LINE_ID is included in the fields
    assert 'LINE_ID' in fields, "The given fields set does not contain 'LINE_ID'"

    lines = {}

    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line_nr, line in enumerate(f, start=1):
            # Skip blank lines (e.g. a trailing empty line) instead of failing on them
            if not line.strip():
                continue

            # Convert the line into a list of values by splitting on the separator
            values = line.split(" +++$+++ ")

            if len(values) < len(fields):
                raise IndexError(
                    f"{file_name}, line {line_nr}: expected at least "
                    f"{len(fields)} fields but found {len(values)}"
                )

            # zip stops at the shorter input, so surplus values are ignored
            line_obj = dict(zip(fields, values))
            lines[line_obj['LINE_ID']] = line_obj

    return lines
            

In [89]:
lines = load_lines('data/moviequotes.scripts.txt', MOVIE_LINES_FIELDS)

In [90]:
lines['0']

{'LINE_ID': '0',
 'MOVIE_TITLE': '"murderland"',
 'MOVIE_LINE_NR': '1',
 'CHARACTER': 'announcer',
 'REPLY_TO_LINE_ID': '',
 'TEXT': 'Ladies and gentlemen, the official mascot of Murderland.... Scraps the Dog !\n'}