In [7]:
import re
import pandas as pd

def read_and_parse(file_name, min_lines=50, min_name_len=3):
    """Parse a screenplay text file into per-character dialogue.

    A speaker cue is a line that holds nothing but one run of capital
    letters A-Z (optionally surrounded by whitespace).  Cues made of several
    words are therefore not matched.  The dialogue for a cue is the text on
    the following line plus every directly following non-empty line.

    Parameters
    ----------
    file_name : str or path-like
        Location of the script.  The file is decoded as ISO-8859-1.
    min_lines : int, default 50
        Only characters with strictly more than this many captured lines
        are returned (drops supporting/background characters).
    min_name_len : int, default 3
        Only names strictly longer than this many letters are registered.

    Returns
    -------
    dict
        Maps CHARACTER NAME -> list of dialogue strings, in script order.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(str(file_name), encoding='ISO-8859-1') as f:
        script = f.read()

    # Group 1 captures the character name, group 2 captures their dialogue.
    char_dialogue = re.compile(r"(?m)^\s*\b([A-Z]+)\b\s*\n(.*(?:\n.+)*)")

    result = {}
    for char, line in char_dialogue.findall(script):
        if char in result:
            # Known character: keep every non-empty line.
            if line != '':
                result[char].append(line)
        elif len(char) > min_name_len:
            # New character: the pattern only captures A-Z, so no separate
            # upper-case check is required here.
            result[char] = [line]

    # Drop characters whose only captured line is empty, then keep only
    # those with more than `min_lines` lines of dialogue.
    return {
        name: lines
        for name, lines in result.items()
        if lines != [''] and len(lines) > min_lines
    }

In [8]:
# Location of the raw screenplay text that read_and_parse will load.
godfather_script_path = "/content/drive/MyDrive/Full Projects/Character Emotions/Text Data/Godfather.txt"
script_dict = read_and_parse(godfather_script_path)

In [6]:
# Show which characters remain after parsing and filtering.
script_dict.keys()

dict_keys(['SONNY', 'HAGEN', 'MICHAEL'])

In [9]:
# Strip whitespace from lists/dictionaries
def strip_all(x):
  """Recursively strip leading/trailing whitespace from every string in ``x``.

  Strings are stripped, dictionaries have both their keys and their values
  processed, and lists have each element processed.  Any other value is
  returned unchanged.  New containers are built, so the argument itself is
  never modified.

  Note: if two dictionary keys become equal once stripped, the later entry
  replaces the earlier one.
  """
  if isinstance(x, str):
    return x.strip()
  # isinstance is required here: comparing an instance to the type itself
  # (``x == dict``) is always False, which would skip the branch entirely.
  if isinstance(x, dict):
    return {strip_all(key): strip_all(value) for key, value in x.items()}
  if isinstance(x, list):
    return [strip_all(item) for item in x]
  return x

In [10]:
# Run the parsed dialogue through strip_all before tabulating it.
strip_dict = strip_all(script_dict)

In [11]:
# Widen the column display so long dialogue lines are not cut off.
pd.set_option('display.max_colwidth', 200)

# One column per character; each list of lines is wrapped in a Series
# before being handed to the DataFrame constructor.
script_df = pd.DataFrame(
    {name: pd.Series(lines) for name, lines in strip_dict.items()}
)
print(len(script_df))
script_df.head()

204


Unnamed: 0,SONNY,HAGEN,MICHAEL
0,"He'll be here Pop, it's still early.",(nodding)\n It will cost.,I told you I had a lot of relatives.
1,"Sandra, watch the kids. They're\n running wild.",Who do I give this job to?,They're waiting to see my father.
2,"Buddy, this is a private party.",Francesco Nippi. His nephew has\n been refused parole. A bad case.,"They're going to talk to my father,\n which means they're going to ask\n him for something, which means they\n better get it right."
3,Goddamn FBI...don't respect nothing.,You understand him better than\n anyone.,Because they know that no Sicilian\n will refuse a request on his\n daughter's wedding day.
4,And Michael?,"Hello Kay. Your father's inside,\n doing some business.\n (privately)\n He's been asking for you.",No. His name is Luca Brasi. You\n wouldn't like him.


In [12]:
# Write the per-character dialogue table to CSV without the row-index column.
godfather_csv_path = "/content/drive/MyDrive/Full Projects/Character Emotions/godfather_chars.csv"
script_df.to_csv(godfather_csv_path, index=False)