In [1]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
import spacy
from collections import defaultdict
from drama_movies import drama_movies
#from crime_film_noir import mystery_movies

In [3]:
%%time
# Load the spaCy "en_core_web_lg" pipeline once; later cells reuse `nlp`
# both to tokenize dialogue and to read the word-vector width.
nlp=spacy.load("en_core_web_lg")

Wall time: 9.37 s


In [4]:
def check_if_character(line, punct_set):
    """Decide whether a script line is a character cue (a speaker heading).

    Everything from the first "(", "{", "[" or "/" onward is discarded
    (checked in that order, each on the already-shortened text). What is left
    counts as a character cue when it is entirely upper case (with at least
    one letter), does not end in a character from ``punct_set`` and contains
    fewer than four spaces.

    Parameters
    ----------
    line : str
        One script line, already stripped by the caller.
    punct_set : container of str
        Characters that may not terminate a cue (the caller passes
        ``string.punctuation``).

    Returns
    -------
    tuple(bool, str)
        ``(True, cleaned_name)`` for a cue, ``(False, "")`` otherwise.
    """
    # The cut is made exactly at the marker. Cutting one character earlier
    # (as an earlier revision did for "/") drops the last letter of
    # "JOHN/MARY" and, for a leading "/", turns into line[:-1].
    for marker in ("(", "{", "[", "/"):
        if marker in line:
            line = line[:line.index(marker)].strip()

    # isupper() is False for the empty string, so line[-1] is never evaluated
    # on empty input.
    if line.upper() == line and line.isupper() and line[-1] not in punct_set and line.count(' ') < 4:
        return True, line
    return False, ""


In [7]:
def check_if_acceptable_character(this_character, acceptable_characters):
    """Return True when a cue name matches one of the movie's known characters.

    An exact match wins immediately. Otherwise all ``string.punctuation``
    characters are removed from ``this_character`` and the name is accepted
    if it contains, or is contained in, any entry of
    ``acceptable_characters`` (plain substring comparison).
    """
    if this_character in acceptable_characters:
        return True

    punctuation_free = this_character.translate(str.maketrans('', '', string.punctuation))
    return any(
        known_name in punctuation_free or punctuation_free in known_name
        for known_name in acceptable_characters
    )

        

In [22]:
def _new_character_entry(character_name, movie_title, year, villains):
    """Build the empty per-character record that read_scripts fills with dialogue."""
    return {
        "character": character_name,
        "movie_title": movie_title,
        "year": year,
        "is_villain": character_name in villains,
        "raw_dialogue": "",
    }


def read_scripts(movie_metadata, unacceptable_starters, punct_remover):
    """Parse every script in ``movie_metadata`` into one record per character.

    Parameters
    ----------
    movie_metadata : mapping
        Keyed by script filename (looked up under ``scripts/Using/``). Each
        value must provide ``'title'``, ``'year'``, ``'characters'`` and
        ``'villain'``; the last two are concatenated with ``+``
        (presumably both are lists of names -- TODO confirm).
    unacceptable_starters : list of str
        Upper-case lines (transitions such as "CUT TO") that must never be
        treated as a speaker, nor appended as dialogue.
    punct_remover : dict
        ``str.translate`` table applied to each dialogue line.

    Returns
    -------
    list of dict
        Records with keys ``character``, ``movie_title``, ``year``,
        ``is_villain``, ``raw_dialogue`` and ``num_words``; only characters
        with more than 50 words are kept.

    Notes
    -----
    Parsing is a two-state machine. Until the first acceptable character cue
    is seen, every line is ignored (title page / header). Afterwards each cue
    switches the current speaker, and non-cue lines are appended to that
    speaker unless the last cue was not an acceptable character.
    """
    returner = []

    for movie_filename in movie_metadata:
        current_movie = movie_metadata[movie_filename]
        path = os.path.join("scripts", "Using", movie_filename)

        # Read the whole file first so the handle is closed before parsing.
        with open(path, "r", encoding="utf-8") as f:
            movie_script = f.readlines()

        movie_title = current_movie['title']
        year = int(current_movie['year'])
        villains = current_movie['villain']
        acceptable_characters = current_movie["characters"] + villains

        # Dicts keep insertion order, so this doubles as the ordered list of
        # this movie's records while giving O(1) lookup of returning speakers.
        entries_by_name = {}
        current_entry = None  # None => header, or the last cue was not acceptable
        start_of_content = False

        for line in movie_script:
            line = line.strip()
            if not start_of_content:
                # Header lines are sometimes wrapped in double quotes.
                line = line.strip('"')

            is_character, cue_name = check_if_character(line, string.punctuation)

            if is_character:
                if cue_name not in unacceptable_starters and check_if_acceptable_character(cue_name, acceptable_characters):
                    start_of_content = True
                    current_entry = entries_by_name.get(cue_name)
                    if current_entry is None:
                        current_entry = _new_character_entry(cue_name, movie_title, year, villains)
                        entries_by_name[cue_name] = current_entry
                else:
                    # Unknown speaker or transition: drop lines until the next
                    # acceptable cue.
                    current_entry = None
            elif (current_entry is not None
                  and len(line) > 0
                  and line[0] != '('  # parentheticals are stage directions
                  and line not in unacceptable_starters):
                current_entry['raw_dialogue'] = (
                    current_entry['raw_dialogue'].strip() + ' ' + line.translate(punct_remover).strip()
                )

        returner.extend(entries_by_name.values())

    for entry in returner:
        # split() counts real words; counting spaces over-counts whenever
        # punctuation removal leaves consecutive or leading spaces.
        entry['num_words'] = len(entry['raw_dialogue'].split())

    return [entry for entry in returner if entry['num_words'] > 50]
    

In [23]:

# Upper-case lines that look like character cues but are really transitions,
# continuation markers or blanks; read_scripts never treats them as speakers
# and never appends them as dialogue.
unacceptable_starters = [
    "VOICE (cont'd)",
    "VOICE (CONT'D)",
    "VOICE OVER (CONT'D)",
    "VOICE OVER (cont'd)",
    "DISSOLVE",
    "CUT",
    "CUT TO",
    'FADE',
    'FADE OUT',
    'FADE IN',
    'PAN',
    'CONTINUED',
    "CONT'D",
    '',
    ' ',
    "VOICE",
    "VOICE OVER",
    'CUT TO',
    'DISSOLVE TO',
    'THE END',
    'FADE TO BLACK',
    "DISSOLVE TO:",
    "CUT TO:",
    "FADE TO:",
]

# Translation table that deletes most punctuation from dialogue while leaving
# "!", "'", "," and "." in place.
punct_remover = str.maketrans(
    '',
    '',
    '"#$%&()*+-/:;<=>?@[\\]^_`{|}~',
)

movie_character_dicts = read_scripts(
    drama_movies,
    unacceptable_starters,
    punct_remover,
)

In [24]:
# Fixes cases where one character appears in a script under several names.
def combine_characters(movie_character_dict, movie_title, character_name_to_keep, character_name_to_remove):
    """Merge one character record into another within a single movie.

    The dialogue and word count of ``character_name_to_remove`` are added to
    ``character_name_to_keep`` and the removed record is deleted from
    ``movie_character_dict`` in place. Nothing happens when either name is
    missing from ``movie_title`` (so a misspelled title or name is a silent
    no-op) or when both names are the same.

    Parameters
    ----------
    movie_character_dict : list of dict
        Records with at least ``movie_title``, ``character``,
        ``raw_dialogue`` and ``num_words``. Mutated in place.
    movie_title : str
        Exact (case-sensitive) title of the movie to operate on.
    character_name_to_keep, character_name_to_remove : str
        Exact character names as stored in the records.

    Returns
    -------
    None
    """
    if character_name_to_keep == character_name_to_remove:
        # Merging a record into itself would double its dialogue and then delete it.
        return

    movie_subset = [entry for entry in movie_character_dict if entry['movie_title'] == movie_title]

    # Both lookups are restricted to this movie: searching the whole list
    # could pick up a same-named character from a different film.
    keeper = next((entry for entry in movie_subset if entry['character'] == character_name_to_keep), None)
    to_remove = next((entry for entry in movie_subset if entry['character'] == character_name_to_remove), None)
    if keeper is None or to_remove is None:
        return

    # Join with a space so the last word of one speech is not glued to the
    # first word of the next.
    pieces = [keeper['raw_dialogue'].strip(), to_remove['raw_dialogue'].strip()]
    keeper['raw_dialogue'] = ' '.join(piece for piece in pieces if piece)
    keeper['num_words'] += to_remove['num_words']

    # Delete by identity; list.remove() compares by equality and could drop a
    # different record that happens to have identical contents.
    for position, entry in enumerate(movie_character_dict):
        if entry is to_remove:
            del movie_character_dict[position]
            break

In [25]:
# Manual alias fixes: scripts refer to some characters under several names
# (voice-overs, surnames only, OCR typos). Each row is
# (movie_title, name_to_keep, name_to_remove) and rows are applied in order.
# combine_characters silently does nothing when the title or either name is
# not found, so a misspelled row has no effect.
CHARACTER_MERGES = [
    # NOTE(review): this row uses "8MM" while the next four use "8mm"; titles
    # are matched case-sensitively, so at most one spelling can match --
    # confirm against the metadata.
    ("8MM", "AMY", "AMY'S VOICE"),
    ("8mm", "DINO VELVET", "DINO"),
    ("8mm", "DINO VELVET", "DINO VELVET VOICE"),
    ("8mm", "WELLES", "WELLES VOICE"),
    ("8mm", "WELLES", "WELLES' VOICE"),

    ("MANHATTAN MURDER MYSTERIES", "HELEN", "HELEN'S VOICE"),

    ('The Black Dahlia', "CAPTAIN VASQUEZ", 'VASQUEZ'),
    ('The Black Dahlia', "JOHNNY VOGEL", 'JOHNNY'),
    ('The Black Dahlia', "JOHNNY VOGEL", 'VOGEL'),
    ("The Black Dahlia", "LEE BLANCHARD", "LEE"),
    ('The Black Dahlia', "ELLIS LOEW", 'LOEW'),
    ('The Black Dahlia', "RUSS MILLARD", 'MILLARD'),

    ('BASIC INSTINCT', "CAPTAIN TALCOTT", 'TALCOTT'),
    ('BASIC INSTINCT', "CAPTAIN TALCOTT", 'CAPT. TALCOTT'),

    ('Basic', "DUNBAR", 'DUN BAR'),
    ('Basic', "MUELLER", 'MUE:LLER'),
    ('Basic', "OSBORNE", 'OSB0RNE'),

    # The GREGOR/GREGER merge used to be issued twice; the second call was
    # always a no-op because the first one removes 'GREGER'.
    ('THE GIRL WITH THE DRAGON TATTOO', "GREGOR", 'GREGER'),
    ('THE GIRL WITH THE DRAGON TATTOO', "BLOMKVIST", 'BLOMVIST'),
    ('THE GIRL WITH THE DRAGON TATTOO', "HARRIET", 'HARRIE'),
    ('THE GIRL WITH THE DRAGON TATTOO', "WENNERSTROM", 'WENNERSTROM ON TV'),
    ('THE GIRL WITH THE DRAGON TATTOO', "VANGER", 'YOUNGER VANGER'),
]

for merge_title, name_to_keep, name_to_remove in CHARACTER_MERGES:
    combine_characters(movie_character_dicts, merge_title, name_to_keep, name_to_remove)

In [37]:
# Source: Professor Wilkens, INFO 3350
# Convenience function to read the emotion lexicon into the variable emolex
def read_emolex(filepath=None):
    '''
    Read an EmoLex lexicon file into a nested dictionary.

    Each non-blank line must hold three whitespace-separated fields:
    word, emotion, value (value is parsed as an int).

    Parameters
    ----------
    filepath : str, optional
        Path to the lexicon. When omitted, ``data/emolex.txt`` is tried
        first and then ``emolex.txt`` in the working directory.

    Returns
    -------
    defaultdict(dict)
        ``emolex[word][emotion] -> int``.

    Raises
    ------
    FileNotFoundError
        If no path is given and neither default location exists.
    ValueError
        If a non-blank line does not have exactly three fields or its
        value is not an integer.
    '''
    if filepath is None:  # Try to find the emolex file
        for candidate in (os.path.join('data', 'emolex.txt'), 'emolex.txt'):
            if os.path.isfile(candidate):
                filepath = candidate
                break
        else:
            raise FileNotFoundError('No EmoLex file found')

    emolex = defaultdict(dict)  # Like Counter(), defaultdict eases dictionary creation
    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        # emolex file format is: word emotion value
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank (e.g. leading or trailing) lines carry no data.
                continue
            word, emotion, value = fields
            emolex[word][emotion] = int(value)
    return emolex

# Get EmoLex data. With no argument, read_emolex() looks for data/emolex.txt
# and then ./emolex.txt; pass an explicit path if the file lives elsewhere.
emolex = read_emolex()


In [38]:
%%time
sample_dataframe['token_list']=[[token for token in nlp(doc) if not (token.is_punct or token.is_space or token.is_stop)] for doc in sample_dataframe['raw_dialogue']]

Wall time: 1min 14s


In [39]:
# Preview the first rows to check the newly added token_list column.
sample_dataframe.head()

Unnamed: 0,character,movie_title,year,is_villain,raw_dialogue,num_words,token_list
0,MACREEDY,Bad Day at Black Rock,1995,False,I'll only be here twentyfour hours. That's rig...,2305,"[twentyfour, hours, right, grin, Probably, thi..."
1,HECTOR,Bad Day at Black Rock,1995,True,Find Smith! GOD BLESS AMERICA glued to Macreed...,400,"[Find, Smith, GOD, BLESS, AMERICA, glued, Macr..."
2,DOC,Bad Day at Black Rock,1995,False,"Who Why ask me He's no salesman, that's sure. ...",1110,"[ask, salesman, sure, grin, peddling, dynamite..."
3,SMITH,Bad Day at Black Rock,1995,False,Sit down. Sit down. Doesn't push easy What do ...,1421,"[Sit, Sit, push, easy, want, Doc, wonder, talk..."
4,LIZ,Bad Day at Black Rock,1995,False,"I can manage. It'll be two dollars an hour, ga...",401,"[manage, dollars, hour, gas, extra, dollars, t..."


In [40]:
def add_emotion_scores(token_list, lexicon=None):
    """Average the EmoLex scores of a document's tokens, one mean per emotion.

    Parameters
    ----------
    token_list : iterable
        Objects exposing a ``.text`` attribute (spaCy tokens in this notebook).
        Each text is lower-cased and stripped before lookup.
    lexicon : mapping, optional
        ``lexicon[word][emotion] -> number``. When omitted, the lexicon is
        loaded with ``read_emolex()`` on every call, which re-reads the file;
        pass an already-loaded lexicon when scoring many documents.

    Returns
    -------
    list of float
        Ten means in the fixed order anger, anticipation, disgust, fear, joy,
        negative, positive, sadness, surprise, trust. An emotion with no
        matching tokens gets 0.
    """
    emotion_names = ('anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust')

    if lexicon is None:
        lexicon = read_emolex()

    # Scores are collected by emotion *name*, so the result does not depend on
    # the order in which a word's emotions were inserted into the lexicon.
    scores_by_emotion = {name: [] for name in emotion_names}
    for token in token_list:
        word = token.text.lower().strip()
        if word in lexicon:
            for emotion, value in lexicon[word].items():
                if emotion in scores_by_emotion:
                    scores_by_emotion[emotion].append(value)

    return [np.mean(scores_by_emotion[name] or [0]) for name in emotion_names]
    

    

In [41]:
%%time
all_emotions = [add_emotion_scores(entry) for entry in sample_dataframe['token_list']]
sample_dataframe['mean_anger'], sample_dataframe['mean_anticipation'], sample_dataframe['mean_disgust'], sample_dataframe['mean_fear'], sample_dataframe['mean_joy'], sample_dataframe['mean_negative'], sample_dataframe['mean_positive'], sample_dataframe['mean_sadness'], sample_dataframe['mean_surprise'], sample_dataframe['mean_trust'] = np.flipud(np.rot90(all_emotions))

Wall time: 40.3 s


In [42]:
def add_embeddings(token_list, vector_length):
    """Return the mean word vector of the tokens that have a vector.

    Tokens whose ``has_vector`` is falsy are ignored. The remaining
    ``.vector`` values are stacked into a ``(n_tokens, vector_length)``
    float matrix and averaged over the token axis. With no usable tokens the
    matrix has zero rows, and the average is taken over an empty axis.
    """
    usable_tokens = [tok for tok in token_list if tok.has_vector]
    stacked = np.zeros([len(usable_tokens), vector_length])
    for row_index, tok in enumerate(usable_tokens):
        stacked[row_index] = tok.vector
    return np.average(stacked, axis=0)

In [43]:
%%time
vector_length=nlp.vocab.vectors_length
sample_dataframe['embeddings']=[add_embeddings(entry, vector_length) for entry in sample_dataframe['token_list']]

Wall time: 1.44 s


In [44]:
# Preview the first rows to check the added mean_* emotion columns and the embeddings column.
sample_dataframe.head()

Unnamed: 0,character,movie_title,year,is_villain,raw_dialogue,num_words,token_list,mean_anger,mean_anticipation,mean_disgust,mean_fear,mean_joy,mean_negative,mean_positive,mean_sadness,mean_surprise,mean_trust,embeddings
0,MACREEDY,Bad Day at Black Rock,1995,False,I'll only be here twentyfour hours. That's rig...,2305,"[twentyfour, hours, right, grin, Probably, thi...",0.050114,0.100228,0.045558,0.129841,0.059226,0.207289,0.168565,0.136674,0.059226,0.14123,"[-0.05360775204635609, 0.13671957931861364, -0..."
1,HECTOR,Bad Day at Black Rock,1995,True,Find Smith! GOD BLESS AMERICA glued to Macreed...,400,"[Find, Smith, GOD, BLESS, AMERICA, glued, Macr...",0.090909,0.125,0.136364,0.068182,0.090909,0.204545,0.170455,0.068182,0.068182,0.159091,"[-0.1295891623623902, 0.07702989907156735, -0...."
2,DOC,Bad Day at Black Rock,1995,False,"Who Why ask me He's no salesman, that's sure. ...",1110,"[ask, salesman, sure, grin, peddling, dynamite...",0.035088,0.118421,0.065789,0.092105,0.061404,0.20614,0.157895,0.122807,0.057018,0.092105,"[-0.052376541407746474, 0.09626584564088855, -..."
3,SMITH,Bad Day at Black Rock,1995,False,Sit down. Sit down. Doesn't push easy What do ...,1421,"[Sit, Sit, push, easy, want, Doc, wonder, talk...",0.072727,0.116364,0.069091,0.12,0.032727,0.210909,0.101818,0.101818,0.065455,0.087273,"[-0.09854166424143915, 0.09306020818229836, -0..."
4,LIZ,Bad Day at Black Rock,1995,False,"I can manage. It'll be two dollars an hour, ga...",401,"[manage, dollars, hour, gas, extra, dollars, t...",0.028169,0.15493,0.014085,0.070423,0.014085,0.112676,0.15493,0.098592,0.042254,0.098592,"[-0.05106351753455345, 0.13368790789886756, -0..."


In [45]:
# Persist the feature table to the working directory.
# NOTE(review): the row index is written as an unnamed first column, which
# comes back as "Unnamed: 0" on a plain pd.read_csv -- consider index=False
# if downstream readers do not need it. The token_list and embeddings cells
# hold Python objects, so they are presumably stored as their string form --
# confirm that is what consumers expect.
sample_dataframe.to_csv("mystery_movie_data.csv")