In [54]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
import spacy
from collections import defaultdict

In [55]:
%%time
nlp=spacy.load("en_core_web_lg")

Wall time: 6 s


In [56]:
def check_if_character(line, punct_set):
    if "(" in line:
        line=line[:line.index("(")].strip()
    if "{" in line:
        line=line[:line.index("{")].strip()
    if '[' in line:
        line=line[:line.index('[')].strip()
    if "/" in line:
            line=line[:line.index('/')-1].strip()
    if line.upper()==line and line.isupper() and line[-1] not in punct_set and line.count(' ')<4:
        
        return True, line
    else:
        return False, ""


In [57]:
def read_scripts(movie_filenames, unacceptable_starters, punct_remover):

    returner=[]

    for movie in movie_filenames:
        path=os.path.join("scripts", "samples", movie)

        with open(path, "r", encoding="utf-8") as f:
            movie_script=f.readlines()
            start_of_content=False
            this_movie_dicts=[]
            current_character_dict={}
            movie_title=""
            current_character_name="" 
            year=0      
            
            for line in movie_script:
                line=line.strip()

                #checks for the title and the first character, otherwise skips the header stuff
                if not start_of_content:
                    line=line.strip('"')

                    if movie_title=="" and line.translate(punct_remover)==line and line not in unacceptable_starters:
                        movie_title=line

                    if year==0 and len(re.findall('\d{4,}', line))>0:
                        year=re.findall('\d{4,}',line)[0]

                    is_character, new_character_name = check_if_character(line, string.punctuation)

                    if is_character and (new_character_name not in unacceptable_starters):
                        start_of_content=True
                        current_character_name=new_character_name
                        current_character_dict={"character": current_character_name, "movie_title": movie_title, "year":year, "is_villain": False, "raw_dialogue":""}
                        this_movie_dicts.append(current_character_dict)
                
                #if in the middle of the script and you have a current character, check if this line is a new character otherwise add the dialogue to this character's list
                else:
                    is_new_character, new_character_name = check_if_character(line, string.punctuation)
                    if is_new_character:
                        current_character_name=new_character_name
                        if current_character_name not in unacceptable_starters:
                            if current_character_name not in [entry['character'] for entry in this_movie_dicts]:
                                current_character_dict={"character": current_character_name, "movie_title": movie_title, "year":year, "is_villain": False, "raw_dialogue":""}
                                this_movie_dicts.append(current_character_dict)
                            else:
                                current_character_dict=[entry for entry in this_movie_dicts if entry['character']==current_character_name][0]
                            
                    else:
                        if current_character_name in unacceptable_starters:
                            pass
                        else:
                            if len(line)>0 and line[0]!='(' and line not in unacceptable_starters:
                                current_character_dict['raw_dialogue']=current_character_dict['raw_dialogue'].strip()+' ' + line.translate(punct_remover).strip()

            returner+=this_movie_dicts
    returner=[entry for entry in returner if len(entry['raw_dialogue'])!=0]
    for entry in returner:
        entry['num_words']=entry['raw_dialogue'].count(' ')+1

    return returner                            
    

In [58]:

movie_filenames=["blackdahliathe_dialog.txt", "basicinstinct_dialog.txt", "basic_dialog.txt","girlwiththedragontattoothe_dialog.txt", "manhattanmurdermystery_dialog.txt", 'afewgoodmen_dialog.txt', "8mm_dialog.txt", "backdraft_dialog.txt"]

unacceptable_starters=["VOICE (cont'd)", "VOICE (CONT'D)", "VOICE OVER (CONT'D)", "VOICE OVER (cont'd)", "DISSOLVE", "CUT", "CUT TO", 'FADE', 'FADE OUT', 'FADE IN', 'PAN', 'CONTINUED', "CONT'D", '', ' ', "VOICE", "VOICE OVER", 'CUT TO', 'DISSOLVE TO', 'THE END', 'FADE TO BLACK', "DISSOLVE TO:", "CUT TO:", "FADE TO:"]

punct_remover=str.maketrans('','', '"#$%&()*+-/:;<=>?@[\\]^_`{|}~')

movie_character_dicts=read_scripts(movie_filenames, unacceptable_starters, punct_remover)

In [59]:
#fixes where characters are referred to by separate names
def combine_characters(movie_character_dict, movie_title, character_name_to_keep, character_name_to_remove):
    movie_subset=[entry for entry in movie_character_dict if entry['movie_title']==movie_title]
    if character_name_to_remove not in [entry['character'] for entry in movie_subset] or character_name_to_keep not in [entry['character'] for entry in movie_subset]:
        return
    to_remove=[entry for entry in movie_character_dict if entry['character']==character_name_to_remove][0]
    keeper=[entry for entry in movie_subset if entry['character']==character_name_to_keep][0]
    keeper['raw_dialogue']+=to_remove['raw_dialogue']
    keeper['num_words']+=to_remove['num_words']
    movie_character_dict.remove(to_remove)

In [60]:
#random fixes

combine_characters(movie_character_dicts, "8MM", "AMY", "AMY'S VOICE")
combine_characters(movie_character_dicts, "8mm", "DINO VELVET", "DINO")
combine_characters(movie_character_dicts, "8mm", "DINO VELVET", "DINO VELVET VOICE")
combine_characters(movie_character_dicts, "8mm", "WELLES", "WELLES VOICE")
combine_characters(movie_character_dicts, "8mm", "WELLES", "WELLES' VOICE")

combine_characters(movie_character_dicts, "MANHATTAN MURDER MYSTERIES", "HELEN", "HELEN'S VOICE")

combine_characters(movie_character_dicts, 'The Black Dahlia', "CAPTAIN VASQUEZ", 'VASQUEZ')
combine_characters(movie_character_dicts, 'The Black Dahlia', "JOHNNY VOGEL", 'JOHNNY')
combine_characters(movie_character_dicts, 'The Black Dahlia', "JOHNNY VOGEL", 'VOGEL')
combine_characters(movie_character_dicts, "The Black Dahlia", "LEE BLANCHARD", "LEE")

combine_characters(movie_character_dicts, 'The Black Dahlia', "ELLIS LOEW", 'LOEW')
combine_characters(movie_character_dicts, 'The Black Dahlia', "RUSS MILLARD", 'MILLARD')
combine_characters(movie_character_dicts, 'BASIC INSTINCT', "CAPTAIN TALCOTT", 'TALCOTT')
combine_characters(movie_character_dicts, 'BASIC INSTINCT', "CAPTAIN TALCOTT", 'CAPT. TALCOTT')

combine_characters(movie_character_dicts, 'Basic', "DUNBAR", 'DUN BAR')
combine_characters(movie_character_dicts, 'Basic', "MUELLER", 'MUE:LLER')
combine_characters(movie_character_dicts, 'Basic', "OSBORNE", 'OSB0RNE')

combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "GREGOR", 'GREGER')
combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "GREGOR", 'GREGER')
combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "BLOMKVIST", 'BLOMVIST')
combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "HARRIET", 'HARRIE')
combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "WENNERSTROM", 'WENNERSTROM ON TV')
combine_characters(movie_character_dicts, 'THE GIRL WITH THE DRAGON TATTOO', "VANGER", 'YOUNGER VANGER')

In [61]:
sample_dataframe=pd.DataFrame.from_dict(movie_character_dicts)

In [62]:
# Source: Professor Wilkens, INFO 3350
#convenience function to read the emotion lexicon into the variable emolex
def read_emolex(filepath=None):
    '''
    Takes a file path to the emolex lexicon file.
    Returns a dictionary of emolex sentiment values.
    '''
    if filepath==None: # Try to find the emolex file
        filepath = os.path.join('data','emolex.txt')
        if os.path.isfile(filepath):
            pass
        elif os.path.isfile('emolex.txt'):
            filepath = 'emolex.txt'
        else:
            raise FileNotFoundError('No EmoLex file found')
    emolex = defaultdict(dict) # Like Counter(), defaultdict eases dictionary creation
    with open(filepath, 'r') as f:
    # emolex file format is: word emotion value
        for line in f:
            word, emotion, value = line.strip().split()
            emolex[word][emotion] = int(value)
    return emolex

# Get EmoLex data. Make sure you set the right file path above.
emolex = read_emolex()


In [63]:
%%time
sample_dataframe['token_list']=[[token for token in nlp(doc) if not (token.is_punct or token.is_space or token.is_stop)] for doc in sample_dataframe['raw_dialogue']]

Wall time: 30.2 s


In [64]:
sample_dataframe.head()

Unnamed: 0,character,movie_title,year,is_villain,raw_dialogue,num_words,token_list
0,LEE BLANCHARD,The Black Dahlia,0,False,"Code three, Bleichert!To the halls of Tripoli...",989,"[Code, Bleichert!To, halls, Tripoli, shitbirds..."
1,BUCKY,The Black Dahlia,0,False,You come all the way down here to roust Impres...,3125,"[come, way, roust, Impressive, Whatta, Mex, pa..."
2,ROOKIE,The Black Dahlia,0,False,"Yep, three years in the Canal Zone. Nothin' bu...",23,"[Yep, years, Canal, Zone, Nothin', skeeter, bi..."
3,TOM,The Black Dahlia,0,False,"You, that's what. You know Lee Blanchard over ...",62,"[know, Lee, Blanchard, Central, Warrants, part..."
4,SECRETARY,The Black Dahlia,0,False,Officer Bleichert.,3,"[Officer, Bleichert]"


In [72]:
def add_emotion_scores(token_list, lexicon=None):
    string_list=[token.text.lower().strip() for token in token_list]
    re=[[],[],[],[],[],[],[],[],[],[]]
    if lexicon==None:
        lexicon=read_emolex()
    for token in string_list:
        if token in lexicon:
            score_list=list(lexicon[token].values())
            for score_index in range(len(score_list)):
                re[score_index].append(score_list[score_index])
    re=[score if len(score)>0 else [0] for score in re]
    to_return = [np.mean(emotion) for emotion in re]
    return to_return
    

    

In [73]:
%%time
all_emotions = [add_emotion_scores(entry) for entry in sample_dataframe['token_list']]
sample_dataframe['mean_anger'], sample_dataframe['mean_anticipation'], sample_dataframe['mean_disgust'], sample_dataframe['mean_fear'], sample_dataframe['mean_joy'], sample_dataframe['mean_negative'], sample_dataframe['mean_positive'], sample_dataframe['mean_sadness'], sample_dataframe['mean_surprise'], sample_dataframe['mean_trust'] = np.flipud(np.rot90(all_emotions))

Wall time: 42.4 s


In [74]:
def add_embeddings(token_list, vector_length):
    token_list=[token for token in token_list if token.has_vector]
    doc_matrix=np.zeros([len(token_list), vector_length])
    for i in range(len(doc_matrix)):
        doc_matrix[i]=token_list[i].vector
    return np.average(doc_matrix, axis=0)

In [75]:
%%time
vector_length=nlp.vocab.vectors_length
sample_dataframe['embeddings']=[add_embeddings(entry, vector_length) for entry in sample_dataframe['token_list']]

  avg = a.mean(axis)
  ret = um.true_divide(
Wall time: 477 ms


In [76]:
sample_dataframe.head()

Unnamed: 0,character,movie_title,year,is_villain,raw_dialogue,num_words,token_list,mean_anger,mean_anticipation,mean_disgust,mean_fear,mean_joy,mean_negative,mean_positive,mean_sadness,mean_surprise,mean_trust,embeddings
0,LEE BLANCHARD,The Black Dahlia,0,False,"Code three, Bleichert!To the halls of Tripoli...",989,"[Code, Bleichert!To, halls, Tripoli, shitbirds...",0.096774,0.059908,0.0553,0.147465,0.064516,0.165899,0.202765,0.096774,0.069124,0.110599,"[-0.06939335917577519, 0.06660951127792883, -0..."
1,BUCKY,The Black Dahlia,0,False,You come all the way down here to roust Impres...,3125,"[come, way, roust, Impressive, Whatta, Mex, pa...",0.086572,0.111307,0.063604,0.106007,0.077739,0.171378,0.183746,0.095406,0.060071,0.118375,"[-0.06832003521418073, 0.10440315664141776, -0..."
2,ROOKIE,The Black Dahlia,0,False,"Yep, three years in the Canal Zone. Nothin' bu...",23,"[Yep, years, Canal, Zone, Nothin', skeeter, bi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.1939576046111492, -0.14697723950331026, -0..."
3,TOM,The Black Dahlia,0,False,"You, that's what. You know Lee Blanchard over ...",62,"[know, Lee, Blanchard, Central, Warrants, part...",0.066667,0.066667,0.066667,0.066667,0.066667,0.2,0.2,0.066667,0.0,0.266667,"[-0.06670887317058855, 0.11280478690429752, -0..."
4,SECRETARY,The Black Dahlia,0,False,Officer Bleichert.,3,"[Officer, Bleichert]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,"[-0.29197999835014343, 0.14518000185489655, -0..."


In [78]:
sample_dataframe.iloc[122]

character                                                    BLOMKVIST
movie_title                            THE GIRL WITH THE DRAGON TATTOO
year                                                                 0
is_villain                                                       False
raw_dialogue         What is this, the media event of the year Don'...
num_words                                                         8155
token_list           [media, event, year, try, play, wo, 4A, EXT, C...
mean_anger                                                   0.0536797
mean_anticipation                                            0.0805195
mean_disgust                                                 0.0380952
mean_fear                                                    0.0909091
mean_joy                                                     0.0588745
mean_negative                                                 0.141126
mean_positive                                                 0.134199
mean_s

In [77]:
sample_dataframe.to_csv("sample.csv")