In [26]:
from __future__ import division

import codecs
import glob
import os
import re
from collections import Counter
from collections import defaultdict
from heapq import heapify, nlargest
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# NOTE(review): both values below are absolute paths on one developer's machine;
# anyone else has to edit them before the notebook can run.
# %env sets the variable for the kernel process; presumably it is picked up by the
# google.cloud client imported above -- TODO confirm.
%env GOOGLE_APPLICATION_CREDENTIALS=/Users/emersonsjsu/GitHub/EarlGrey/google_private_key.json
# Project root; the episode scripts are globbed from WORKING_DIRECTORY + '/scripts/*.txt'.
WORKING_DIRECTORY = "/Users/emersonsjsu/GitHub/EarlGrey"


env: GOOGLE_APPLICATION_CREDENTIALS=/Users/emersonsjsu/GitHub/EarlGrey/google_private_key.json


In [2]:
# Constants used while parsing the episode scripts.

# Subtracted from a script's numeric file name to obtain its episode number.
FIRST_EPISODE_OFFSET = 101

# Episode number of each season's final episode -> that season's number.
LAST_EPISODE_IN_SEASON = {
    25: 1,
    47: 2,
    73: 3,
    99: 4,
    125: 5,
    151: 6,
    176: 7,
}

# Matches every character that is neither an ASCII letter nor a space, so that
# substituting '' leaves only letters and spaces.
regex = re.compile(r'[^a-zA-Z ]')


def get_season(episode_no):
    """Return the season that ends with `episode_no`, or 0 for any other episode."""
    if episode_no in LAST_EPISODE_IN_SEASON:
        return LAST_EPISODE_IN_SEASON[episode_no]
    return 0


def sanitize_name(name_line):
    """Reduce a raw character-name line from a script to the bare character name.

    Surrounding whitespace, anything from the first '(' onwards, and the
    trailing markers "V.O.", "'S VOICE" and "'S COM VOICE" are removed, so
    every variant of a speaker heading maps to the same name.

    Args:
        name_line: the raw script line holding the speaker's name.

    Returns:
        The cleaned name (may be empty if the line held only markers).
    """
    name = name_line.strip()

    # Parentheticals go first and the remainder is re-stripped: otherwise the
    # space left in front of "(CONT'D)" hides the suffixes checked below and
    # "PICARD'S VOICE (CONT'D)" would be kept as a separate character.
    paren_at = name.find('(')
    if paren_at != -1:
        name = name[:paren_at].rstrip()

    # Voice overs and (com) voices count as the same character. The checks run
    # in sequence so stacked markers such as "X'S VOICE V.O." are fully peeled.
    for suffix in ('V.O.', '\'S VOICE', '\'S COM VOICE'):
        if name.endswith(suffix):
            name = name[:-len(suffix)].rstrip()

    return name


# counters_dict expects {char: (counter_obj, [s1count,s2count, ..., s7count])}
def process_script(file_path, counters_dict):
    """Add the words spoken in one episode script to the per-character tallies.

    The script is read line by line:
      * a blank line ends the current speech;
      * a line with exactly five leading tabs that does not start with '('
        names the next speaker;
      * any other line inside a speech is stripped of non-letters, lowercased,
        split on whitespace and counted for the current speaker.

    When the episode is the last of a season (see get_season), every
    character's total for that season is written into their season list.

    Args:
        file_path: path to a script named "<number>.txt"; the episode number
            is <number> - FIRST_EPISODE_OFFSET.
        counters_dict: mapping as described above; it is mutated in place and
            must create missing characters on lookup (e.g. a defaultdict).

    Raises:
        ValueError: if the file name (without extension) is not an integer.
    """
    # Take the episode number from the file name before opening anything, and
    # use os.path so the parsing does not depend on '/' being the separator.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    episode_no = int(file_stem) - FIRST_EPISODE_OFFSET

    is_dialogue = False
    character_name = ""
    # `with` closes the handle even if a line fails to parse.
    with codecs.open(file_path, "r", encoding='utf-8', errors="ignore") as current_file:
        for line in current_file.readlines():
            # Character dialogue has ended, reset current character
            if line.strip() == '':
                is_dialogue = False
            # All names are preceded by exactly 5 tabs
            elif line[:5] == '\t\t\t\t\t' and line[5:6] != '\t' and not line.strip().startswith('('):
                # We have found a name! Update current name
                character_name = sanitize_name(line)
                is_dialogue = True
            # If we are still looking at a character's dialogue
            # NOTE(review): a five-tab line starting with '(' also lands here
            # while a speech is open, so its words are counted as dialogue.
            elif is_dialogue:
                words = regex.sub('', line).lower().split()
                counters_dict[character_name][0].update(words)

    # Check if we are end of a season
    season_no = get_season(episode_no)
    if season_no:
        # counter_obj holds cumulative counts, so this season's words are the
        # running total minus everything already attributed to earlier seasons.
        for counter_obj, season_counts in counters_dict.values():
            season_counts[season_no - 1] = sum(counter_obj.values()) - sum(season_counts)


# Per-character tallies: name -> (cumulative word Counter, [words in season 1..7]).
# Unknown names get a fresh, zeroed entry on first lookup.
character_wc_dict = defaultdict(
    lambda: (Counter(), [0] * 7)
)

# Process the scripts in sorted path order: process_script() derives each
# season's total from the cumulative counts accumulated so far.
script_paths = glob.glob(WORKING_DIRECTORY + '/scripts/*.txt')
script_paths.sort()
for full_path in script_paths:
    process_script(full_path, character_wc_dict)

# Sanity check: Picard's words per season.
character_wc_dict['PICARD'][1]


S1E25: [29921, 0, 0, 0, 0, 0, 0]


S2E47: [29921, 21786, 0, 0, 0, 0, 0]


S3E73: [29921, 21786, 25218, 0, 0, 0, 0]


S4E99: [29921, 21786, 25218, 23199, 0, 0, 0]


S5E125: [29921, 21786, 25218, 23199, 23540, 0, 0]


S6E151: [29921, 21786, 25218, 23199, 23540, 19013, 0]


S7E176: [29921, 21786, 25218, 23199, 23540, 19013, 24844]


[29921, 21786, 25218, 23199, 23540, 19013, 24844]

In [3]:
# Pool every character's word counts into one corpus-wide Counter.
# Counter.update() adds counts in place, so the running total is not copied for
# every character, an empty character_wc_dict yields an empty Counter instead of
# an error, and the result is never the same object as a character's own Counter.
universal_wc = Counter()
for character_entry in character_wc_dict.values():
    universal_wc.update(character_entry[0])
universal_wc


Counter({u'raining': 1,
         u'foul': 5,
         u'four': 551,
         u'fortythird': 1,
         u'berlinghoff': 1,
         u'spiders': 4,
         u'hanging': 8,
         u'centimeter': 6,
         u'localized': 14,
         u'disobeying': 2,
         u'mutinied': 1,
         u'hmmmm': 5,
         u'mutinies': 1,
         u'jonos': 1,
         u'cleponji': 1,
         u'courtmartial': 5,
         u'rational': 16,
         u'rashly': 1,
         u'oceans': 4,
         u'offduty': 1,
         u'bile': 1,
         u'stipulate': 1,
         u'pigment': 1,
         u'tantalizing': 2,
         u'leisurely': 1,
         u'fur': 3,
         u'stabbed': 8,
         u'bringing': 31,
         u'disturb': 12,
         u'recollections': 3,
         u'liaisons': 1,
         u'grueling': 1,
         u'wooden': 3,
         u'wednesday': 1,
         u'happenings': 1,
         u'persisted': 1,
         u'woods': 2,
         u'ninetwo': 1,
         u'commented': 3,
         u'capitalist': 1,
   

In [None]:
# (total words, name) for every character; the total is the sum of the
# per-season counts recorded by process_script().
char_total_wc = [(sum(v[1]), k) for k, v in character_wc_dict.items()]

# Keep the nine characters with the most words, largest first.
main_character_wc = nlargest(9, char_total_wc, key=lambda x: x[0])
main_characters = [x[1] for x in main_character_wc]
main_characters


[u'PICARD',
 u'DATA',
 u'RIKER',
 u'GEORDI',
 u'BEVERLY',
 u'TROI',
 u'WORF',
 u'WESLEY',
 u'Q']

In [114]:
# For each main character, print the words that most set them apart from the cast.
#
# score(word) = (count ** 1.3 / words the character spoke)
#               / (corpus count of the word / words spoken by anyone)
# The exponent above 1 favours words the character uses often over words that
# merely happen to be unique to them.
# The plain `/` is true division thanks to `from __future__ import division`.
anyone_speaks = sum(total for total, _ignored_name in char_total_wc)

# main_character_wc already pairs each name with its total, so no per-person
# search for the total is needed.
for person_speaks, person in main_character_wc:
    counter = character_wc_dict[person][0]
    defining_wc = {}
    for word, count in counter.items():
        person_share = (count ** 1.3) / person_speaks
        corpus_share = universal_wc[word] / anyone_speaks
        defining_wc[word] = person_share / corpus_share
    print(person)
    top_n = nlargest(8, defining_wc.items(), key=lambda x: x[1])
    print([str(x[0]) for x in top_n])


PICARD
['number', 'mister', 'captains', 'log', 'stardate', 'supplemental', 'to', 'the']
DATA
['however', 'approximately', 'appears', 'correct', 'translating', 'am', 'sir', 'appear']
RIKER
['soren', 'rice', 'minuet', 'strap', 'decompress', 'william', 'yuta', 'kazago']
GEORDI
['reg', 'commodore', 'shipley', 'visors', 'yeah', 'bochra', 'leah', 'visor']
BEVERLY
['ccs', 'nana', 'cortical', 'chilton', 'alyssa', 'stimulator', 'alissa', 'inoprovaline']
TROI
['mother', 'ian', 'overbearing', 'repress', 'sensing', 'mmmhmm', 'ceramics', 'chandra']
WORF
['hailed', 'uncloaking', 'alexander', 'kahless', 'incoming', 'battelh', 'adoptive', 'torva']
WESLEY
['mom', 'davies', 'repulser', 'activator', 'icospectrogram', 'custodian', 'rosseau', 'prixus']
Q
['capitan', 'capitain', 'mortals', 'sordid', 'pedantic', 'roam', 'smugness', 'humanities']


In [96]:
# One bar chart per main character: words spoken in each of the seven seasons.
season_numbers = range(1, 8)
for person in main_characters:
    box_fig = plt.figure('%s words over seasons' % person)
    season_axes = box_fig.gca()
    season_axes.bar(season_numbers, character_wc_dict[person][1])
    season_axes.set_xlabel('Season')
    season_axes.set_ylabel('Number of words said')
    season_axes.set_title('%s\'s words said over the 7 seasons of TNG' % person)
    plt.show()


In [107]:
# Plot total words per season
# zip(*...) lines up every character's season list, so each tuple holds one
# season's counts across all characters; summing a tuple gives that season's total.
# This always builds a new list (never an alias of a character's own list).
wc_per_season = [
    sum(season_counts)
    for season_counts in zip(*[v[1] for v in character_wc_dict.values()])
]
plt.bar(range(1, len(wc_per_season) + 1), wc_per_season)
plt.xlabel('Season')
plt.ylabel('Number of words said')
plt.title('Total words said over the 7 seasons of TNG')
plt.show()


In [121]:
# Bar chart of the total words spoken by each main character (all seasons combined).
character_positions = range(len(main_character_wc))
plt.bar(character_positions, [x[0] for x in main_character_wc])
plt.xlabel('Character')
plt.ylabel('Number of words said')
# Label each bar with the character's name, rotated so the names do not overlap.
plt.xticks(character_positions, [x[1] for x in main_character_wc], rotation=70)
# The bars are overall totals, not per-season values, so the title says so.
plt.title('Total words said per character')
plt.show()