In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt

In [2]:
# original data from 
# https://data.world/data-society/the-simpsons-by-the-data

In [3]:
# character names and ids
df_ch = pd.read_csv('./data/simpsons_characters.csv', error_bad_lines=False)

# memorable lines spoken
df_l = pd.read_csv('./data/simpsons_script_lines.csv', error_bad_lines=False)

# episodes 
df_ep = pd.read_csv('./data/simpsons_episodes.csv', error_bad_lines=False)

# did not use the location data

b'Skipping line 8084: expected 13 fields, saw 20\nSkipping line 52607: expected 13 fields, saw 21\nSkipping line 59910: expected 13 fields, saw 21\n'
b'Skipping line 71801: expected 13 fields, saw 20\nSkipping line 73539: expected 13 fields, saw 21\nSkipping line 77230: expected 13 fields, saw 21\nSkipping line 78953: expected 13 fields, saw 21\nSkipping line 81138: expected 13 fields, saw 20\nSkipping line 86746: expected 13 fields, saw 22\nSkipping line 101154: expected 13 fields, saw 21\nSkipping line 115438: expected 13 fields, saw 20\nSkipping line 117573: expected 13 fields, saw 22\nSkipping line 130610: expected 13 fields, saw 22\n'
b'Skipping line 152970: expected 13 fields, saw 22\nSkipping line 153017: expected 13 fields, saw 20\nSkipping line 153018: expected 13 fields, saw 30\nSkipping line 154080: expected 13 fields, saw 20\nSkipping line 154082: expected 13 fields, saw 20\nSkipping line 154084: expected 13 fields, saw 20\nSkipping line 154086: expected 13 fields, saw 20\n

In [4]:
#df_l.head()
print(df_ch.shape)
df_ch = df_ch.sort_values(['id'])
# df_ch.head(30)

(6722, 4)


## Get the characters with the most, middle, and least memborable lines

In [5]:
def create_ints_for_ids(x):
    try:
        number = int(x['character_id'])
        x['character_id'] = number
    except:
        x['character_id'] = 9999
    return x

In [6]:
# ensure all character ids are ints
df_l= df_l.apply(lambda x: create_ints_for_ids(x), axis=1)
# df_l.head()

In [7]:
# collect the list of characters with the most and least memorable lines
# df_lc['character_id'].astype('int')
df_lc = df_l.groupby(['character_id'])['spoken_words'].count()
df_lc = pd.DataFrame(df_lc)
df_lc = df_lc.reset_index()
# df_lc['character_id'].astype('int')
df_lc = df_lc.sort_values(['spoken_words'], ascending=False)
# df_lc.head(20)

In [8]:
def add_names(x, df_ch= df_ch):
    try:
        number = int(x['character_id'])
        character = df_ch[df_ch['id'] == number]['name']
        x['name'] = list(character)[0]
        x['character_id'] = number
    except:
        x['character_id'] = 9999
        x['name'] = 'unknown'
    return x

In [9]:
df_lc = df_lc.apply(lambda x: add_names(x), axis=1)
df_lc.head()

Unnamed: 0,character_id,spoken_words,name
1,2,28166,Homer Simpson
0,1,13289,Marge Simpson
7,8,13200,Bart Simpson
8,9,10893,Lisa Simpson
14,15,3121,C. Montgomery Burns


In [10]:
len(df_lc)

6720

### Characters with memorable lines broken down into thirds

In [11]:
# most memorable lines
df_l_top = df_lc.iloc[:20, :]
# df_l_top
len(df_l_top)

20

In [12]:
# middle most memorable lines
df_l_mid = df_lc[df_lc['spoken_words'] < 600]
df_l_mid = df_l_mid[df_l_mid['spoken_words'] > 250]
len(df_l_mid)


20

In [13]:
# least most memorable lines
df_l_low = df_lc[df_lc['spoken_words'] < 250]
# df_l_low

In [14]:
len(df_l_low)

6680

In [15]:
# tuples representing the (character_id, name) for each of the thirds
high = df_l_top.apply(lambda x: x['name'], axis=1)
mid = df_l_mid.apply(lambda x: x['name'], axis=1)
low = df_l_low.apply(lambda x: x['name'], axis=1)[:60]

In [16]:
#mid
low

804                 Kirk Van Houten
508                  Snake Jailbird
1394                Cletus Spuckler
418                    Troy McClure
74                            Crowd
145                           DOLPH
4                     Todd Flanders
342                     Lionel Hutz
1096             Rainier Wolfcastle
55                         Narrator
456                     Miss Hoover
2347                  Gil Gunderson
1489                 The Rich Texan
38                             Kids
929     Captain Horatio McCallister
409                            HERB
1954     Manjula Nahasapeemapetilon
213                    Judge Snyder
120                    Rod Flanders
304                  Maude Flanders
270                  Jasper Beardly
259                            Kang
407                    Mona Simpson
305                   Helen Lovejoy
307                             Man
344                Dr. Nick Riviera
560                           Louie
117                         

In [17]:
# add a list to the character objects that is a tuple of (episode, lines from episode)

### Get character lines per episode

In [18]:
# count lines per episode --> character object

In [19]:
# df_l.sample(10)

In [20]:
df_l.shape

(158248, 13)

In [21]:
df_one = df_l.iloc[:10000, :]
df_two = df_l.iloc[10000:20000, :]
df_three = df_l.iloc[20000:30000, :]
df_four = df_l.iloc[30000:40000, :]
df_five = df_l.iloc[40000:50000, :]
df_six = df_l.iloc[50000:60000, :]
df_seven = df_l.iloc[60000:70000, :]
df_eight = df_l.iloc[70000:80000, :]
df_nine = df_l.iloc[80000:90000, :]
df_ten = df_l.iloc[90000:100000, :]
df_eleven = df_l.iloc[100000:110000, :]
df_twelve = df_l.iloc[110000:120000, :]
df_thirteen = df_l.iloc[120000:130000, :]
df_fourteen = df_l.iloc[130000:140000, :]
df_fifteen = df_l.iloc[140000:145000, :]
df_sixteen = df_l.iloc[150000:, :]

In [22]:
def character_episode_lines(character_df, df_l=df_l):
    
    episodes = list(df_l['episode_id'].unique())
    character_df[episodes] = 0
    print(character_df.shape)

    for i in range(len(df_l)):
        try:
            character = df_l.iloc[i, 8]
            episode = df_l.iloc[i, 1]

            character_df.loc[character, episode] += 1       
        except:
            continue

    character_df = character_df.dropna()
    character_df = character_df.drop('placeholder', axis=1)

    return character_df

In [23]:
def create_character_df(chars):
    empty_column = np.zeros(len(chars))
    character_df = pd.DataFrame({'placeholder': empty_column}, index=chars)
    return character_df

In [24]:
def create_character_csv(df, character_scope, line_df_portion):
    """
    df is the character line counted by episode portion of the total lines
    character_scope and line_df_portion are strings specifying what part of the total lines
        df is counted
    """
    df = df.reset_index()
    df = df.rename({'index': 'name tuple'}, axis=1)
    
    file_name = f"./data/many_csv/{character_scope}/{character_scope}_{line_df_portion}.csv"
    df.to_csv(file_name, index=False)
    
    return("created: ", file_name)

In [25]:
portion_list = [(df_one, 'one'), (df_two, 'two'), (df_three, 'three'), (df_four, 'four'), (df_five, 'five'),
                (df_six, 'six'), (df_seven, 'seven'), (df_eight, 'eight'), (df_nine, 'nine'), (df_ten, 'ten'), 
                (df_eleven, 'eleven'), (df_twelve, 'twelve'), (df_thirteen, 'thirteen'), (df_fourteen, 'fourteen'),
                (df_sixteen, 'sixteen')]

In [26]:
def create_third_csv(third, string_name_third, portion_list):
    """
    arguments:
    - third is the list of (id, name) tuples for the third of the characters 
    - string name of the third is a parameter entered explicitly 
    - portion_list is the list of tuples (<df name>, string of df name) specfiying
        which part of the overall memorable line dataframe being worked with

    Secondary Effects:
    - function creates the csv files in the specified folder for each portion of the third

    returns:
    - string stating how many csv files were created
    """
    count = 0
    
    for d in portion_list:
        # blank dataframe to work from, need empty df each time
        df_ = create_character_df(third)
        df_lines = character_episode_lines(df_, d[0])
        create_character_csv(df_lines, string_name_third, d[1])
        count += 1
        
    return(f"created {count} csv files.")

### The command to create the partial csv for the Third

In [27]:
# create_third_csv(low, 'low', portion_list)

(60, 49)
(60, 37)
(60, 41)
(60, 40)
(60, 41)
(60, 34)
(60, 35)
(60, 38)
(60, 39)
(60, 43)
(60, 41)
(60, 48)
(60, 40)
(60, 42)
(60, 57)


'created 15 csv files.'

#### The fifteenth block has some issues and needs special consideration

In [28]:
#df = create_character_df(low)

In [29]:
# df_fif = df_fifteen.dropna()
# len(df_fif)

4120

In [30]:
# df_l = character_episode_lines(df, df_fif)

(60, 21)


In [31]:
# create_character_csv(df_l, 'low', 'fifteen')

('created: ', './data/many_csv/low/low_fifteen.csv')

## Combine CSVs into thirds

In [32]:
def combine_csv_files(df_one, df_two):

    """
    df_one has to be the dataframe with a larger number of columns
    """

    if len(df_one) != len(df_two):
        return("Problem with selection of thirds")

    cols = list(df_two.columns)

    for c in cols:
        try:
            df_one[c] += df_two[c]
        except:
            df_one[c] = df_two[c]

    return df_one

In [33]:
high_csv_files = ['high_eight.csv', 'high_five.csv', 'high_nine.csv', 'high_six.csv', 'high_thirteen.csv', 'high_two.csv', 
                  'high_eleven.csv', 'high_four.csv', 'high_one.csv', 'high_sixteen.csv', 'high_three.csv',
                  'high_fifteen.csv', 'high_fourteen.csv', 'high_seven.csv', 'high_ten.csv', 'high_twelve.csv']

In [34]:
mid_csv_files = ['mid_eight.csv', 'mid_five.csv', 'mid_nine.csv','mid_six.csv', 'mid_thirteen.csv',  
                 'mid_two.csv', 'mid_eleven.csv', 'mid_four.csv', 'mid_one.csv', 'mid_sixteen.csv', 'mid_three.csv',
                'mid_fifteen.csv', 'mid_fourteen.csv', 'mid_seven.csv', 'mid_ten.csv', 'mid_twelve.csv']
 

In [35]:
low_csv_files = ['low_eight.csv', 'low_five.csv', 'low_nine.csv', 'low_six.csv', 'low_thirteen.csv', 'low_two.csv',
                 'low_eleven.csv', 'low_four.csv', 'low_one.csv', 'low_sixteen.csv', 'low_three.csv',
                 'low_fifteen.csv', 'low_fourteen.csv', 'low_seven.csv', 'low_ten.csv', 'low_twelve.csv']

In [36]:
high_path = "./data/many_csv/high/"
mid_path = "./data/many_csv/mid/"
low_path = "./data/many_csv/low/"

In [37]:
def iterate_csv_files(path_name, csv_list):
    df_one = pd.read_csv(path_name + csv_list[0])
    df_one = df_one.set_index('name tuple')
 
    for f in csv_list[1:]:
        df_f = pd.read_csv(path_name + f)
        df_f = df_f.set_index('name tuple')
    
        if len(df_one.columns) > len(df_f.columns):
            df_one = combine_csv_files(df_one, df_f)
        else:
            df_one = combine_csv_files(df_f, df_one)
        
    return df_one

In [38]:
high_csv = iterate_csv_files(high_path, high_csv_files)

In [39]:
high_csv.to_csv('./data/combined_thirds/high_third.csv')

In [40]:
mid_csv = iterate_csv_files(mid_path, mid_csv_files)

In [41]:
mid_csv.to_csv('./data/combined_thirds/mid_third.csv')

In [42]:
low_csv = iterate_csv_files(low_path, low_csv_files)

In [43]:
low_csv.to_csv('./data/combined_thirds/low_third.csv')