# Calculating descriptive statistics by film and by year

In [1]:
import pandas as pd
import os
from glob import glob
import numpy as np

### Load the data

In [2]:
# Read the actor metrics CSV; the path is relative to the notebook's working directory.
actors = pd.read_csv("../data/actor-metrics.csv")
# Preview the first rows as the cell output.
actors.head()

Unnamed: 0,year,film,actor,characters,imdb,gender,race,race_simple,words,sentences
0,1989,born-on-the-fourth-of-july,Dale Dye,COLONEL,http://www.imdb.com/name/nm0245653/,male,White/Caucasian,White,216.0,15.0
1,1989,born-on-the-fourth-of-july,David Warshofsky,LIEUTENANT,http://www.imdb.com/name/nm0913175/,male,White/Caucasian,White,290.0,40.0
2,1989,born-on-the-fourth-of-july,Frank Whaley,TIMMY,http://www.imdb.com/name/nm0001844/,male,White/Caucasian,White,104.0,14.0
3,1989,born-on-the-fourth-of-july,Jerry Levine,STEVE,http://www.imdb.com/name/nm0505842/,male,White/Caucasian,White,300.0,35.0
4,1989,born-on-the-fourth-of-july,John F. Kennedy,KENNEDY,,male,White/Caucasian,White,217.0,5.0


### Define some analysis functions

In [3]:
def single_group(col, data=None):
    """
    Summarise sentences, words and cast size for each value of one column.

    Parameters
    ----------
    col : str
        Name of the column to group on.
    data : pandas.DataFrame, optional
        Frame to summarise. It must contain `col` plus the "sentences",
        "words" and "actor" columns. When omitted, the notebook-level
        `actors` frame is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of `col`, with the columns
        sentences (sum), words (sum), actor (count of non-null entries),
        percent_sentences, percent_words and percent_actor. Each percent
        column is that group's share of the column total across all groups.
    """
    frame = actors if data is None else data

    # String aggregation names select pandas' own group-wise reductions;
    # passing np.sum here is deprecated in recent pandas releases.
    totals = frame.groupby(col).agg({
        "sentences": "sum",
        "words": "sum",
        "actor": "count",
    })

    # Divide every column by its own grand total in one go, then label the
    # resulting shares percent_<column>.
    shares = (totals / totals.sum() * 100).add_prefix("percent_")
    return totals.join(shares)

In [4]:
def double_group(col1, col2, data=None):
    """
    Count actors for every (col1, col2) combination, one row per col1 value.

    Parameters
    ----------
    col1 : str
        Column whose values become the rows of the result.
    col2 : str
        Column whose values become the count columns. When it is "gender" or
        "race_simple", a "total" column and two percentage columns are added.
    data : pandas.DataFrame, optional
        Frame to summarise; must contain `col1`, `col2` and "actor". When
        omitted, the notebook-level `actors` frame is used.

    Returns
    -------
    pandas.DataFrame
        Integer counts of non-null "actor" entries per `col2` value.
        For "gender": plus total, female_percent, male_percent.
        For "race_simple": plus total, white_percent, poc_percent.
        "total" only adds up the two categories named in the percentages.
        Any other `col2` returns the plain count table.
    """
    frame = actors if data is None else data

    # Counting on the selected Series keeps the columns single-level after
    # unstacking; combinations that never occur are filled with 0.
    grouped = (
        frame.groupby([col1, col2])["actor"]
        .count()
        .unstack(fill_value=0)
        .astype(int)
    )

    # (category label, percentage column name), listed in output order.
    percent_specs = {
        "gender": [("female", "female_percent"), ("male", "male_percent")],
        "race_simple": [("White", "white_percent"), ("POC", "poc_percent")],
    }
    specs = percent_specs.get(col2)

    if specs is not None:
        labels = [label for label, _ in specs]
        # A category that is absent from the data would otherwise raise a
        # KeyError below; treat it as a count of zero instead.
        for label in labels:
            if label not in grouped.columns:
                grouped[label] = 0

        grouped["total"] = grouped[labels].sum(axis=1)
        for label, percent_name in specs:
            grouped[percent_name] = grouped[label] / grouped["total"] * 100

    # Blank the columns-axis name that unstack() set to `col2`.
    grouped.columns.name = ""

    return grouped

In [5]:
def double_word_group(unit, col1, col2, data=None):
    """
    Sum a numeric column per (col1, col2) combination and report each
    category's percentage of the row total.

    Parameters
    ----------
    unit : str
        Column to sum, e.g. "words" or "sentences".
    col1 : str
        Column whose values become the rows of the result.
    col2 : str
        Column whose values are compared. "gender" and "race_simple" get a
        total column and two percentage columns.
    data : pandas.DataFrame, optional
        Frame to summarise; must contain `unit`, `col1` and `col2`. When
        omitted, the notebook-level `actors` frame is used.

    Returns
    -------
    pandas.DataFrame
        For "gender": total_<unit>, female_percent, male_percent.
        For "race_simple": total_<unit>, white_percent, poc_percent.
        total_<unit> only adds up the two categories named in the
        percentages. For any other `col2` the summed table is returned
        as is, one column per `col2` value.
    """
    frame = actors if data is None else data

    # Sum the unit for every (col1, col2) pair; pairs that never occur are
    # filled with 0 and the sums are truncated to integers.
    bycols = (
        frame.groupby([col1, col2])[unit]
        .sum()
        .unstack(fill_value=0)
        .astype(int)
    )

    # (category label, percentage column name), listed in output order.
    percent_specs = {
        "gender": [("female", "female_percent"), ("male", "male_percent")],
        "race_simple": [("White", "white_percent"), ("POC", "poc_percent")],
    }
    specs = percent_specs.get(col2)

    if specs is None:
        # No percentage definition for this column: hand back the raw sums.
        return bycols.rename_axis(columns="")

    total_col = "total_{}".format(unit)
    labels = [label for label, _ in specs]

    # A category that is absent from the data would otherwise raise a
    # KeyError below; treat it as contributing zero.
    for label in labels:
        if label not in bycols.columns:
            bycols[label] = 0

    bycols[total_col] = bycols[labels].sum(axis=1)
    for label, percent_name in specs:
        bycols[percent_name] = bycols[label] / bycols[total_col] * 100

    # Pick the summary columns by name; a positional slice would return the
    # wrong columns whenever `col2` holds more than two distinct values.
    summary_cols = [total_col] + [percent_name for _, percent_name in specs]
    return bycols[summary_cols].rename_axis(columns="")


### Calculate the statistics

**Casting by gender across all films, all years**

In [6]:
# Sentence, word and actor totals, with percentage shares, for each gender value.
single_group('gender')

Unnamed: 0_level_0,sentences,words,actor,percent_sentences,percent_words,percent_actor
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,8114.0,51119.0,84,30.792,29.607942,31.698113
male,18237.0,121534.0,181,69.208,70.392058,68.301887


**Casting by race across all films, all years**

In [7]:
# Same summary, grouped on the detailed `race` column.
single_group('race')

Unnamed: 0_level_0,sentences,words,actor,percent_sentences,percent_words,percent_actor
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2+ Race,252.0,1621.0,9,0.95632,0.938877,3.396226
Asian,81.0,667.0,3,0.307389,0.386324,1.132075
Black/African American,1915.0,12031.0,15,7.267276,6.968312,5.660377
Hispanic/Latino,129.0,853.0,3,0.489545,0.494055,1.132075
White/Caucasian,23974.0,157481.0,235,90.979469,91.212432,88.679245


**Casting by simplified race across all films, all years**

In [8]:
# Same summary, grouped on the `race_simple` column.
single_group('race_simple')

Unnamed: 0_level_0,sentences,words,actor,percent_sentences,percent_words,percent_actor
race_simple,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
POC,2377.0,15172.0,30,9.020531,8.787568,11.320755
White,23974.0,157481.0,235,90.979469,91.212432,88.679245


**Casting by gender and year**

In [9]:
# Actor counts per year split by gender, plus female/male percentages.
double_group('year', 'gender')

Unnamed: 0_level_0,female,male,total,female_percent,male_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1989,12,37,49,24.489796,75.510204
2015,40,77,117,34.188034,65.811966
2017,32,67,99,32.323232,67.676768


**Casting by race and year**

In [10]:
# Actor counts per year split by simplified race, plus white/POC percentages.
double_group('year', 'race_simple')

Unnamed: 0_level_0,POC,White,total,white_percent,poc_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1989,4,45,49,91.836735,8.163265
2015,13,104,117,88.888889,11.111111
2017,13,86,99,86.868687,13.131313


**Number of sentences by gender and year**

In [11]:
# Sentence totals per year and the female/male percentage of each total.
double_word_group('sentences', 'year', 'gender')

Unnamed: 0_level_0,total_sentences,female_percent,male_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,5708,27.014716,72.985284
2015,10741,30.676846,69.323154
2017,9902,33.094324,66.905676


**Number of words by year and gender**

In [12]:
# Word totals per year and the female/male percentage of each total.
double_word_group('words', 'year', 'gender')

Unnamed: 0_level_0,total_words,female_percent,male_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,33725,26.407709,73.592291
2015,72918,27.95606,72.04394
2017,66010,33.067717,66.932283


**Number of sentences by race and year**

In [13]:
# Sentence totals per year and the white/POC percentage of each total.
double_word_group('sentences', 'year', 'race_simple')

Unnamed: 0_level_0,total_sentences,white_percent,poc_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,5708,86.580238,13.419762
2015,10741,94.488409,5.511591
2017,9902,89.70915,10.29085


**Number of words by race and year**

In [14]:
# Word totals per year and the white/POC percentage of each total.
double_word_group('words', 'year', 'race_simple')

Unnamed: 0_level_0,total_words,white_percent,poc_percent
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,33725,84.853966,15.146034
2015,72918,94.45953,5.54047
2017,66010,90.87411,9.12589


**Casting by race and film**

In [15]:
# Per-film cast counts by simplified race, ordered by descending POC percentage.
(
    double_group("film", "race_simple")
    .sort_values(by="poc_percent", ascending=False)
)

Unnamed: 0_level_0,POC,White,total,white_percent,poc_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
get-out,5,6,11,54.545455,45.454545
the-martian,5,9,14,64.285714,35.714286
driving-miss-daisy,2,4,6,66.666667,33.333333
room,3,9,12,75.0,25.0
mad-max,2,10,12,83.333333,16.666667
lady-bird,2,10,12,83.333333,16.666667
three-billboards-outside-ebbing-missouri,3,15,18,83.333333,16.666667
the-big-short,3,20,23,86.956522,13.043478
field-of-dreams,1,7,8,87.5,12.5
darkest-hour,1,10,11,90.909091,9.090909


**Casting by gender and film**

In [16]:
# Per-film cast counts by gender, ordered by descending female percentage.
(
    double_group("film", "gender")
    .sort_values(by="female_percent", ascending=False)
)

Unnamed: 0_level_0,female,male,total,female_percent,male_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mad-max,9,3,12,75.0,25.0
brooklyn,13,5,18,72.222222,27.777778
lady-bird,7,5,12,58.333333,41.666667
call-me-by-your-name,3,3,6,50.0,50.0
room,6,6,12,50.0,50.0
phantom-thread,2,2,4,50.0,50.0
driving-miss-daisy,3,3,6,50.0,50.0
three-billboards-outside-ebbing-missouri,7,11,18,38.888889,61.111111
field-of-dreams,3,5,8,37.5,62.5
get-out,4,7,11,36.363636,63.636364


**Number of words by gender and film**

In [17]:
# Per-film word totals with female/male percentages, highest female share first.
(
    double_word_group("words", "film", "gender")
    .sort_values(by="female_percent", ascending=False)
)

Unnamed: 0_level_0,total_words,female_percent,male_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lady-bird,7664,77.622651,22.377349
mad-max,3130,73.99361,26.00639
brooklyn,10591,70.125578,29.874422
room,8373,62.617939,37.382061
phantom-thread,5541,49.5037,50.4963
my-left-foot,4749,45.441146,54.558854
driving-miss-daisy,9675,42.821705,57.178295
three-billboards-outside-ebbing-missouri,9480,35.896624,64.103376
the-post,11135,35.671307,64.328693
get-out,7094,28.742599,71.257401


**Number of sentences by gender and film**

In [18]:
# Per-film sentence totals with female/male percentages, highest female share first.
(
    double_word_group("sentences", "film", "gender")
    .sort_values(by="female_percent", ascending=False)
)

Unnamed: 0_level_0,total_sentences,female_percent,male_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lady-bird,1148,78.310105,21.689895
brooklyn,1469,73.31518,26.68482
mad-max,694,72.766571,27.233429
room,1414,61.386139,38.613861
phantom-thread,783,47.509579,52.490421
my-left-foot,980,44.795918,55.204082
driving-miss-daisy,1520,44.342105,55.657895
three-billboards-outside-ebbing-missouri,1195,37.154812,62.845188
the-post,1528,36.060209,63.939791
get-out,1330,31.954887,68.045113


**Number of sentences by race and film**

In [19]:
# Per-film sentence totals with white/POC percentages, highest POC share first.
(
    double_word_group("sentences", "film", "race_simple")
    .sort_values(by="poc_percent", ascending=False)
)

Unnamed: 0_level_0,total_sentences,white_percent,poc_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
get-out,1330,50.902256,49.097744
driving-miss-daisy,1520,64.210526,35.789474
the-martian,1507,75.779695,24.220305
field-of-dreams,1299,84.064665,15.935335
the-shape-of-water,1347,85.077951,14.922049
mad-max,694,90.778098,9.221902
room,1414,93.493635,6.506365
three-billboards-outside-ebbing-missouri,1195,93.640167,6.359833
lady-bird,1148,95.731707,4.268293
the-big-short,1782,96.015713,3.984287


**Number of words by race and film**

In [20]:
# Per-film word totals with white/POC percentages, highest POC share first.
(
    double_word_group("words", "film", "race_simple")
    .sort_values(by="poc_percent", ascending=False)
)

Unnamed: 0_level_0,total_words,white_percent,poc_percent
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
get-out,7094,53.354948,46.645052
driving-miss-daisy,9675,63.503876,36.496124
the-martian,10550,75.459716,24.540284
the-shape-of-water,8036,83.586361,16.413639
field-of-dreams,8933,84.674801,15.325199
mad-max,3130,89.776358,10.223642
three-billboards-outside-ebbing-missouri,9480,91.751055,8.248945
room,8373,92.643019,7.356981
born-on-the-fourth-of-july,5151,95.961949,4.038051
the-big-short,13241,96.110566,3.889434


---

---

---