# Calculating word frequencies by category

This notebook calculates how often each non-stop word was spoken in the screenplay of each film by the actors included in our analysis, grouped separately by gender and by race. The output files are limited to words that were spoken __at least five times__ by actors of a given group in a given film.

In [1]:
import pandas as pd

In [2]:
# Columns used both to order the actor table and as the join keys
# between the word counts and the actor table.
CHAR_COLS = [ "year", "film", "actor" ]

In [3]:
# Load the actor table, keeping only the identifying columns plus the two
# demographic attributes, ordered by the identifying columns.
actor_info = (
    pd.read_csv("../data/actor-metrics.csv")
    .loc[:, CHAR_COLS + [ "gender", "race_simple" ]]
    .sort_values(CHAR_COLS)
)

In [4]:
# Attach each actor's gender and race to the per-character word counts.
# The left join keeps every row of character_words; rows with no matching
# actor in actor_info end up with NaN for gender and race_simple.
# NOTE(review): assumes actor_info holds one row per (year, film, actor);
# duplicate keys would repeat word rows in the result — confirm.
character_words = pd.read_csv("../data/character-word-counts.csv")

actor_words = character_words.merge(
    actor_info,
    how = "left",
    on = CHAR_COLS
)

actor_words.head()

Unnamed: 0,year,film,character,actor,word,count,gender,race_simple
0,1989,born-on-the-fourth-of-july,COACH,Richard Grusin,4th,1,male,White
1,1989,born-on-the-fourth-of-july,COACH,Richard Grusin,awrightl,1,male,White
2,1989,born-on-the-fourth-of-july,COACH,Richard Grusin,babies,1,male,White
3,1989,born-on-the-fourth-of-july,COACH,Richard Grusin,baby,2,male,White
4,1989,born-on-the-fourth-of-july,COACH,Richard Grusin,bleed,1,male,White


### Preview the total counts of non-stopwords

In [5]:
# Total word count per film, pivoted so each gender becomes a column.
# Film/gender combinations with no rows come out of unstack() as NaN,
# so they are filled with 0 before casting back to integers.
(
    actor_words
    .groupby([ "year", "film", "gender" ])["count"]
    .sum()
    .unstack()
    .fillna(0)
    .astype(int)
)

Unnamed: 0_level_0,gender,female,male
year,film,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,born-on-the-fourth-of-july,149,1414
1989,dead-poets-society,63,1583
1989,driving-miss-daisy,1170,2032
1989,field-of-dreams,560,2106
1989,my-left-foot,604,811
2015,bridge-of-spies,87,3075
2015,brooklyn,1854,736
2015,mad-max,730,266
2015,room,1368,940
2015,spotlight,406,3723


## Create tidy dataframe of word counts by group

In [6]:
def get_word_counts_by_group(var, words = None):
    """
    Total the word counts for each (year, film, group, word) combination.

    Parameters
    ----------
    var : str
        Name of the column to group by (e.g. "gender" or "race_simple").
        It is renamed to "group" so every result shares the same columns.
    words : pandas.DataFrame, optional
        Frame containing "year", "film", "word", "count" and the `var`
        column. Defaults to the notebook-level `actor_words`, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "film", "group", "word", "count", with one row per
        combination present in the input. Rows whose group value is
        missing are left out, because groupby skips NaN keys by default.
    """
    # Only fall back to the notebook global when no frame is supplied, so
    # the function can be reused (and checked) on any compatible frame.
    source = actor_words if words is None else words

    return (
        source
        .rename(columns = { var: "group" })
        .groupby([ "year", "film", "group", "word" ])["count"]
        .sum()
        .reset_index()
    )

In [7]:
# Build one tidy frequency table per demographic attribute.
gender_freqs, race_freqs = map(
    get_word_counts_by_group,
    ( "gender", "race_simple" )
)

gender_freqs.head()

Unnamed: 0,year,film,group,word,count
0,1989,born-on-the-fourth-of-july,female,age,1
1,1989,born-on-the-fourth-of-july,female,agree,1
2,1989,born-on-the-fourth-of-july,female,answer,1
3,1989,born-on-the-fourth-of-july,female,asked,1
4,1989,born-on-the-fourth-of-july,female,birthday,3


In [8]:
def get_top_words_by_group(df, min_count = 5):
    """
    Keep the frequently spoken words, most frequent first within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "year", "film", "group" and "count" columns.
    min_count : int, optional
        Minimum value of "count" a row needs in order to be kept
        (inclusive). Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` with count >= min_count, ordered by year, film and
        group ascending and then by count descending. The original index
        labels are preserved.
    """
    # Filter first so rows that will be discarded are never sorted.
    frequent = df[df["count"] >= min_count]

    # A single multi-key sort states the intended ordering directly,
    # instead of relying on a second sort preserving the order produced
    # by an earlier one.
    return frequent.sort_values(
        [ "year", "film", "group", "count" ],
        ascending = [ True, True, True, False ]
    )

In [9]:
# Apply the frequency filter and ordering to both grouped tables.
top_words_by_race, top_words_by_gender = map(
    get_top_words_by_group,
    ( race_freqs, gender_freqs )
)

top_words_by_gender.head()

Unnamed: 0,year,film,group,word,count
75,1989,born-on-the-fourth-of-july,female,ronnie,19
93,1989,born-on-the-fourth-of-july,female,tommy,5
405,1989,born-on-the-fourth-of-july,male,hey,22
251,1989,born-on-the-fourth-of-july,male,dad,20
643,1989,born-on-the-fourth-of-july,male,ronnie,17


In [10]:
# Write both tables to the output directory, leaving out the index column.
top_words_by_race.to_csv(
    path_or_buf = "../output/top-words-by-race.csv",
    index = False
)
top_words_by_gender.to_csv(
    path_or_buf = "../output/top-words-by-gender.csv",
    index = False
)

---

---

---