# Parts of Speech
Do female speakers really tend to use more adjectives/adverbs (flowery language)? How many commands do they use?

For this analysis, I use NLTK's default POS tagger (`nltk.pos_tag`), which assigns tags from the [Penn Treebank tagset](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html). The tagset marks both verb base forms (`VB`, used here as a proxy for commands) and modals (`MD`, for forms like "must", "could", "should").

In [1]:
import pandas as pd
import nltk

In [2]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run on
# another machine until this points at a local copy of the file.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
TOKTYPE_PICKLE = r"C:/Users/cassi/Desktop/Data_Science/Animated-Movie-Gendered-Dialogue/private/all_movies_toktype.pkl"
movie_df = pd.read_pickle(TOKTYPE_PICKLE)

In [4]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14096 entries, 0 to 14095
Data columns (total 14 columns):
Disney_Period       14096 non-null object
Gender              14096 non-null object
Movie               14096 non-null object
Role                14096 non-null object
Song                14096 non-null object
Speaker             14096 non-null object
Speaker_Status      14096 non-null object
Text                14096 non-null object
UTTERANCE_NUMBER    14096 non-null int64
Year                14096 non-null int64
Tokens              14096 non-null object
Types               14096 non-null object
Token_Count         14096 non-null int64
Type_Count          14096 non-null int64
dtypes: int64(4), object(10)
memory usage: 991.2+ KB


In [6]:
# Tag every row's token list with Penn Treebank POS tags.
# pos_tag_sents tags the whole batch in a single call instead of issuing one
# nltk.pos_tag call per row; each row still gets its own list of
# (token, tag) pairs, aligned back to movie_df by index.
movie_df['POS'] = pd.Series(
    nltk.pos_tag_sents(movie_df["Tokens"].tolist()),
    index=movie_df.index,
)

In [7]:
movie_df.head()

Unnamed: 0,Disney_Period,Gender,Movie,Role,Song,Speaker,Speaker_Status,Text,UTTERANCE_NUMBER,Year,Tokens,Types,Token_Count,Type_Count,POS
0,EARLY,f,Snow White,ANT,D,queen,NON-P,slave in the magic mirror come from the farthe...,1,1937,"[slave, in, the, magic, mirror, come, from, th...","{., !, slave, thy, face, from, me, the, and, c...",26,24,"[(slave, NN), (in, IN), (the, DT), (magic, JJ)..."
1,EARLY,m,Snow White,ANT,D,mirror,NON-P,"what wouldst thou know, my queen ?",2,1937,"[what, wouldst, thou, know, ,, my, queen, ?]","{,, what, thou, queen, ?, wouldst, know, my}",8,8,"[(what, WP), (wouldst, VBZ), (thou, NN), (know..."
2,EARLY,f,Snow White,ANT,D,queen,NON-P,"magic mirror on the wall, who is the fairest o...",3,1937,"[magic, mirror, on, the, wall, ,, who, is, the...","{,, the, all, is, mirror, fairest, ?, on, one,...",14,13,"[(magic, JJ), (mirror, NN), (on, IN), (the, DT..."
3,EARLY,m,Snow White,ANT,D,mirror,NON-P,"famed is thy beauty, majesty. but hold, a love...",4,1937,"[famed, is, thy, beauty, ,, majesty, ., but, h...","{., lovely, famed, maid, thy, her, than, ,, is...",33,27,"[(famed, VBN), (is, VBZ), (thy, JJ), (beauty, ..."
4,EARLY,f,Snow White,ANT,D,queen,NON-P,alas for her ! reveal her name.,5,1937,"[alas, for, her, !, reveal, her, name, .]","{., name, !, alas, for, reveal, her}",8,7,"[(alas, NN), (for, IN), (her, PRP$), (!, .), (..."


In [10]:
# Tally how often each POS tag occurs in the first row's tagged tokens.
pos_dict = {}
for _token, tag in movie_df['POS'].iloc[0]:
    pos_dict[tag] = pos_dict.get(tag, 0) + 1

In [11]:
pos_dict

{'NN': 9,
 'IN': 3,
 'DT': 2,
 'JJ': 2,
 'VBN': 1,
 'JJS': 1,
 'CC': 1,
 'VBP': 1,
 '.': 3,
 'VB': 2,
 'PRP': 1}

In [12]:
# Tag tally for the first row again, this time using NLTK's FreqDist.
first_row_tagged = movie_df['POS'].iloc[0]
tag_fd = nltk.FreqDist([tag for _word, tag in first_row_tagged])

In [16]:
tag_fd.most_common()

[('NN', 9),
 ('IN', 3),
 ('.', 3),
 ('DT', 2),
 ('JJ', 2),
 ('VB', 2),
 ('VBN', 1),
 ('JJS', 1),
 ('CC', 1),
 ('VBP', 1),
 ('PRP', 1)]

In [18]:
def _tag_freq(tagged_tokens):
    """Return an nltk.FreqDist counting the tags in a list of (word, tag) pairs."""
    return nltk.FreqDist(tag for _word, tag in tagged_tokens)


# One tag-frequency distribution per row, built from that row's POS list.
movie_df['Tag_Freq'] = movie_df['POS'].map(_tag_freq)

In [26]:
'VBN' in movie_df['Tag_Freq'].iloc[0]

True

In [35]:
movie_df['Tag_Freq'].iloc[0]['VBN']

1

In [36]:
# Proxy for commands: the number of base-form verb (VB) tags in each row.
# NOTE(review): VB also covers infinitives ("to be") and verbs that follow a
# modal ("can not hide"), so this over-counts true imperatives.
movie_df['Command_Count'] = movie_df['Tag_Freq'].map(
    lambda tag_freq: tag_freq['VB']
)

In [37]:
movie_df.head()

Unnamed: 0,Disney_Period,Gender,Movie,Role,Song,Speaker,Speaker_Status,Text,UTTERANCE_NUMBER,Year,Tokens,Types,Token_Count,Type_Count,POS,Tag_Freq,Command_Count
0,EARLY,f,Snow White,ANT,D,queen,NON-P,slave in the magic mirror come from the farthe...,1,1937,"[slave, in, the, magic, mirror, come, from, th...","{., !, slave, thy, face, from, me, the, and, c...",26,24,"[(slave, NN), (in, IN), (the, DT), (magic, JJ)...","{'NN': 9, 'IN': 3, 'DT': 2, 'JJ': 2, 'VBN': 1,...",2
1,EARLY,m,Snow White,ANT,D,mirror,NON-P,"what wouldst thou know, my queen ?",2,1937,"[what, wouldst, thou, know, ,, my, queen, ?]","{,, what, thou, queen, ?, wouldst, know, my}",8,8,"[(what, WP), (wouldst, VBZ), (thou, NN), (know...","{'WP': 1, 'VBZ': 1, 'NN': 3, ',': 1, 'PRP$': 1...",0
2,EARLY,f,Snow White,ANT,D,queen,NON-P,"magic mirror on the wall, who is the fairest o...",3,1937,"[magic, mirror, on, the, wall, ,, who, is, the...","{,, the, all, is, mirror, fairest, ?, on, one,...",14,13,"[(magic, JJ), (mirror, NN), (on, IN), (the, DT...","{'JJ': 1, 'NN': 2, 'IN': 2, 'DT': 3, ',': 1, '...",0
3,EARLY,m,Snow White,ANT,D,mirror,NON-P,"famed is thy beauty, majesty. but hold, a love...",4,1937,"[famed, is, thy, beauty, ,, majesty, ., but, h...","{., lovely, famed, maid, thy, her, than, ,, is...",33,27,"[(famed, VBN), (is, VBZ), (thy, JJ), (beauty, ...","{'VBN': 1, 'VBZ': 2, 'JJ': 3, 'NN': 7, ',': 3,...",1
4,EARLY,f,Snow White,ANT,D,queen,NON-P,alas for her ! reveal her name.,5,1937,"[alas, for, her, !, reveal, her, name, .]","{., name, !, alas, for, reveal, her}",8,7,"[(alas, NN), (for, IN), (her, PRP$), (!, .), (...","{'NN': 3, 'IN': 1, 'PRP$': 2, '.': 2}",0


In [40]:
# The tagger is not all that accurate on this text: in the output below the
# imperative "speak" and the pronoun "i" are both tagged NN, and the
# imperative "come" is tagged VBN.
movie_df['POS'].iloc[0] #not all that accurate

[('slave', 'NN'), ('in', 'IN'), ('the', 'DT'), ('magic', 'JJ'), ('mirror', 'NN'), ('come', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('farthest', 'JJS'), ('space', 'NN'), ('through', 'IN'), ('wind', 'NN'), ('and', 'CC'), ('darkness', 'NN'), ('i', 'NN'), ('summon', 'VBP'), ('thee', 'NN'), ('.', '.'), ('speak', 'NN'), ('!', '.'), ('let', 'VB'), ('me', 'PRP'), ('see', 'VB'), ('thy', 'JJ'), ('face', 'NN'), ('.', '.')]

In [39]:
%pprint

Pretty printing has been turned OFF


In [41]:
# Number of modal (MD) tags in each row.
# NOTE(review): this counts every modal regardless of subject, so first-person
# uses such as "i can't" are included alongside ones addressed to a listener.
movie_df['Modal_Count'] = movie_df['Tag_Freq'].map(
    lambda tag_freq: tag_freq['MD']
)

In [44]:
# Rows whose text contains more than one modal.
movie_df.loc[movie_df['Modal_Count'] > 1]

Unnamed: 0,Disney_Period,Gender,Movie,Role,Song,Speaker,Speaker_Status,Text,UTTERANCE_NUMBER,Year,Tokens,Types,Token_Count,Type_Count,POS,Tag_Freq,Command_Count,Modal_Count
7,EARLY,f,Snow White,PRO,S,snow white,PRINCESS,want to know a secret ? promise not to tell ? ...,8,1937,"[want, to, know, a, secret, ?, promise, not, t...","{we, soon, dreaming, echoing, are, hoping, wis...",115,55,"[(want, NN), (to, TO), (know, VB), (a, DT), (s...","{'NN': 21, 'TO': 7, 'VB': 11, 'DT': 9, 'JJ': 8...",11,3
21,EARLY,f,Snow White,PRO,D,snow white,PRINCESS,hello there. what's the matter ? where's your ...,22,1937,"[hello, there, ., what, 's, the, matter, ?, wh...","{'re, ., !, cry, are, they, ?, matter, you, 's...",69,44,"[(hello, NN), (there, RB), (., .), (what, WP),...","{'NN': 9, 'RB': 5, '.': 14, 'WP': 1, 'VBZ': 3,...",7,3
22,EARLY,m,Snow White,ANT,D,guard,NON-P,i can't ! i can't do it ! forgive me. i beg of...,23,1937,"[i, ca, n't, !, i, ca, n't, do, it, !, forgive...","{me, forgive, ., ,, your, !, highness, ca, n't...",22,14,"[(i, NN), (ca, MD), (n't, RB), (!, .), (i, VB)...","{'NN': 3, 'MD': 2, 'RB': 2, '.': 4, 'VB': 2, '...",2,2
35,EARLY,f,Snow White,PRO,D,snow white,PRINCESS,i can't sleep in the ground like you. or in a ...,36,1937,"[i, ca, n't, sleep, in, the, ground, like, you...","{., or, would, stay, ?, you, big, me, the, gro...",46,35,"[(i, NN), (ca, MD), (n't, RB), (sleep, VB), (i...","{'NN': 5, 'MD': 3, 'RB': 5, 'VB': 3, 'IN': 5, ...",3,3
38,EARLY,f,Snow White,PRO,D,snow white,PRINCESS,hello ? may i come in ? shh. oh! what a cute l...,39,1937,"[hello, ?, may, i, come, in, ?, shh, ., oh, !,...","{., !, little, may, too, ?, stocking, 's, from...",51,35,"[(hello, NNS), (?, .), (may, MD), (i, VB), (co...","{'NNS': 4, '.': 10, 'MD': 2, 'VB': 3, 'IN': 3,...",3,2
41,EARLY,f,Snow White,PRO,D,snow white,PRINCESS,"and just look at that broom. tsk, tsk, tsk ! t...",42,1937,"[and, just, look, at, that, broom, ., tsk, ,, ...","{., would, !, tsk, they, 'd, you, why, ,, moth...",34,23,"[(and, CC), (just, RB), (look, VB), (at, IN), ...","{'CC': 1, 'RB': 2, 'VB': 2, 'IN': 1, 'DT': 2, ...",2,2
42,EARLY,f,Snow White,PRO,D,snow white,PRINCESS,"maybe they have no mother. then, they're orpha...",43,1937,"[maybe, they, have, no, mother, ., then, ,, th...","{house, ., 're, stay, we, orphans, they, too, ...",38,27,"[(maybe, RB), (they, PRP), (have, VBP), (no, D...","{'RB': 5, 'PRP': 6, 'VBP': 3, 'DT': 3, 'NN': 3...",4,2
86,EARLY,m,Snow White,HELPER,D,sneezy,NON-P,don't do it. take them away. my nose ! my hay ...,87,1937,"[do, n't, do, it, ., take, them, away, ., my, ...","{., away, !, stand, oh, fever, ca, n't, hay, n...",30,18,"[(do, VBP), (n't, RB), (do, VB), (it, PRP), (....","{'VBP': 2, 'RB': 4, 'VB': 5, 'PRP': 4, '.': 6,...",5,2
89,EARLY,m,Snow White,HELPER,D,sneezy,NON-P,i couldn't help it. i can't tell. when you got...,90,1937,"[i, could, n't, help, it, ., i, ca, n't, tell,...","{help, ., tell, ,, could, ca, when, n't, iii, ...",30,19,"[(i, NN), (could, MD), (n't, RB), (help, VB), ...","{'NN': 5, 'MD': 2, 'RB': 5, 'VB': 3, 'PRP': 3,...",3,2
92,EARLY,m,Snow White,HELPER,D,various dwarfs,NON-P,don't let go. hold him tight. i'll tie it. mak...,93,1937,"[do, n't, let, go, ., hold, him, tight, ., i, ...","{make, ., ,, knot, hold, that, n't, there, tie...",26,19,"[(do, VBP), (n't, RB), (let, VB), (go, VB), (....","{'VBP': 1, 'RB': 2, 'VB': 6, '.': 5, 'PRP': 3,...",6,2


In [43]:
# Row 3 has Command_Count == 1, but its only VB is "hide" following the modal
# "can" ("rags can not hide ...") -- a false positive for the command proxy.
movie_df['POS'].iloc[3] #not a command....

[('famed', 'VBN'), ('is', 'VBZ'), ('thy', 'JJ'), ('beauty', 'NN'), (',', ','), ('majesty', 'NN'), ('.', '.'), ('but', 'CC'), ('hold', 'VBP'), (',', ','), ('a', 'DT'), ('lovely', 'RB'), ('maid', 'NN'), ('i', 'NN'), ('see', 'VBP'), ('.', '.'), ('rags', 'NNS'), ('can', 'MD'), ('not', 'RB'), ('hide', 'VB'), ('her', 'PRP$'), ('gentle', 'JJ'), ('grace', 'NN'), ('.', '.'), ('alas', 'NN'), (',', ','), ('she', 'PRP'), ('is', 'VBZ'), ('more', 'RBR'), ('fair', 'JJ'), ('than', 'IN'), ('thee', 'NN'), ('.', '.')]

In [46]:
# Summary statistics of the VB-based command count, split by Gender.
command_by_gender = movie_df.groupby('Gender')['Command_Count']
command_by_gender.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,4437.0,0.815867,1.750471,0.0,0.0,0.0,1.0,53.0
m,9221.0,0.773777,1.343745,0.0,0.0,0.0,1.0,29.0
n,438.0,0.687215,1.692728,0.0,0.0,0.0,1.0,25.0


In [50]:
# 53 is the largest Command_Count reported by the per-gender describe() above.
movie_df.loc[movie_df['Command_Count'] == 53, 'POS']  # lol, it's "Let It Go"

6835    [(the, DT), (snow, JJ), (glows, VBZ), (white, ...
Name: POS, dtype: object

In [51]:
# 6835 is the index *label* returned by the Command_Count == 53 filter, so look
# it up by label (.loc) rather than by position (.iloc); the two only coincide
# while movie_df keeps its default RangeIndex.
movie_df['POS'].loc[6835]

[('the', 'DT'), ('snow', 'JJ'), ('glows', 'VBZ'), ('white', 'JJ'), ('on', 'IN'), ('the', 'DT'), ('mountain', 'NN'), ('tonight', 'NN'), (',', ','), ('not', 'RB'), ('a', 'DT'), ('footprint', 'NN'), ('to', 'TO'), ('be', 'VB'), ('seen', 'VBN'), ('.', '.'), ('a', 'DT'), ('kingdom', 'NN'), ('of', 'IN'), ('isolation', 'NN'), (',', ','), ('and', 'CC'), ('it', 'PRP'), ('looks', 'VBZ'), ('like', 'IN'), ('i', 'NN'), ("'m", 'VBP'), ('the', 'DT'), ('queen', 'NN'), ('.', '.'), ('the', 'DT'), ('wind', 'NN'), ('is', 'VBZ'), ('howling', 'VBG'), ('like', 'IN'), ('this', 'DT'), ('swirling', 'VBG'), ('storm', 'NN'), ('inside', 'NN'), ('.', '.'), ('could', 'MD'), ("n't", 'RB'), ('keep', 'VB'), ('it', 'PRP'), ('in', 'IN'), (',', ','), ('heaven', 'JJ'), ('knows', 'NNS'), ('i', 'VBP'), ('tried', 'VBN'), ('.', '.'), ('do', 'VBP'), ("n't", 'RB'), ('let', 'VB'), ('them', 'PRP'), ('in', 'IN'), (',', ','), ('do', 'VBP'), ("n't", 'RB'), ('let', 'VB'), ('them', 'PRP'), ('see', 'VB'), (',', ','), ('be', 'VB'), ('the'

Questions:
* How can I exclude "to VB" (infinitive) forms from the command count?
* How can I count only modals that address another speaker (i.e., "you should" rather than "I should")?

In [None]:
#try the stanford tagger instead
