# Analysis

This notebook is NEW CONTINUING: I am piping in data objects created from other notebooks and analyzing them.

I will be combining datasets here and analyzing them based on my research questions.

## Loading in the data

In [1]:
# loading packages
import numpy as np
import pandas as pd
import nltk
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# load in the data frames

f1 = open('characters_update_df.pkl', 'rb')
characters_df = pickle.load(f1)
f1.close()

f2 = open('conversations_df.pkl', 'rb')
conversations_df = pickle.load(f2)
f2.close()

f3 = open('movies_df.pkl', 'rb')
movies_df = pickle.load(f3)
f3.close

# load in utterances from csv
utterances_df = pd.read_csv('./private/utterances_df.csv', index_col=0)

Let's make sure they all look okay.

In [3]:
characters_df.head()

Unnamed: 0,character_ID,character_name,movie_ID,movie_title,gender
0,u0,BIANCA,m0,10 things i hate about you,F
2,u2,CAMERON,m0,10 things i hate about you,M
4,u4,JOEY,m0,10 things i hate about you,M
5,u5,KAT,m0,10 things i hate about you,F
6,u6,MANDELLA,m0,10 things i hate about you,F


In [4]:
conversations_df.head()

Unnamed: 0,conversation_ID,character1_ID,character2_ID,movie_ID,dialogue
0,0,u0,u2,m0,L194
0,0,u0,u2,m0,L195
0,0,u0,u2,m0,L196
0,0,u0,u2,m0,L197
1,1,u0,u2,m0,L198


In [5]:
movies_df.head()

Unnamed: 0,movie_title,movie_year,genres,movie_decade
0,10 things i hate about you,1999,"['comedy', 'romance']",1990
1,1492: conquest of paradise,1992,"['adventure', 'biography', 'drama', 'history']",1990
2,15 minutes,2001,"['action', 'crime', 'drama', 'thriller']",2000
3,2001: a space odyssey,1968,"['adventure', 'mystery', 'sci-fi']",1960
4,48 hrs.,1982,"['action', 'comedy', 'crime', 'drama', 'thrill...",1980


In [6]:
utterances_df.head()

Unnamed: 0,line_ID,character_ID,movie_ID,character_name,utterance,sents,tokens,pos_tag,sent_count,token_count,avg_sent_length
0,L1045,u0,m0,BIANCA,They do not!,['They do not!'],"['They', 'do', 'not', '!']","[(They, 'PRON'), (do, 'VERB'), (not, 'PART'), ...",1,4,4.0
1,L1044,u2,m0,CAMERON,They do to!,['They do to!'],"['They', 'do', 'to', '!']","[(They, 'PRON'), (do, 'VERB'), (to, 'PART'), (...",1,4,4.0
2,L985,u0,m0,BIANCA,I hope so.,['I hope so.'],"['I', 'hope', 'so', '.']","[(I, 'PRON'), (hope, 'VERB'), (so, 'ADV'), (.,...",1,4,4.0
3,L984,u2,m0,CAMERON,She okay?,['She okay?'],"['She', 'okay', '?']","[(She, 'PRON'), (okay, 'ADJ'), (?, 'PUNCT')]",1,3,3.0
4,L925,u0,m0,BIANCA,Let's go.,"[""Let's go.""]","['Let', ""'s"", 'go', '.']","[(Let, 'VERB'), ('s, 'PRON'), (go, 'VERB'), (....",1,4,4.0


utterance df will have to be POS tagged within the analysis document...

In [22]:
# the pos_tag column is not working as a list of tuples
utterances_df['pos_tag'] = utterances_df['pos_tag'].fillna("[]").apply(lambda x: eval(x))

SyntaxError: invalid syntax (<string>, line 1)

They all look good I will review some basic information about the data and then compile the data.

## Data Overview

Let's review some basic information about the corpus before diving in.

In [None]:
# how many movies in the corpus?
movies_df.movie_title.count()

In [None]:
# what's the earliest year represented?
movies_df.movie_year.min()

In [None]:
# what's the latest year represented?
movies_df.movie_year.max()

In [None]:
# how many characters are in the corpus?
characters_df.character_name.count()

In [None]:
# how many utterances?
utterances_df.line_ID.count()

In [None]:
# how many tokens?
utterances_df.token_count.sum()

## Compiling the Data

### Linguistic Analysis

In [8]:
# because I am looking at discourse, I will use the utterances data frame and load other data into it

# adding gender
discourse_df = pd.merge(utterances_df, characters_df)

# adding movie year and decade
discourse_df = pd.merge(discourse_df, movies_df)

# adding conversation data
discourse_df = pd.merge(discourse_df, conversations_df, left_on='line_ID', right_on='dialogue').drop(columns=['character1_ID', 'character2_ID', 'movie_ID_y', 'dialogue'], axis=1)

#rename column
discourse_df.rename(columns={"movie_ID_x": "movie_ID"}, inplace=True)

In [10]:
# there are 267 utterances that are empty, I will remove them
discourse_df = discourse_df[discourse_df.token_count != 0]

In [11]:
discourse_df.head()

Unnamed: 0,line_ID,character_ID,movie_ID,character_name,utterance,sents,tokens,pos_tag,sent_count,token_count,avg_sent_length,movie_title,gender,movie_year,genres,movie_decade,conversation_ID
0,L1045,u0,m0,BIANCA,They do not!,['They do not!'],"['They', 'do', 'not', '!']","[(They, 'PRON'), (do, 'VERB'), (not, 'PART'), ...",1,4,4.0,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,24
1,L985,u0,m0,BIANCA,I hope so.,['I hope so.'],"['I', 'hope', 'so', '.']","[(I, 'PRON'), (hope, 'VERB'), (so, 'ADV'), (.,...",1,4,4.0,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,23
2,L925,u0,m0,BIANCA,Let's go.,"[""Let's go.""]","['Let', ""'s"", 'go', '.']","[(Let, 'VERB'), ('s, 'PRON'), (go, 'VERB'), (....",1,4,4.0,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,22
3,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.,"[""Okay -- you're gonna need to learn how to li...","['Okay', '--', 'you', ""'re"", 'gon', 'na', 'nee...","[(Okay, 'INTJ'), (--, 'PUNCT'), (you, 'PRON'),...",1,13,13.0,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,21
4,L870,u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,"[""I'm kidding."", 'You know how sometimes you j...","['I', ""'m"", 'kidding', '.', 'You', 'know', 'ho...","[(I, 'PRON'), ('m, 'AUX'), (kidding, 'VERB'), ...",3,25,8.333333,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,21


In [13]:
discourse_df.describe()

Unnamed: 0,sent_count,token_count,avg_sent_length,movie_year,movie_decade,conversation_ID
count,304403.0,304403.0,304403.0,304403.0,304403.0,304403.0
mean,1.695433,13.73612,7.856031,1988.106428,1983.434822,41482.442111
std,1.252371,14.712649,5.155597,17.141863,17.204418,23872.389444
min,1.0,1.0,1.0,1927.0,1920.0,0.0
25%,1.0,5.0,4.5,1984.0,1980.0,20784.0
50%,1.0,9.0,7.0,1995.0,1990.0,41575.0
75%,2.0,17.0,10.0,1999.0,1990.0,62115.0
max,45.0,684.0,122.0,2010.0,2010.0,83096.0


Per turn there is a wide range on the token counts, from 1 to 684. There is also a wide range per turn on sentence counts, from 1 to 45. I need to decide if I will include the max values in my analysis to follow. For now they will remain in.

#### Tokens

I am looking at token counts to see if there is a difference between turns by gender.

In [None]:
discourse_df.groupby('gender').token_count.mean()

Only 0.79 difference between average token counts of male and female characters. Characters with unknown gender markers have the longest turns at 14.7. Female characters have the shortest tokens per turn. Female and ambiguous characters fall below the mean and male and unknown are above the mean.

In [None]:
# raw counts of tokens across the corpus
discourse_df.groupby('gender').token_count.sum()

Raw count doesn't help for comparison across categories, but by looking at this we can see that there are more male characters because the average token count above does not differ as much as the raw counts.

In [None]:
# what is the shortest turn by token count?
discourse_df.groupby('gender').token_count.min()

Characters across all gender markers have utterances that are only one token long. I expect these will be interjections of some kind.

In [None]:
discourse_df[discourse_df.token_count==1]

At quick glance, some answers and greetings.

In [None]:
# what's the longest turn by token count?
discourse_df.groupby('gender').token_count.max()

In [None]:
# this is the longest token count per utterance
discourse_df[discourse_df.token_count==684]
# seems like it may be a narration intro...it may be removed from analysis

At the total corpus level there are some small differences between gender at the token level. I will factor in movie decade to see if any differences can be detected across time.

#### Token and Movie Decade

In [None]:
discourse_df.groupby(['movie_decade','gender']).token_count.mean()

In [None]:
discourse_df.groupby(['movie_decade','gender']).token_count.min()
# looks like not all movies have as short of utterances

In [None]:
discourse_df.groupby(['movie_decade','gender']).token_count.max()

#### Sentences

In [None]:
# how many sentences per turn?
discourse_df.groupby('gender').sent_count.mean()

The average utterance is less than two sentences long. There is not much difference across the categories. On average, male utterances have slightly more sentences. Again, unknown gender has the most (but barely) average sentences per utterance.

In [None]:
discourse_df.groupby('gender').sent_count.sum()
# more sentences for male characters

This is another view of what we saw above: there are more male characters, which is why the raw sentence count is higher but not the average sentence count.

In [None]:
# how many tokens per sentence?
discourse_df.groupby('gender').avg_sent_length.mean()
# average sentence length across all genders does not seem to be too wide of a spread

For each gender, sentences have on average around 8 words, although female characters have the shortest averages sentences at 7.67.

In [None]:
discourse_df.groupby('gender').sent_count.max()
# the longest turn by number of sentences

Commentary

In [None]:
discourse_df[(discourse_df.sent_count==18) & (discourse_df.gender=='A')]

In [None]:
discourse_df[(discourse_df.sent_count==33) & (discourse_df.gender=='F')]

In [None]:
discourse_df[discourse_df.sent_count==45]

In [None]:
discourse_df[(discourse_df.sent_count==24) & (discourse_df.gender=='unknown')]

The longest utterance by sentence and longest utterance by token count are from male characters.

#### Sentences and Movie Decade

How does the sentence level information change by decade?

In [None]:
discourse_df.groupby(['movie_decade','gender']).sent_count.mean()

COMMENTARY

In [None]:
discourse_df.groupby(['movie_decade','gender']).sent_count.min()

COMMENTARY

In [None]:
discourse_df.groupby(['movie_decade','gender']).sent_count.max()

COMMENTARY

In [None]:
discourse_df.groupby(['movie_decade', 'gender']).avg_sent_length.mean()

COMMENTARY

#### POS

I will ignore noun and verbs, as they are the most basic elements of phrase structure. I will look at adjectives and adverbs to see how often the speaker modifies their words, interjections to check for interruptions, and conjunctions to get an idea about sentence complexity.

The parts of speech I will look at are the following:
* ADV (adverb)
* ADJ (adjective)
* CCONJ (coordinating conjunction)
* INTJ (interjection)

In [17]:
discourse_df.head(1)

Unnamed: 0,line_ID,character_ID,movie_ID,character_name,utterance,sents,tokens,pos_tag,sent_count,token_count,avg_sent_length,movie_title,gender,movie_year,genres,movie_decade,conversation_ID
0,L1045,u0,m0,BIANCA,They do not!,['They do not!'],"['They', 'do', 'not', '!']","[(They, 'PRON'), (do, 'VERB'), (not, 'PART'), ...",1,4,4.0,10 things i hate about you,F,1999,"['comedy', 'romance']",1990,24


'('

In [14]:
# adverbs
def get_adv(x):
    pattern = r'ADV'
    advs = re.findall(pattern, ' '.join(str(z) for (y,z) in x))
    return advs

# adding data to the data frames
discourse_df['adv_count'] = discourse_df.pos_tag.apply(get_adv).str.len()

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# adjectives
def get_adj(x):
    pattern = r'ADJ'
    adjs = re.findall(pattern, ' '.join(str(z) for (y,z) in x))
    return adjs

# adding data to the data frames
discourse_df['adj_count'] = discourse_df.pos_tag.apply(get_adj).str.len()

In [None]:
# conjunctions
def get_conj(x):
    pattern = r'CONJ'
    conjs = re.findall(pattern, ' '.join(str(z) for (y,z) in x))
    return conjs

# adding data to the data frames
discourse_df['conj_count'] = discourse_df.pos_tag.apply(get_conj).str.len()

In [None]:
# interjections
def get_intj(x):
    pattern = r'INTJ'
    intjs = re.findall(pattern, ' '.join(str(z) for (y,z) in x))
    return intjs

# adding data to the data frames
discourse_df['intj_count'] = discourse_df.pos_tag.apply(get_intj).str.len()

In [None]:
discourse_df.head()

Let's see how each gender uses these different parts of speech. Because the POS I am analyzing are not required, the usages may be low and a min of 0 per turn can be expected. This is indicated by flashing the head of the data frame above.

In [None]:
print('There are',discourse_df.rb_count.sum(),'adverbs in the corpus.')
print('There are',discourse_df.jj_count.sum(),'adjectives in the corpus.')
print('There are',discourse_df.cc_count.sum(),'conjunctions in the corpus.')
print('There are',discourse_df.uh_count.sum(),'interjections in the corpus.')

Adverbs are by far the most common POS out of the four selected for analysis.

In [None]:
discourse_df.groupby('gender').agg({'rb_count': ['mean', 'min', 'max', 'std']})

Female characters have the highest mean of adverb usage at 0.97. So on avarage, almost every turn a female character will use an adverb in this corpus. However, the highest number of adverbs in one turn is from a male character with 35.

As the boxplot shows the tails across all genders are very long. The turns with 0 instances are bringing down the average.

In [None]:
sns.catplot(data=discourse_df, x='gender', y='rb_count', kind='box')
plt.title('Adverb Usage by Gender')
plt.show()

In [None]:
discourse_df.groupby('gender').agg({'jj_count': ['mean', 'min', 'max', 'std']})

Characters with unknown gender have the highest mean usage of adjectives at .66, they also have the largest standard deviation, so there is most variability in these character's usage. Again, the most adjectives in a turn is from a male character with 53.

In [None]:
sns.catplot(data=discourse_df, x='gender', y='jj_count', kind='box')
plt.title('Adjective Usage by Gender')
plt.show()

In [None]:
discourse_df.groupby('gender').agg({'cc_count': ['mean', 'min', 'max', 'std']})

'Unknown' characters have the most conjunctions, but only by 0.01. Overall usage seems to be very consistent across the board. Once again, male characters have the highest number of conjunctions for one turn.

In [None]:
sns.catplot(data=discourse_df, x='gender', y='jj_count', kind='box')
plt.title('Conjunction Usage by Gender')
plt.show()

In [None]:
discourse_df.groupby('gender').agg({'uh_count': ['mean', 'min', 'max', 'std']})

Very low usage across the board, with female being the higest at 0.08. Both male and female characters have 7 interjections as a maximum per turn.

In [None]:
sns.catplot(data=discourse_df, x='gender', y='uh_count', kind='box')
plt.title('Interjection Usage by Gender')
plt.show()

#### Lexical Items

I will look to see if these hedging words appear more in one specific gender's speech or not:
* I guess
* I think
* Maybe
* Might
* Perhaps
* Possibly

In [None]:
# i guess
def get_guess(x):
    pattern = r'\\bI guess\\b'
    guesses = re.findall(pattern, ' '.join(str(y) for y in x))
    return guesses

# adding data to the data frames
discourse_df['guess_count'] = discourse_df.tokens.apply(get_guess).str.len()

In [None]:
# i think
def get_think(x):
    pattern = r'\\bI think\\b'
    thinks = re.findall(pattern, ' '.join(str(y) for y in x))
    return thinks

# adding data to the data frames
discourse_df['think_count'] = discourse_df.tokens.apply(get_think).str.len()

In [None]:
# maybe
def get_maybe(x):
    pattern = r'\\b[mM]aybe\\b'
    maybes = re.findall(pattern, ' '.join(str(y) for y in x))
    return maybes

# adding data to the data frames
discourse_df['maybe_count'] = discourse_df.tokens.apply(get_maybe).str.len()

In [None]:
# might
def get_might(x):
    pattern = r'\\b[mM]ight\\b'
    mights = re.findall(pattern, ' '.join(str(y) for y in x))
    return mights

# adding data to the data frames
discourse_df['might_count'] = discourse_df.tokens.apply(get_might).str.len()

In [None]:
# perhaps
def get_perhaps(x):
    pattern = r'\\b[pP]erhaps\\b'
    perhapses = re.findall(pattern, ' '.join(str(y) for y in x))
    return perhapses

# adding data to the data frames
discourse_df['perhaps_count'] = discourse_df.tokens.apply(get_perhaps).str.len()

In [None]:
# possibly
def get_possibly(x):
    pattern = r'\\b[pP]ossibly\\b'
    possiblys = re.findall(pattern, ' '.join(str(y) for y in x))
    return possiblys

# adding data to the data frames
discourse_df['possibly_count'] = discourse_df.tokens.apply(get_possibly).str.len()

In [None]:
discourse_df.maybe_count.sum()

### Gender/Movie Analysis

In [None]:
# it may be interesting to see character/gender information by year/decade

gender_df = pd.merge(characters_df, movies_df)

In [None]:
gender_df.head()

In [None]:
gender_df.groupby(['movie_decade', 'gender']).count()

In [None]:
gender_df[gender_df.movie_decade==1920]

Aside from the 1920s, which only had two movies, all other decades have more male characters than female characters.

## Conversations between genders

I will add gender markers to the conversation_df and see how the conversations are split up by the gender of each speaker.

In [None]:
# add gender for character1_ID
conversations_df = pd.merge(conversations_df, characters_df, left_on='character1_ID', right_on='character_ID').drop(columns=['movie_ID_y', 'movie_title', 'character_ID'], axis=1)

In [None]:
# add gender for character2_ID
conversations_df = pd.merge(conversations_df, characters_df, left_on='character2_ID', right_on='character_ID').drop(columns=['movie_ID', 'movie_title', 'character_ID'], axis=1)

In [None]:
conversations_df

In [None]:
# everything got merged, lets rename the columns to tidy up some
conversations_df.rename(columns={"movie_ID_x": "movie_ID", "character_name_x": "character1_name", "gender_x": "gender1", "character_name_y": "character2_name", "gender_y": "gender2"}, inplace=True)

In [None]:
# rearrange the columns
conversations_df = conversations_df[['character1_ID', 'character1_name', 'gender1', 'character2_ID', 'character2_name', 'gender2', 'movie_ID', 'dialogue']]

In [None]:
conversations_df.head()

Possible gender pairings:

A - A<br>
F - F<br>
M - M<br>
U - U<br>
A - F<br>
A - M<br>
A - U<br>
F - M<br>
F - U<br>
M - U<br>

In [None]:
# function to return conversational gender pairs
def gender_pairs(a, b):
    if a == 'M' and b == 'F' or a == 'F' and b == 'M':
        return 'F:M'
    elif a == 'M' and b == 'A' or a == 'A' and b == 'M':
        return 'A:M'
    elif a == 'M' and b == 'unknown' or a == 'unknown' and b == 'M':
        return 'M:unknown'
    elif a == 'A' and b =='F' or a == 'F' and b =='A':
        return 'A:F'
    elif a == 'A' and b == 'unknown' or a == 'unknown' and b == 'A':
        return 'A:unknown'
    elif a == 'F' and b == 'unknown' or a == 'unknown' and b == 'F':
        return 'F:unknown'
    elif a == 'A' and b == 'A':
        return 'A:A'
    elif a == 'F' and b == 'F':
        return 'F:F'
    elif a == 'M' and b == 'M':
        return 'M:M'
    else:
        return 'unknown:unknown'    

In [None]:
# add column combining gender pairs
conversations_df['gender_pair'] = conversations_df.apply(lambda x: gender_pairs(a = x['gender1'], b = x['gender2']), axis=1)

In [None]:
conversations_df.head()

In [None]:
conversations_df.groupby('gender_pair').count()

In [None]:
sns.countplot(x = "gender_pair",
             data = conversations_df)
plt.xticks(rotation=75)
plt.show()