In [275]:
from collections import defaultdict, Counter
from tqdm import tqdm
import csv
import os
import pandas as pd
import numpy as np
import random

# Gender inference

Many of the conversations in the manosphere focus on relationships between men and women. 

In [107]:
ROOT = '/mnt/data0/lucy/manosphere/'
ANN_FILE = ROOT + 'data/ann_sig_entities.csv'
COREF_LOGS = '/mnt/data0/dtadimeti/manosphere/logs/'
COREF_REDDIT = COREF_LOGS + 'coref_reddit/'
COREF_FORUMS = COREF_LOGS + 'coref_forums/'
COREF_CONTROL = COREF_LOGS + 'coref_control/'
COREF_RESULTS = ROOT + 'logs/coref_results/'

In [188]:
def load_vocabulary(): 
    words = []
    with open(ANN_FILE, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader: 
            if row['keep'] == 'Y': 
                if row['entity'].lower() == 'she' or row['entity'].lower() == 'he': 
                    continue
                words.append(row['entity'].lower())
    return words

Check that categories are all neat and clean with no NaNs. 

In [258]:
reddit_df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')
cats = set(reddit_df.community.unique()) | set(forum_df.community.unique())
print(cats)
forum_df = pd.read_csv(COREF_RESULTS + 'coref_forum_df.csv')
df = pd.concat([reddit_df, forum_df])
control_df = pd.read_csv(COREF_RESULTS + 'coref_CONTROL_df.csv')
print(control_df.community.unique())

{'Femcels', 'the_attraction', 'TRP', 'pua_forum', 'MRA', 'red_pill_talk', 'Incels', 'MGTOW', 'avfm', 'rooshv', 'mgtow', 'incels', 'FDS', 'PUA'}
['CONTROL']


In [252]:
fem_total = sum(df['fem'].to_list())
masc_total = sum(df['masc'].to_list())
print("% fem:", fem_total / (fem_total + masc_total))
print("% masc:", masc_total / (fem_total + masc_total))

% fem: 0.5852356166540743
% masc: 0.4147643833459257


### Word coverage

We want to get a sense of how many words are gendered. Words that are not gendered likely don't show up often enough to matter much, but there is a long tail that could be important to consider. 

**Step 1**: semantically gendered nouns, or nouns gendered by definition

We left out nouns that are socially gendered, e.g. "nurse". We use singular and plural forms. 

Hoyle et al. (2019) - man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, woman, women, girl, girls, mother, mothers, daughter, daugheters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses

Additional - male, males, dude, dudes, boyfriend, boyfriends, bf, female, females, chick, chicks, girlfriend, girlfriends, gf

In [271]:
men_markers = 'man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, '
men_markers += 'male, males, dude, dudes, boyfriend, boyfriends, bf'
men_markers = set(men_markers.split(', '))
women_markers = 'woman, women, girl, girls, mother, mothers, daughter, daugheters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses, '
women_markers += 'female, females, chick, chicks, girlfriend, girlfriends, gf'
women_markers = set(women_markers.split(', '))
print(men_markers)
print(women_markers)

{'fathers', 'emperors', 'actors', 'boy', 'kings', 'dude', 'dudes', 'postmen', 'nephew', 'hero', 'male', 'steward', 'sons', 'postman', 'lord', 'bf', 'king', 'waiter', 'boyfriend', 'bothers', 'knights', 'wizards', 'brother', 'boyfriends', 'males', 'waiters', 'princes', 'gods', 'prince', 'uncle', 'emperor', 'dukes', 'actor', 'nephews', 'boys', 'duke', 'father', 'husbands', 'knight', 'heros', 'policemen', 'wizard', 'stewards', 'men', 'son', 'husband', 'god', 'policeman', 'man', 'uncles', 'lords'}
{'girls', 'wives', 'chicks', 'empresses', 'princess', 'lady', 'queen', 'actresses', 'women', 'witches', 'princesses', 'gf', 'dame', 'stewardess', 'females', 'wife', 'actress', 'duchess', 'heroines', 'nieces', 'girlfriend', 'ladies', 'aunts', 'waitress', 'girl', 'queens', 'policewoman', 'duchesses', 'policewomen', 'waitresses', 'goddesses', 'female', 'daughter', 'mothers', 'postwomen', 'aunt', 'heroine', 'woman', 'sisters', 'daugheters', 'empress', 'chick', 'goddess', 'stewardesses', 'girlfriends',

In [277]:
marked_vocab_men = set()
marked_vocab_women = set()
vocab = load_vocabulary()
for w in vocab: 
    w_tokens = set(w.split())
    if w_tokens & men_markers: 
        marked_vocab_men.add(w)
    elif w_tokens & women_markers:
        marked_vocab_women.add(w)
marked_vocab = marked_vocab_men | marked_vocab_women
print("# of marked men:", len(marked_vocab_men), random.sample(marked_vocab_men, 10))
print("# of marked women:", len(marked_vocab_women), random.sample(marked_vocab_women, 10))

# of marked men: 695 ['good boys', 'beta boyfriend', 'male groups', 'alpha males', 'hot dude', 'handsome dude', 'weak man', 'business men', 'human male', 'male ones']
# of marked women: 1009 ['polish women', 'college girl', 'feminist woman', 'sorority sisters', '20 girls', 'local girl', 'white girls', '1 woman', 'traditional women', 'perfect woman']


**Step 2:** coreference resolution for singular nouns 

In [253]:
totals = df.groupby('word').sum()
totals['total'] = totals['fem'] + totals['masc']
totals = totals[totals['total'] > 20]
totals.sort_values(by=['total'])

Unnamed: 0_level_0,year,fem,masc,they,it,you,total
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
crackhead,50383,8,13,0,14,0,21
real mgtow,36301,3,18,10,30,0,21
fat pig,54405,18,3,7,12,0,21
fat one,50400,15,6,4,12,0,21
current gf,96698,18,3,30,48,0,21
...,...,...,...,...,...,...,...
wife,229635,97171,865,91,905,655,98036
man,231646,4311,226660,314,396,4190,230971
guy,231646,4657,280506,610,4066,732,285163
woman,235626,356676,1716,1824,2649,1392,358392


In [254]:
solid_labels = totals.index.to_list()
vocab = load_vocabulary()
missing = set(vocab) - set(solid_labels)
print("NO COREF SIGNAL:", len(missing) / len(vocab))
print("COREF SIGNAL:",len(solid_labels)/ len(vocab))
# TODO: should probably get a better seed set
counted = 0
new_missing = set()
for w in missing: 
    tokens = w.split()
    if tokens[-1] in gendered_words: 
        counted += 1
    elif tokens[0] in gendered_words: 
        counted += 1
    else: 
        new_missing.add(w)
print("NO COREF + NO MARKERS:", len(new_missing) / len(vocab))
print(new_missing)

NO COREF SIGNAL: 0.688549387384662
COREF SIGNAL: 0.3114506126153381
NO COREF + NO MARKERS: 0.5242777189532597
{'cheating whores', 'religious zealots', 'ugly subhuman', '4 kids', 'reasonable people', 'jewish community', 'childrens', 'egalitarians', 'bigots', 'fucking dick', 'successful ones', 'older kids', 'first love', 'complete loser', 'business people', 'adults', 'toastmasters', 'lender', 'happy family', 'goths', 'ideologues', 'boogeyman', 'selectors', 'djs', 'sedditor', 'sjw crowd', 'musicians', 'other humans', 'two others', 'afghans', 'hot bitches', 'matriarch', 'supermodels', 'public figures', 'sexual being', 'instructors', 'oppressed group', 'good therapist', 'victors', 'bisexuals', 'directors', 'perfect match', 'shitty ones', 'white folks', 'volcels', 'tradesman', 'blackcels', 'japanese people', 'dreamers', 'potential rapist', 'co workers', 'awkward person', 'priests', 'fucktoy', 'hunters', 'most kids', 'travellers', 'young people', 'normals', 'ideologue', 'psychologists', 'danc

**Step 3**: plural nouns take on gender of singular nouns

**Step 4:** bigrams take on unigram gender if modifier does not change semantic gender

### Popular fem words in manosphere++

In [255]:
def show_top_fem(cat): 
    cat_df = df[df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_fem = cat_totals[cat_totals['fem_frac'] == 1]
    return cat_fem.sort_values(by=['total'])

In [259]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem(cat).head())

------Femcels-------
                year  fem  masc  they  it  you  total  fem_frac
word                                                           
poor woman      4037   11     0     0   0    0     11       1.0
actress         4037   12     0     0   0    0     12       1.0
beautiful girl  4037   12     0     0   1    0     12       1.0
hot girl        4037   12     0     0   0    0     12       1.0
other woman     4037   13     0     4   0    0     13       1.0
------the_attraction-------
                    year  fem  masc  they  it  you  total  fem_frac
word                                                               
poor woman         12058   11     0     0   0    0     11       1.0
sorority girl      12061   11     0     0   0    0     11       1.0
duff                6021   11     0     0   1    0     11       1.0
escort             16083   11     0     1   2    0     11       1.0
unattractive girl  10045   11     0     0   0    0     11       1.0
------TRP-------
          

### Popular masc words in reddit

In [260]:
def show_top_masc(cat): 
    cat_df = df[df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['masc_frac'] = cat_totals['masc'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_masc = cat_totals[cat_totals['masc_frac'] == 1]
    return cat_masc.sort_values(by=['total'])

In [261]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_masc(cat).head())

------Femcels-------
             year  fem  masc  they  it  you  total  masc_frac
word                                                         
black man    4037    0    12     0   0    0     12        1.0
same guy     4037    0    13     0   0    0     13        1.0
average man  4037    0    15     0   0    0     15        1.0
first guy    4037    0    15     0   0    0     15        1.0
ugly guy     4037    0    15     0   0    0     15        1.0
------the_attraction-------
              year  fem  masc  they  it  you  total  masc_frac
word                                                          
white guy    12059    0    11     0   0    0     11        1.0
cabbie       10040    0    11     0   1    0     11        1.0
hitman       10047    0    11     0   0    1     11        1.0
rich guy      8036    0    11     0   0    0     11        1.0
married guy   8038    0    11     0   0    0     11        1.0
------TRP-------
              year  fem  masc  they  it  you  total  masc_f

### Popular neut words in reddit

In [262]:
def show_top_neut(cat): 
    cat_df = df[df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_neut = cat_totals[cat_totals.fem_frac.between(0.48, 0.52)]
    return cat_neut.sort_values(by=['total'], ascending = False)

In [263]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_neut(cat).head())

------Femcels-------
            year  fem  masc  they  it  you  total  fem_frac
word                                                       
partner     4037   58    58    33  31    0    116  0.500000
child       4037   30    32     3  16    0     62  0.483871
teacher     4037   26    25     0   0    0     51  0.509804
looksmatch  4037   12    13     1  12    0     25  0.480000
classmate   4037    6     6     0   0    0     12  0.500000
------the_attraction-------
              year  fem  masc  they  it  you  total  fem_frac
word                                                         
cousin       28161  247   239     2   5    4    486  0.508230
friend here  16068   28    28     1   0    0     56  0.500000
colleague    22122   25    26     0   0    0     51  0.490196
caveman      18090   11    11     0   9    0     22  0.500000
counselor    18090    9     9     0   0    0     18  0.500000
------TRP-------
           year  fem  masc  they   it  you  total  fem_frac
word                

### Words with "it" 

In [174]:
df.sort_values(by=['it'], ascending = False).head(15)

Unnamed: 0,month,community,word,fem,masc,neut,it,you
164956,2018-05,Incels,incel,19,108,0,114,0
16802,2018-04,Incels,incel,12,120,0,98,0
164828,2018-05,Incels,incels,13,10,0,98,0
16720,2018-04,Incels,incels,7,15,0,84,0
228451,2018-07,Incels,incel,11,99,0,83,0
146995,2018-08,Incels,incel,14,89,0,82,0
261759,2018-06,Incels,incel,25,112,0,81,0
327476,2018-03,Incels,incels,4,15,0,77,1
261810,2018-06,Incels,incels,9,10,0,76,0
11053,2012-11,MRA,child,37,94,0,73,0


### Gender over time 

In [232]:
# These are the words whose % fem ranges over time are the largest,
# maybe top three words with biggest % range in each community, e.g. “cat	10% - 50%”, only calculate fraction 
# if there are more than 10 occurrences in each community and month. 

def show_top_fem_range(cat):
    # filtering for the argument category
    cat_df = df[df.community == cat]
    totals = cat_df.groupby(['year', 'word'], as_index = False).sum()
    totals['total'] = totals['fem'] + totals['masc']
    totals = totals[totals['total'] > 10]
    totals['fem_frac'] = totals['fem'] / (totals['fem'] + totals['masc'])
    
    # filter for words that show up in more than 1 of the months/time periods 
    # (initially picked 85 to be more than half of all the months but idk if needed)
    is_multi = totals["word"].value_counts() > 1
    filtered = totals[totals["word"].isin(is_multi[is_multi].index)]
    
    # get the max and min fem_frac for each word
    word_keys = filtered['word'].unique().tolist()
    max_fems = []
    min_fems = []
    max_months = []
    min_months = []
    for word in word_keys: 
        df_subset = filtered[filtered['word'] == word]
        max_fem = df_subset['fem_frac'].max()
        min_fem = df_subset['fem_frac'].min()
        max_month = df_subset[df_subset['fem_frac'] == max_fem]['year'].max()
        min_month = df_subset[df_subset['fem_frac'] == min_fem]['year'].min()
        
        max_fems.append(max_fem)
        min_fems.append(min_fem)
        max_months.append(max_month)
        min_months.append(min_month)
    
    
    d = {'word': [], 'min month': [], 'min': [], 'max month': [], 'max':[], 'diff': []}
    for i in range(len(word_keys)):
        d['word'].append(word_keys[i])
        d['min month'].append(min_months[i])
        d['min'].append(min_fems[i])
        d['max month'].append(max_months[i])
        d['max'].append(max_fems[i])
        d['diff'].append(max_fems[i] - min_fems[i])
    
    diffs = pd.DataFrame(data=d)
    return diffs.sort_values(by = ['diff'], ascending = False)


In [237]:
# words in each month in Incels that appear more than 10 times in that month
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem_range(cat).head())

------PUA-------
              word  min month       min  max month       max      diff
74     good friend       2015  0.181818       2019  0.666667  0.484848
90      one friend       2012  0.200000       2011  0.666667  0.466667
120           baby       2012  0.545455       2019  1.000000  0.454545
138   other person       2013  0.222222       2017  0.666667  0.444444
29   mutual friend       2019  0.500000       2010  0.937500  0.437500
------MRA-------
          word  min month       min  max month       max      diff
126     anyone       2012  0.045455       2017  0.642857  0.597403
141       cops       2016  0.347826       2013  0.857143  0.509317
216  bartender       2017  0.250000       2019  0.750000  0.500000
44         cop       2010  0.272727       2018  0.750000  0.477273
220  detective       2018  0.000000       2019  0.470588  0.470588
------Incels-------
           word  min month       min  max month       max      diff
13   girlfriend       2019  0.094544       2013  0

TypeError: can only concatenate str (not "float") to str

### Popular words in control

In [161]:
control_df = pd.read_csv('pronoun_control_df.csv')

def show_top_fem_control(): 
    control_totals = control_df.groupby('word').sum()
    control_totals['total'] = control_totals['fem'] + control_totals['masc'] 
    # filter to only those that appear more than 10 times as she or he
    control_totals = control_totals[control_totals['total'] > 10] 
    control_totals['fem_frac'] = control_totals['fem'] / (control_totals['fem'] + control_totals['masc'])
    control_fem = control_totals[control_totals['fem_frac'] == 1]
    return control_fem.sort_values(by=['total'], ascending = False)

def show_top_masc_control(): 
    control_totals = control_df.groupby('word').sum()
    control_totals['total'] = control_totals['fem'] + control_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    control_totals = control_totals[control_totals['total'] > 10] 
    control_totals['masc_frac'] = control_totals['masc'] / (control_totals['fem'] + control_totals['masc'])
    control_masc = control_totals[control_totals['masc_frac'] == 1]
    return control_masc.sort_values(by=['total'], ascending = False)

def show_top_neut_control(): 
    control_totals = control_df.groupby('word').sum()
    control_totals['total'] = control_totals['fem'] + control_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    control_totals = control_totals[control_totals['total'] > 10] 
    control_totals['fem_frac'] = control_totals['fem'] / (control_totals['fem'] + control_totals['masc'])
    control_neut = control_totals[control_totals['fem_frac'].between(0.45, 0.55)]
    return control_neut.sort_values(by=['total'], ascending = False)

FileNotFoundError: [Errno 2] File b'pronoun_control_df.csv' does not exist: b'pronoun_control_df.csv'

In [None]:
show_top_neut_control()

### Gender differences

In [54]:
# len(df.word.unique()
df = df.groupby('word').sum()
df['total'] = df['fem'] + df['masc']
df = df[df['total'] > 10] 
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df.sort_values(by=['fem'], ascending = False)
df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
girl,308205,2567,226,310772,0.991740
woman,248514,1024,838,249538,0.995896
wife,74069,576,55,74645,0.992283
mother,31271,275,139,31546,0.991283
mom,23830,893,186,24723,0.963880
...,...,...,...,...,...
physicist,0,29,1,29,0.000000
tall man,0,59,1,59,0.000000
soccer player,0,11,0,11,0.000000
great leader,0,12,0,12,0.000000


In [56]:
control_df = control_df.groupby('word').sum()
control_df['total'] = control_df['fem'] + control_df['masc']
control_df = control_df[control_df['total'] > 10] 
control_df['fem_frac'] = control_df['fem'] / (control_df['fem'] + control_df['masc'])
control_df = control_df.sort_values(by=['fem'], ascending = False)
control_df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mom,22693,764,200,23457,0.967430
wife,22389,165,18,22554,0.992684
girl,22147,269,29,22416,0.988000
woman,17238,131,56,17369,0.992458
mother,13385,107,53,13492,0.992069
...,...,...,...,...,...
composer,0,29,7,29,0.000000
common man,0,34,1,34,0.000000
colonel,0,25,0,25,0.000000
college kid,0,13,1,13,0.000000


In [57]:
merged_df = df.merge(control_df, how='inner', left_index=True, right_index=True)
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
girl,308205,2567,226,310772,0.991740,22147,269,29,22416,0.988000
woman,248514,1024,838,249538,0.995896,17238,131,56,17369,0.992458
wife,74069,576,55,74645,0.992283,22389,165,18,22554,0.992684
mother,31271,275,139,31546,0.991283,13385,107,53,13492,0.992069
mom,23830,893,186,24723,0.963880,22693,764,200,23457,0.967430
...,...,...,...,...,...,...,...,...,...,...
layman,0,11,0,11,0.000000,1,11,0,12,0.083333
first man,0,69,0,69,0.000000,0,31,0,31,0.000000
physicist,0,29,1,29,0.000000,1,22,2,23,0.043478
tall man,0,59,1,59,0.000000,0,17,0,17,0.000000


In [59]:
merged_df['difference'] = (merged_df['fem_frac_x'] - merged_df['fem_frac_y']).abs()
merged_df = merged_df.sort_values(by=['difference'], ascending = False)

In [61]:
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y,difference
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
brat,21,5,1,26,0.807692,2,10,1,12,0.166667,0.641026
expert,68,35,25,103,0.660194,8,48,31,56,0.142857,0.517337
band,15,2,382,17,0.882353,8,13,1799,21,0.380952,0.501401
cunt,789,205,60,994,0.793763,35,77,12,112,0.312500,0.481263
sitter,11,3,1,14,0.785714,7,14,0,21,0.333333,0.452381
...,...,...,...,...,...,...,...,...,...,...,...
trans woman,131,0,1,131,1.000000,73,0,4,73,1.000000,0.000000
grown woman,120,0,5,120,1.000000,32,0,0,32,1.000000,0.000000
own daughter,112,0,0,112,1.000000,48,0,0,48,1.000000,0.000000
great woman,103,0,0,103,1.000000,12,0,0,12,1.000000,0.000000


### Pronoun sparsity

In [84]:
df = pd.read_csv('pronoun_df.csv')
# df = df.groupby('word').sum()
# df.shape[0]

# total vocab words that show up in reddit with masc/fem/neut pronouns is 6373

df_totals = df.groupby('word').sum()
# df_totals['total'] = df_totals['fem'] + df_totals['masc'] 
df_totals['total'] = df_totals['fem'] + df_totals['masc'] + df_totals['neut']

df_totals['neut_frac'] = df_totals['neut'] / (df_totals['fem'] + df_totals['masc'] + df_totals['neut'])
df_totals 

df_neut = df_totals[df_totals['neut_frac'] >= 0.5].sort_values(by = ['neut_frac'], ascending = False)
# df_neut.head(20)
df_neut

# df_sparse = df_totals[df_totals['total'] <= 10]
# df_sparse



# 4300 words have less than 10 occurrences with masc or fem pronouns => ~70%
# 1438 words have less than 10 occurrences with masc or fem or neut pronouns => ~23%

# 3716 words are mostly "they" words => ~60%



Unnamed: 0_level_0,fem,masc,neut,total,neut_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zombies,0,0,97,97,1.0
monarchs,0,0,18,18,1.0
mobile users,0,0,1,1,1.0
mockingbird,0,0,1,1,1.0
moderate feminists,0,0,86,86,1.0
...,...,...,...,...,...
walking wallet,2,0,2,4,0.5
total loser,0,2,2,4,0.5
changs,0,2,2,4,0.5
chick magnet,1,0,1,2,0.5


### Evaluation 

Comparing booknlp coref vs. spacy coref

In [212]:
gold_masc = set()
gold_fem = set()
with open(ROOT + 'logs/gender_gold_labels.csv', 'r') as infile: 
    reader = csv.DictReader(infile)
    for row in reader: 
        if row['gendered?'] == 'm':
            gold_masc.add(row['word (singular)'].lower())
        if row['gendered?'] == 'f': 
            gold_fem.add(row['word (singular)'].lower())

In [213]:
david_labels = Counter()
with open(ROOT + 'logs/temp_gender.txt', 'r') as infile: 
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader: 
        if row['proper'] != 'nom': continue
        if (float(row['he/him/his']) + float(row['she/her'])) < 3: continue
        david_labels[row['term']] = float(row['she/her']) / (float(row['he/him/his']) + float(row['she/her']))

In [214]:
df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')

In [215]:
df = df.groupby('word').sum()
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df[['fem_frac']].dropna()
df = df.to_dict()

In [216]:
spacy_labels = df['fem_frac']

In [219]:
# average score for m words
spacy_scores = []
david_scores = []
for w in gold_masc: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("masc words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

masc words
SPACY: 0.20069466570475966 BOOKNLP: 0.31480068170482933


In [221]:
# average score for f words
spacy_scores = []
david_scores = []
for w in gold_fem: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("fem words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

fem words
SPACY: 0.8515987971507715 BOOKNLP: 0.7529589257733309
