In [1]:
from collections import defaultdict, Counter
from tqdm import tqdm
import csv
import os
import pandas as pd
import numpy as np
import random
import nltk
from nltk.stem import PorterStemmer
import inflect
import json

# Gender inference

Many of the conversations in the manosphere focus on relationships between men and women. 

In [2]:
ROOT = '/mnt/data0/lucy/manosphere/'
ANN_FILE = ROOT + 'data/ann_sig_entities.csv'
COREF_RESULTS = ROOT + 'logs/coref_results/'

In [3]:
def load_vocabulary(): 
    words = []
    with open(ANN_FILE, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader: 
            if row['keep'] == 'Y': 
                words.append(row['entity'].lower())
    return words

Check that categories are all neat and clean with no NaNs. 

In [4]:
reddit_df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')
forum_df = pd.read_csv(COREF_RESULTS + 'coref_forum_df.csv')
cats = set(reddit_df.community.unique()) | set(forum_df.community.unique())
assert len(cats) == len(set(reddit_df.community.unique())) + len(set(forum_df.community.unique()))
df = pd.concat([reddit_df, forum_df])
control_df = pd.read_csv(COREF_RESULTS + 'coref_CONTROL_df.csv')
all_df = pd.concat([reddit_df, forum_df, control_df])
cats = set(all_df.community.unique())
print(cats)

{'the_attraction', 'Femcels', 'mgtow', 'red_pill_talk', 'MGTOW', 'pua_forum', 'incels', 'PUA', 'rooshv', 'avfm', 'MRA', 'Incels', 'CONTROL', 'TRP', 'FDS'}


In [5]:
# baseline rate
fem_total = sum(df['fem'].to_list())
masc_total = sum(df['masc'].to_list())
print("% fem:", fem_total / (fem_total + masc_total))
print("% masc:", masc_total / (fem_total + masc_total))

% fem: 0.5852356166540743
% masc: 0.4147643833459257


### Word coverage

We want to get a sense of how many words are gendered. Words that are not gendered likely don't show up often enough to matter much, but there is a long tail that could be important to consider. 

First, we get a sense of how many words actually have coref labels (e.g. more than 20 labels): 

In [6]:
def get_dataframe_coverage(this_df, cutoff, marked_set): 
    '''
    @inputs: 
    - this_df: input dataframe
    - cutoff: int indicating frequency cutoff
    - marked_set: set of words gendered by definition
    @outputs: 
    - missing: words missing masc/fem gender signal
    - solid_labels: words with gender coref fem/masc signal
    '''
    totals = this_df.groupby('word').sum()
    totals['total'] = totals['fem'] + totals['masc']
    totals = totals[totals['total'] > cutoff]
    totals = totals[totals['total'] > totals['they']] # exclude mostly plural words
    totals.sort_values(by=['total'])
    solid_labels = set(totals.index.to_list())
    vocab = load_vocabulary()
    missing = set(vocab) - solid_labels - marked_set
    print("NO GENDER SIGNAL:", len(missing) / len(vocab))
    print("COREF SIGNAL:",len(solid_labels)/ len(vocab))
    print("MARKED SIGNAL:", len(marked_set) / len(vocab))
    print()
    return missing, solid_labels

In [7]:
cutoff = 10
mano_missing, mano_solid_labels = get_dataframe_coverage(df, cutoff, set())
control_missing, control_solid_labels = get_dataframe_coverage(control_df, cutoff, set())

NO GENDER SIGNAL: 0.6621805534553152
COREF SIGNAL: 0.3378194465446847
MARKED SIGNAL: 0.0

NO GENDER SIGNAL: 0.8173295024950854
COREF SIGNAL: 0.18267049750491457
MARKED SIGNAL: 0.0



### Gender Labeling Steps

Output is a dictionary of terms to labels

In [8]:
mano_gender_labels = {} # {term : fem frac}
control_gender_labels = {} # {term : fem frac}

The number of words without coref signals is pretty low! It seems like we need **_multiple ways_** to infer the gender of an entity. 

**Step 1**: semantically gendered nouns, or nouns gendered by definition

We leave out nouns that are socially gendered, e.g. "nurse". We use singular and plural forms. 

Hoyle et al. (2019) - man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, woman, women, girl, girls, mother, mothers, daughter, daughters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses

Additional - male, males, dude, dudes, guy, guys, boyfriend, boyfriends, bf, female, females, chick, chicks, girlfriend, girlfriends, gf, gal, gals, bro, transmen, transwomen, she, he

In [9]:
men_markers = 'man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, '
men_markers += 'male, males, dude, dudes, boyfriend, boyfriends, bf, guy, guys, bro, transmen, he'
men_markers = set(men_markers.split(', '))
women_markers = 'woman, women, girl, girls, mother, mothers, daughter, daugheters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses, '
women_markers += 'female, females, chick, chicks, girlfriend, girlfriends, gf, gal, gals, transwomen, she'
women_markers = set(women_markers.split(', '))
print(men_markers)
print(women_markers)

{'uncle', 'actors', 'boyfriends', 'brother', 'wizards', 'bf', 'husbands', 'steward', 'duke', 'princes', 'male', 'god', 'sons', 'heros', 'guy', 'son', 'wizard', 'boy', 'bro', 'males', 'policeman', 'bothers', 'postman', 'lord', 'actor', 'nephews', 'king', 'uncles', 'boys', 'hero', 'guys', 'waiter', 'nephew', 'fathers', 'dukes', 'boyfriend', 'dudes', 'prince', 'policemen', 'kings', 'gods', 'stewards', 'he', 'knight', 'dude', 'father', 'emperor', 'men', 'husband', 'transmen', 'waiters', 'emperors', 'knights', 'lords', 'man', 'postmen'}
{'chicks', 'females', 'goddess', 'princesses', 'sisters', 'duchess', 'women', 'mother', 'postwomen', 'chick', 'lady', 'stewardess', 'dame', 'princess', 'duchesses', 'gal', 'witches', 'female', 'waitress', 'wife', 'empress', 'queen', 'sister', 'policewoman', 'girlfriend', 'woman', 'heroines', 'empresses', 'she', 'policewomen', 'daughter', 'transwomen', 'niece', 'actress', 'aunts', 'girls', 'waitresses', 'ladies', 'witch', 'goddesses', 'queens', 'actresses', '

In [10]:
marked_vocab_men = set()
marked_vocab_women = set()
vocab = load_vocabulary() 
for w in vocab: 
    w_tokens = set(w.split())
    if w_tokens & men_markers: 
        marked_vocab_men.add(w)
        mano_gender_labels[w] = 0 # masc
        control_gender_labels[w] = 0
    elif w_tokens & women_markers:
        marked_vocab_women.add(w)
        mano_gender_labels[w] = 1 # fem
        control_gender_labels[w] = 1
marked_vocab = marked_vocab_men | marked_vocab_women
print("Count, fraction of vocab, examples")
print()
print("marked men:".upper())
print(len(marked_vocab_men), round(len(marked_vocab_men) / len(vocab), 3)) 
print(random.sample(marked_vocab_men, 10))
print()
print("marked women:".upper())
print(len(marked_vocab_women), round(len(marked_vocab_women) / len(vocab), 3)) 
print(random.sample(marked_vocab_women, 10))

Count, fraction of vocab, examples

MARKED MEN:
967 0.146
['incel man', '3 guys', '50 men', 'russian men', 'sub8 men', 'middle man', 'smart man', 'confident man', 'funny man', '2 men']

MARKED WOMEN:
1013 0.153
['female leaders', 'woman president', 'big girl', 'female rapist', 'aunts', 'mexican women', 'girlfriends', 'great wife', 'spanish women', 'younger woman']


**Step 2:** coreference resolution for singular nouns 

First, we filter the dataframes to words that are not explicitly marked, and we again calculate coverage.

In [11]:
unmarked_df = df[~df['word'].isin(marked_vocab)]
unmarked_control_df = control_df[~control_df['word'].isin(marked_vocab)]
unmarked_all_df = pd.concat([unmarked_df, unmarked_control_df])

In [12]:
mano_missing, mano_solid_label = get_dataframe_coverage(unmarked_df, cutoff, marked_vocab)
control_missing, control_solid_label = get_dataframe_coverage(unmarked_control_df, cutoff, marked_vocab)

NO GENDER SIGNAL: 0.49130500529260546
COREF SIGNAL: 0.20928474217450477
MARKED SIGNAL: 0.2994102525328898

NO GENDER SIGNAL: 0.5781037350672917
COREF SIGNAL: 0.12248601239981854
MARKED SIGNAL: 0.2994102525328898



We include words that are "covered" in the `gender_labels` dict. 

In [13]:
def update_gender_labels(df, gender_labels): 
    word_totals = df.groupby('word').sum()
    word_totals['total'] = word_totals['fem'] + word_totals['masc']
    word_totals = word_totals[word_totals['total'] > cutoff]
    word_totals = word_totals[word_totals['total'] > word_totals['they']]
    word_totals['fem frac'] = word_totals['fem'] / (word_totals['fem'] + word_totals['masc'])
    gender_labels_to_add = word_totals['fem frac'].to_dict()
    gender_labels.update(gender_labels_to_add)
    return gender_labels

In [14]:
mano_gender_labels = update_gender_labels(unmarked_df, mano_gender_labels)
control_gender_labels = update_gender_labels(unmarked_control_df, control_gender_labels)

**Step 3**: plural nouns take on gender of singular nouns

Using python inflect reduces no gender signal by ~10% - a good start!

In [15]:
mano_found_plural = set()
p = inflect.engine()

for w in mano_missing: 
    if p.singular_noun(w) is not False:
        if p.singular_noun(w) in mano_gender_labels:
            mano_gender_labels[w] = mano_gender_labels[p.singular_noun(w)]
            mano_found_plural.add(w)

control_found_plural = set()
for w in control_missing: 
    if p.singular_noun(w) is not False:
        if p.singular_noun(w) in control_gender_labels:
            control_gender_labels[w] = control_gender_labels[p.singular_noun(w)]
            control_found_plural.add(w)

#### Just testing out inflect

In [16]:
p = inflect.engine()
for w in mano_missing: 
    if p.singular_noun(w) is not False:
        print("The singular of ", w, " is ", p.singular_noun(w))

The singular of  widows  is  widow
The singular of  serial rapists  is  serial rapist
The singular of  bad people  is  bad person
The singular of  main characters  is  main character
The singular of  ladyboys  is  ladyboy
The singular of  sexists  is  sexist
The singular of  drug companies  is  drug company
The singular of  cunts  is  cunt
The singular of  other members  is  other member
The singular of  ministers  is  minister
The singular of  suspects  is  suspect
The singular of  breeders  is  breeder
The singular of  jackass  is  jackas
The singular of  musicians  is  musician
The singular of  three people  is  three person
The singular of  other victims  is  other victim
The singular of  semites  is  semite
The singular of  judges  is  judge
The singular of  hard workers  is  hard worker
The singular of  older friends  is  older friend
The singular of  eunuchs  is  eunuch
The singular of  lobby groups  is  lobby group
The singular of  victors  is  victor
The singular of  neurotypi

The singular of  loved ones  is  loved one
The singular of  younger brothers  is  younger brother
The singular of  hostesses  is  hostess
The singular of  catholic priests  is  catholic priest
The singular of  poor kids  is  poor kid
The singular of  introverts  is  introvert
The singular of  sales people  is  sales person
The singular of  legislators  is  legislator
The singular of  norwegians  is  norwegian
The singular of  execs  is  exec
The singular of  opponents  is  opponent
The singular of  trad cons  is  trad con
The singular of  happy people  is  happy person
The singular of  risk takers  is  risk taker
The singular of  brazilians  is  brazilian
The singular of  drunks  is  drunk
The singular of  westerners  is  westerner
The singular of  aussies  is  aussy
The singular of  smart ass  is  smart as
The singular of  tramps  is  tramp
The singular of  builders  is  builder
The singular of  posers  is  poser
The singular of  single people  is  single person
The singular of  innoc

The singular of  poor souls  is  poor soul
The singular of  everyday people  is  everyday person
The singular of  prosecutors  is  prosecutor
The singular of  shitty ones  is  shitty one
The singular of  adulterers  is  adulterer
The singular of  2 people  is  2 person
The singular of  trps  is  trp
The singular of  diplomats  is  diplomat
The singular of  romanians  is  romanian
The singular of  normies  is  normy
The singular of  mercenaries  is  mercenary
The singular of  thirsty betas  is  thirsty beta
The singular of  exes  is  ex
The singular of  morons  is  moron
The singular of  three kids  is  three kid
The singular of  other losers  is  other loser
The singular of  home dads  is  home dad
The singular of  ethniks  is  ethnik
The singular of  most kids  is  most kid
The singular of  actual rapists  is  actual rapist
The singular of  two parents  is  two parent
The singular of  chiefs  is  chief
The singular of  few members  is  few member
The singular of  propagandists  is  pr

The singular of  other minorities  is  other minority
The singular of  many betas  is  many beta
The singular of  rockstars  is  rockstar
The singular of  other persons  is  other person
The singular of  university students  is  university student
The singular of  cosplayers  is  cosplayer
The singular of  donors  is  donor
The singular of  spoiled brats  is  spoiled brat
The singular of  swedes  is  swede
The singular of  sloots  is  sloot
The singular of  perpetual victims  is  perpetual victim
The singular of  pals  is  pal
The singular of  white supremacists  is  white supremacist
The singular of  bps  is  bp
The singular of  locals  is  local
The singular of  better people  is  better person
The singular of  5 children  is  5 child
The singular of  actual people  is  actual person
The singular of  hypocrites  is  hypocrite
The singular of  miners  is  miner
The singular of  cretins  is  cretin
The singular of  americans  is  american
The singular of  mds  is  md
The singular of  f

The singular of  third parties  is  third party
The singular of  spoiled children  is  spoiled child
The singular of  chad friends  is  chad friend
The singular of  toxic people  is  toxic person
The singular of  families  is  family
The singular of  most teachers  is  most teacher
The singular of  right wingers  is  right winger
The singular of  cashiers  is  cashier
The singular of  four children  is  four child
The singular of  soccer players  is  soccer player
The singular of  young couples  is  young couple
The singular of  beckies  is  becky
The singular of  murderers  is  murderer
The singular of  only cucks  is  only cuck
The singular of  pedos  is  pedo
The singular of  mates  is  mate
The singular of  personal trainers  is  personal trainer
The singular of  fangirls  is  fangirl
The singular of  two brothers  is  two brother
The singular of  black americans  is  black american
The singular of  companions  is  companion
The singular of  bad bitches  is  bad bitch
The singular 

Recalculate coverage after including plural matches. 

In [17]:
mano_found_total = mano_found_plural | marked_vocab
mano_still_missing = set(vocab) - mano_solid_labels - mano_found_total

print(len(mano_still_missing))
print("NO GENDER SIGNAL:", len(mano_still_missing) / len(vocab))
print(random.sample(mano_still_missing, 50))
print("PLURAL->SINGULAR PERCENTAGE:", round(len(mano_found_plural) / len(vocab),4))
print()

control_found_total = control_found_plural | marked_vocab
control_still_missing = set(vocab) - control_solid_labels - control_found_total
print(len(control_still_missing))
print("NO GENDER SIGNAL:", len(control_still_missing) / len(vocab))
print(random.sample(control_still_missing, 50))

2417
NO GENDER SIGNAL: 0.3654922123090882
['gaming community', '2 children', 'outlier', 'serb', 'child abusers', 'perfect person', 'barely anyone', 'entire school', 'couples', 'turbo manlet', 'many parents', 'normal human', 'middle easterners', 'fucking liar', 'nan', 'two kids', 'drug user', 'educators', 'anyone', 'goys', 'spotter', 'attention seekers', 'new subscribers', '5 friends', 'servicemen', 'fb friends', 'cool friends', 'intimate partners', 'participant', 'many groups', 'single gender', 'higher ups', 'libertarians', 'taxpayer', 'black population', 'mercenaries', 'other party', 'neo nazi', 'ethnicels', 'transgender people', 'little group', 'afghans', 'islamists', 'total population', 'heretics', 'white one', 'leper', 'transpeople', 'fucking morons', 'keyboard warriors']
PLURAL->SINGULAR PERCENTAGE: 0.1258

3291
NO GENDER SIGNAL: 0.49765613186148494
['concern troll', 'rvf members', 'jocks', 'rich beta', 'endorsed contributors', 'frat bros', 'airhead', 'hb8', 'living beings', 'simp

In [18]:
for w in mano_still_missing:
    if p.singular_noun(w) in mano_still_missing and p.plural_noun(w) in mano_still_missing:
        print(p.singular_noun(w), p.plural_noun(w))

quebecois quebecois
human species human species
offspring offspring


**Step 4:** bigrams take on unigram gender if modifier does not change semantic gender

First, can take a glance at what unigrams tend to be commonly missing.

See if the main unigram of a bigram is in solid labels or marked vocab. 

In [19]:
mano_found_unigram = set()
control_found_unigram = set()

common_missing = Counter()
even_now_still_missing = Counter()
for w in mano_still_missing: 
    tokens = w.split()
    if len(tokens) > 1: 
        unigram = tokens[-1]
        common_missing[unigram] += 1
        if unigram in mano_gender_labels:
            mano_gender_labels[w] = mano_gender_labels[unigram]
            mano_found_unigram.add(w)
        else: 
            even_now_still_missing[unigram] += 1
            
for w in control_still_missing:
    tokens = w.split()
    if len(tokens) > 1: 
        unigram = tokens[-1]
        if unigram in control_gender_labels: 
            control_gender_labels[w] = control_gender_labels[unigram]
            control_found_unigram.add(w)    
            
print("UNIGRAMS THAT HAVE LABELS:", len(mano_found_unigram), round(len(mano_found_unigram)/len(vocab), 3))
print()
# what unigrams were missing?
for tup in common_missing.most_common(20): 
    print(tup[0], tup[1], round(tup[1]/len(vocab), 3))
print()
# what unigrams will always be missing no matter what we do (with the methods so far)?
for tup in even_now_still_missing.most_common(20): 
    print(tup[0], tup[1], round(tup[1]/len(vocab), 3))

UNIGRAMS THAT HAVE LABELS: 601 0.091

people 138 0.021
person 45 0.007
group 36 0.005
friends 32 0.005
family 28 0.004
feminists 25 0.004
community 24 0.004
children 22 0.003
ones 21 0.003
kids 21 0.003
parents 19 0.003
population 17 0.003
one 16 0.002
victims 15 0.002
groups 14 0.002
here 14 0.002
members 13 0.002
generation 13 0.002
asshole 11 0.002
feminist 10 0.002

group 36 0.005
family 28 0.004
community 24 0.004
ones 21 0.003
parents 19 0.003
population 17 0.003
one 16 0.002
groups 14 0.002
here 14 0.002
members 13 0.002
generation 13 0.002
pussy 10 0.002
couples 10 0.002
else 9 0.001
users 9 0.001
folks 9 0.001
couple 9 0.001
team 9 0.001
americans 9 0.001
class 8 0.001


Recalculate coverage after including unigram matches.

In [20]:
mano_still_missing = set(vocab) - set(mano_gender_labels.keys())
print("NO GENDER SIGNAL:", len(mano_still_missing) / len(vocab))
print(random.sample(mano_still_missing, 500))
print(len(mano_still_missing))

control_still_missing = set(vocab) - set(control_gender_labels.keys())
print("NO GENDER SIGNAL:", len(control_still_missing) / len(vocab))
print(random.sample(control_still_missing, 500))
print(len(control_still_missing))

NO GENDER SIGNAL: 0.27461061545440796
['single parenthood', 'abusive parents', 'child services', 'greats', 'trainee', 'devs', 'those interested', 'protected class', 'stupid one', 'bub', 'dominicans', 'israelis', 'high schoolers', 'consumers', 'whole family', 'heterosexuals', 'one couple', 'dissenters', 'live band', 'web developer', 'high schooler', 'decision maker', 'team', 'potential dates', 'shitty ones', 'royals', 'opposite gender', 'real ones', 'weenie', 'leftists', 'those accused', 'enemies', 'mods here', 'college graduates', 'womyns', 'everbody', 'attention seekers', 'lucky few', 'silent majority', 'advertiser', 'cuckqueers', 'aggressors', 'shitposter', 'broski', 'stunner', 'aztec', 'feminist trolls', 'wrongly accused', 'naturals', 'roomies', 'other candidates', 'spergs', 'responder', 'traditionalist', 'hardly anyone', 'one family', 'sex havers', 'armenians', 'shrink', 'gypsies', 'mra sub', 'spammers', 'most ppl', 'orbiter', 'other parents', 'clan', 'investor', 'beneficiaries', '

Save the jsons into files. 

Since we're doing this in a jupyter notebook, it's very important to run cells in order and avoid repeating them!! 

In [21]:
with open(COREF_RESULTS + 'mano_gender_labels.json', 'w') as outfile: 
    json.dump(mano_gender_labels, outfile)
with open(COREF_RESULTS + 'control_gender_labels.json', 'w') as outfile: 
    json.dump(control_gender_labels, outfile)

Divya said that it doesn't seem like modifiers change the gender of already-gendered unigrams. 

In [31]:
common_modifiers = Counter()
for w in mano_still_missing: 
    tokens = w.split()
    if len(tokens) > 1: 
        modifier = tokens[0]
        common_modifiers[modifier] += 1
for tup in common_modifiers.most_common(): 
    print(tup[0], tup[1], tup[1]/len(vocab))

other 26 0.003931649780734916
most 13 0.001965824890367458
good 11 0.0016633902918493876
white 9 0.0013609556933313171
many 9 0.0013609556933313171
entire 8 0.0012097383940722819
one 6 0.0009073037955542114
new 6 0.0009073037955542114
someone 6 0.0009073037955542114
social 6 0.0009073037955542114
whole 6 0.0009073037955542114
big 5 0.0007560864962951762
black 5 0.0007560864962951762
fucking 5 0.0007560864962951762
old 5 0.0007560864962951762
sex 5 0.0007560864962951762
middle 5 0.0007560864962951762
feminist 5 0.0007560864962951762
great 5 0.0007560864962951762
everyone 4 0.0006048691970361409
older 4 0.0006048691970361409
average 4 0.0006048691970361409
keyboard 4 0.0006048691970361409
college 4 0.0006048691970361409
sexual 4 0.0006048691970361409
two 3 0.0004536518977771057
those 3 0.0004536518977771057
illegal 3 0.0004536518977771057
total 3 0.0004536518977771057
nobody 3 0.0004536518977771057
potential 3 0.0004536518977771057
real 3 0.0004536518977771057
stupid 3 0.0004536518977771

### Popular fem words in manosphere++

In [24]:
def show_top_fem(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_fem = cat_totals[cat_totals['fem_frac'] == 1]
    return cat_fem.sort_values(by=['total'])

In [25]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem(cat, unmarked_all_df).head())

------the_attraction-------
             year  fem  masc  they  it  you  total  fem_frac
word                                                        
duff         6021   11     0     0   1    0     11       1.0
escort      16083   11     0     1   2    0     11       1.0
playmate    14070   12     0     0   0    0     12       1.0
hot blonde  12064   12     0     0   1    0     12       1.0
stacy       10040   12     0     0   0    1     12       1.0
------CONTROL-------
                 year  fem  masc  they    it  you  total  fem_frac
word                                                              
exgf            12085   11     0     0     3    0     11       1.0
home mom        12095   12     0     2     0    0     12       1.0
married couple  18135   12     0    48     1    0     12       1.0
supermodel      14108   12     0     0     5    0     12       1.0
community       28175   15     0   633  2782    1     15       1.0
------rooshv-------
                year  fem  masc  th

### Popular masc words in reddit

In [379]:
def show_top_masc(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['masc_frac'] = cat_totals['masc'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_masc = cat_totals[cat_totals['masc_frac'] == 1]
    return cat_masc.sort_values(by=['total'])

In [380]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_masc(cat, unmarked_df).head())

------Femcels-------
Empty DataFrame
Columns: [year, fem, masc, they, it, you, total, masc_frac]
Index: []
------the_attraction-------
             year  fem  masc  they  it  you  total  masc_frac
word                                                         
cabbie      10040    0    11     0   1    0     11        1.0
hitman      10047    0    11     0   0    1     11        1.0
one kid     16079    0    13     0   0    0     13        1.0
best buddy  18095    0    14     0   0    0     14        1.0
captain     16078    0    15     0   0    0     15        1.0
------TRP-------
                    year  fem  masc  they  it  you  total  masc_frac
word                                                                
beginner           14112    0    11     2  13    0     11        1.0
physicist          12099    0    11     0   1    0     11        1.0
mailman            12093    0    11     0   2    0     11        1.0
college professor  12099    0    11     0   0    0     11        1.0


### Popular neut words in reddit

In [381]:
def show_top_neut(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_neut = cat_totals[cat_totals.fem_frac.between(0.48, 0.52)]
    return cat_neut.sort_values(by=['total'], ascending = False)

In [382]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_neut(cat, unmarked_df).head())

------Femcels-------
            year  fem  masc  they  it  you  total  fem_frac
word                                                       
partner     4037   58    58    33  31    0    116  0.500000
child       4037   30    32     3  16    0     62  0.483871
teacher     4037   26    25     0   0    0     51  0.509804
looksmatch  4037   12    13     1  12    0     25  0.480000
classmate   4037    6     6     0   0    0     12  0.500000
------the_attraction-------
              year  fem  masc  they  it  you  total  fem_frac
word                                                         
cousin       28161  247   239     2   5    4    486  0.508230
friend here  16068   28    28     1   0    0     56  0.500000
colleague    22122   25    26     0   0    0     51  0.490196
caveman      18090   11    11     0   9    0     22  0.500000
counselor    18090    9     9     0   0    0     18  0.500000
------TRP-------
           year  fem  masc  they   it  you  total  fem_frac
word                

### Words with "it" 

In [383]:
df.sort_values(by=['it'], ascending = False).head(15)

Unnamed: 0,year,community,word,fem,masc,they,it,you
27024,2018,MGTOW,mgtow,354,657,431,3230,6
45598,2019,MGTOW,mgtow,309,443,365,2513,2
33362,2017,MGTOW,mgtow,236,484,327,2164,2
26988,2018,Incels,incels,140,179,3019,1097,4
15892,2016,MGTOW,mgtow,82,199,170,989,3
27124,2018,Incels,incel,170,1167,153,947,1
1722,2013,MRA,mrm,15,22,54,933,0
52965,2015,mgtow,mgtow,93,189,123,887,4
49822,2017,mgtow,mgtow,89,198,127,878,5
50430,2016,mgtow,mgtow,58,151,93,767,1


### Gender over time 

These results suggest that it is precarious to look at gender over time, e.g. pronoun sparsity doesn't seem to allow us to do that robustly. 

In [347]:
# These are the words whose % fem ranges over time are the largest,
# maybe top three words with biggest % range in each community, e.g. “cat	10% - 50%”, only calculate fraction 
# if there are more than 10 occurrences in each community and month. 

def show_top_fem_range(cat):
    # filtering for the argument category
    cat_df = df[df.community == cat]
    totals = cat_df.groupby(['year', 'word'], as_index = False).sum()
    totals['total'] = totals['fem'] + totals['masc']
    totals = totals[totals['total'] > 10]
    totals['fem_frac'] = totals['fem'] / (totals['fem'] + totals['masc'])
    
    # filter for words that show up in more than 1 of the months/time periods 
    # (initially picked 85 to be more than half of all the months but idk if needed)
    is_multi = totals["word"].value_counts() > 1
    filtered = totals[totals["word"].isin(is_multi[is_multi].index)]
    
    # get the max and min fem_frac for each word
    word_keys = filtered['word'].unique().tolist()
    max_fems = []
    min_fems = []
    max_months = []
    min_months = []
    for word in word_keys: 
        df_subset = filtered[filtered['word'] == word]
        max_fem = df_subset['fem_frac'].max()
        min_fem = df_subset['fem_frac'].min()
        max_month = df_subset[df_subset['fem_frac'] == max_fem]['year'].max()
        min_month = df_subset[df_subset['fem_frac'] == min_fem]['year'].min()
        
        max_fems.append(max_fem)
        min_fems.append(min_fem)
        max_months.append(max_month)
        min_months.append(min_month)
    
    
    d = {'word': [], 'min month': [], 'min': [], 'max month': [], 'max':[], 'diff': []}
    for i in range(len(word_keys)):
        d['word'].append(word_keys[i])
        d['min month'].append(min_months[i])
        d['min'].append(min_fems[i])
        d['max month'].append(max_months[i])
        d['max'].append(max_fems[i])
        d['diff'].append(max_fems[i] - min_fems[i])
    
    diffs = pd.DataFrame(data=d)
    return diffs.sort_values(by = ['diff'], ascending = False)


In [348]:
# words in each month in Incels that appear more than 10 times in that month
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem_range(cat).head())

------Femcels-------
       word  min month       min  max month       max      diff
11   doctor       2018  0.153846       2019  0.392857  0.239011
35  partner       2018  0.320000       2019  0.549451  0.229451
15   female       2018  0.705882       2019  0.894737  0.188854
17  femcels       2018  0.312500       2019  0.461538  0.149038
37      sis       2019  0.606061       2018  0.733333  0.127273
------the_attraction-------
           word  min month       min  max month       max      diff
136      waiter       2007  0.071429       2008  0.562500  0.491071
22      asshole       2006  0.421053       2010  0.909091  0.488038
147  one friend       2008  0.166667       2009  0.615385  0.448718
128    stranger       2006  0.285714       2009  0.727273  0.441558
64          gal       2006  0.500000       2007  0.923077  0.423077
------TRP-------
            word  min month       min  max month       max      diff
398       spouse       2019  0.200000       2016  0.816901  0.616901
442 

### Gender differences

Lucy hasn't edited this section

In [54]:
# len(df.word.unique()
df = df.groupby('word').sum()
df['total'] = df['fem'] + df['masc']
df = df[df['total'] > 10] 
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df.sort_values(by=['fem'], ascending = False)
df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
girl,308205,2567,226,310772,0.991740
woman,248514,1024,838,249538,0.995896
wife,74069,576,55,74645,0.992283
mother,31271,275,139,31546,0.991283
mom,23830,893,186,24723,0.963880
...,...,...,...,...,...
physicist,0,29,1,29,0.000000
tall man,0,59,1,59,0.000000
soccer player,0,11,0,11,0.000000
great leader,0,12,0,12,0.000000


In [56]:
control_df = control_df.groupby('word').sum()
control_df['total'] = control_df['fem'] + control_df['masc']
control_df = control_df[control_df['total'] > 10] 
control_df['fem_frac'] = control_df['fem'] / (control_df['fem'] + control_df['masc'])
control_df = control_df.sort_values(by=['fem'], ascending = False)
control_df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mom,22693,764,200,23457,0.967430
wife,22389,165,18,22554,0.992684
girl,22147,269,29,22416,0.988000
woman,17238,131,56,17369,0.992458
mother,13385,107,53,13492,0.992069
...,...,...,...,...,...
composer,0,29,7,29,0.000000
common man,0,34,1,34,0.000000
colonel,0,25,0,25,0.000000
college kid,0,13,1,13,0.000000


In [57]:
merged_df = df.merge(control_df, how='inner', left_index=True, right_index=True)
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
girl,308205,2567,226,310772,0.991740,22147,269,29,22416,0.988000
woman,248514,1024,838,249538,0.995896,17238,131,56,17369,0.992458
wife,74069,576,55,74645,0.992283,22389,165,18,22554,0.992684
mother,31271,275,139,31546,0.991283,13385,107,53,13492,0.992069
mom,23830,893,186,24723,0.963880,22693,764,200,23457,0.967430
...,...,...,...,...,...,...,...,...,...,...
layman,0,11,0,11,0.000000,1,11,0,12,0.083333
first man,0,69,0,69,0.000000,0,31,0,31,0.000000
physicist,0,29,1,29,0.000000,1,22,2,23,0.043478
tall man,0,59,1,59,0.000000,0,17,0,17,0.000000


In [59]:
merged_df['difference'] = (merged_df['fem_frac_x'] - merged_df['fem_frac_y']).abs()
merged_df = merged_df.sort_values(by=['difference'], ascending = False)

In [61]:
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y,difference
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
brat,21,5,1,26,0.807692,2,10,1,12,0.166667,0.641026
expert,68,35,25,103,0.660194,8,48,31,56,0.142857,0.517337
band,15,2,382,17,0.882353,8,13,1799,21,0.380952,0.501401
cunt,789,205,60,994,0.793763,35,77,12,112,0.312500,0.481263
sitter,11,3,1,14,0.785714,7,14,0,21,0.333333,0.452381
...,...,...,...,...,...,...,...,...,...,...,...
trans woman,131,0,1,131,1.000000,73,0,4,73,1.000000,0.000000
grown woman,120,0,5,120,1.000000,32,0,0,32,1.000000,0.000000
own daughter,112,0,0,112,1.000000,48,0,0,48,1.000000,0.000000
great woman,103,0,0,103,1.000000,12,0,0,12,1.000000,0.000000


### Evaluation 

This code compares booknlp coref vs. spacy coref on hand-labeled data.

In [439]:
gold_masc = set()
gold_fem = set()
with open(ROOT + 'logs/gender_gold_labels.csv', 'r') as infile: 
    reader = csv.DictReader(infile)
    for row in reader: 
        if row['gendered?'] == 'm':
            gold_masc.add(row['word (singular)'].lower())
        if row['gendered?'] == 'f': 
            gold_fem.add(row['word (singular)'].lower())

In [440]:
david_labels = Counter()
with open(ROOT + 'logs/booknlp_gender.txt', 'r') as infile: 
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader: 
        if row['proper'] != 'nom': continue
        if (float(row['he/him/his']) + float(row['she/her'])) < 3: continue
        david_labels[row['term']] = float(row['she/her']) / (float(row['he/him/his']) + float(row['she/her']))

In [441]:
df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')

In [442]:
df = df.groupby('word').sum()
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df[['fem_frac']].dropna()
df = df.to_dict()

In [443]:
spacy_labels = df['fem_frac']

In [444]:
# average score for m words
spacy_scores = []
david_scores = []
for w in gold_masc: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("masc words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

masc words
SPACY: 0.1892641812506059 BOOKNLP: 0.32969645429077915


In [445]:
# average score for f words
spacy_scores = []
david_scores = []
for w in gold_fem: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("fem words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

fem words
SPACY: 0.8552746832971022 BOOKNLP: 0.7529589257733309
