In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Placeholder values: in the UI these two terms come from the drop-down selection.
subgroup1, subgroup2 = "woman", "man"

In [3]:
# Open the English C4 training split in streaming mode.
data = load_dataset("c4", "en", split="train", streaming=True)

In [4]:
# Number of leading examples to pull from the stream.
grab_n = 10_000
print('Note: Just taking the first %s instances.' % grab_n)
# Streaming dataset: keep only the first `grab_n` records.
data_head = data.take(grab_n)
# Tiny hand-made stand-in for quick experiments:
#data_head = [["there is a woman with a hairbrush"],["there is a woman with a hairbrush"],["there is a woman with a hairbrush"],["there is a man with a dog"],["there is a man with a dog"]]
df = pd.DataFrame(data_head, columns=["text"])
# For a non-streaming dataset, build the frame with:
#df = pd.json_normalize(data)

Note: Just taking the first 10000 instances.


In [5]:
def count_vocab_frequencies(df):
    """
    Count how often every vocabulary word occurs in the 'text' column of `df`.

    Side effects: `df` is modified in place. Its 'text' column is lower-cased
    and a 'tokenized' column (the list of word tokens for each row) is added.
    The per-row tokenization is kept so that word co-occurrences can be
    computed later for the nPMI calculation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings.

    Returns
    -------
    (sorted_term_freq_df, df)
        sorted_term_freq_df : DataFrame indexed by 'word' with a single
            'count' column, sorted from most to least frequent.
        df : the same (mutated) input frame.
    """
    # Lower-case once so the per-row tokens and the vocabulary agree.
    df['text'] = df['text'].str.lower()
    # Regex for pulling out single words (one-character words included).
    cvec = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b", lowercase=True)

    # Tokenize every row with the same tokenizer the vectorizer uses.
    sent_tokenizer = cvec.build_tokenizer()
    df['tokenized'] = df.text.apply(sent_tokenizer)

    # Learn the vocabulary and build the (rows x vocabulary) sparse count matrix.
    document_matrix = cvec.fit_transform(df.text)
    # Column sums of the sparse matrix give the total count per word without
    # ever densifying it; the result is a 1 x V matrix, flattened to length V.
    term_counts = np.asarray(document_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for installations older than 1.0.
    if hasattr(cvec, "get_feature_names_out"):
        vocab = cvec.get_feature_names_out()
    else:
        vocab = cvec.get_feature_names()

    # Now organize everything into the dataframes
    term_freq_df = pd.DataFrame({'count': term_counts},
                                index=pd.Index(vocab, name='word'))
    sorted_term_freq_df = term_freq_df.sort_values(by='count', ascending=False)
    return sorted_term_freq_df, df

In [6]:
term_df, df = count_vocab_frequencies(df)
# p(word): each word's share of all counted tokens.
# Note that multiple occurrences of a word in a sentence increase its probability.
# We may want to do something about that.
# Series.sum() is vectorized; the builtin sum() would iterate the column in Python.
term_df['proportion'] = term_df['count'] / term_df['count'].sum()
# Sanity check
print(term_df.head())
print(term_df.tail())

       count  proportion
word                    
the   186019    0.050628
and   107893    0.029365
to    103090    0.028058
of     89417    0.024336
a      81307    0.022129
             count    proportion
word                            
interestel       1  2.721674e-07
interethnic      1  2.721674e-07
interfaced       1  2.721674e-07
interfacing      1  2.721674e-07
𐌼𐌿𐌽𐌳𐍃            1  2.721674e-07


In [7]:
def get_PMI(df_coo, subgroup):
    """
    Compute log(p(subgroup|word) / p(subgroup)) for every word in `df_coo`.

        PMI(x;y) = h(y) - h(y|x)
                 = h(subgroup) - h(subgroup|word)
                 = log (p(subgroup|word) / p(subgroup))

    Relies on two module-level objects: `term_df` (per-word 'count' and
    'proportion') and `mlb` (whose `classes_` maps row positions to words).

    Parameters
    ----------
    df_coo : pandas.DataFrame
        Row labels are positions into `mlb.classes_`; values are expected to
        be co-occurrence counts with `subgroup` (as built by the caller).
    subgroup : str
        Word whose row in `term_df` supplies p(subgroup).

    Returns
    -------
    pandas.DataFrame
        Same columns as `df_coo`, indexed by word. A zero co-occurrence
        gives log(0) = -inf. Before normalizing, the values are expected to
        correlate with word frequency.
    """
    # p(subgroup)
    subgroup_row = term_df.loc[subgroup]
    subgroup_prob = subgroup_row['proportion']

    def _log_ratio(cooc_column):
        # Translate row positions into words via mlb.classes_, then look up
        # each word's overall count in term_df.
        words = mlb.classes_[cooc_column.index]
        word_counts = term_df.loc[words]['count']
        # p(subgroup|word) = co-occurrence count / word count
        p_subgroup_given_word = cooc_column.values / word_counts
        return np.log(p_subgroup_given_word / subgroup_prob)

    pmi_df = pd.DataFrame(df_coo.apply(_log_ratio))
    return pmi_df

In [8]:
# Dense binary indicator matrix (shape: # sentences x # vocabulary words):
# entry (i, j) is 1 when word j appears in sentence i and 0 otherwise, so
# repeated occurrences inside one sentence are not counted.
mlb = MultiLabelBinarizer()
df_mlb = pd.DataFrame(mlb.fit_transform(df['tokenized']))
df_pair = pd.DataFrame(columns=[subgroup1, subgroup2])
for subgroup in (subgroup1, subgroup2):
    # Column position of the subgroup word in the indicator matrix
    subgroup_matches = np.where(mlb.classes_ == subgroup)[0]
    if subgroup_matches.size == 0:
        # Fail with a readable message instead of an IndexError from [0][0]
        raise ValueError("Subgroup term %r does not occur in the tokenized data." % subgroup)
    subgroup_idx = subgroup_matches[0]
    # Indicator column for the subgroup (1 for sentences containing it)
    df_subgroup = df_mlb.iloc[:, subgroup_idx]
    # Create co-occurrence counts between the given subgroup and all words:
    # for each word, the number of sentences that contain both.
    # TODO: this also includes the subgroup word itself (it always co-occurs
    # with itself); that row should eventually be dropped or subtracted.
    df_coo = pd.DataFrame(df_mlb.T.dot(df_subgroup))
    pmi_df = get_PMI(df_coo, subgroup)
    df_pair[subgroup] = pmi_df

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
# Per-word bias score: PMI with subgroup1 minus PMI with subgroup2.
# Positive values lean towards subgroup1, negative towards subgroup2.
pmi_bias = pd.DataFrame(df_pair[subgroup1].sub(df_pair[subgroup2]))

In [10]:
# Words that co-occur with only one of the two subgroups score +inf / -inf
s1_only_words = pmi_bias[pmi_bias[0].values == np.inf]
s2_only_words = pmi_bias[pmi_bias[0].values == -np.inf]

# Keep only the strictly finite scores (NaN and +/-inf fail both comparisons),
# ordered from the most negative to the most positive.
pmi_bias_filtered = pmi_bias[
    pmi_bias[0].lt(np.inf) & pmi_bias[0].gt(-np.inf)
].sort_values(by=[0])

In [11]:
# How many words to list per subgroup
n = 50
print("Top %s most %s-biased words" % (n,subgroup2))
# The frame is sorted ascending, so its head holds the most negative
# (subgroup2-leaning) scores.
pmi_bias_filtered.head(n)

Top 50 most man-biased words


Unnamed: 0_level_0,0
word,Unnamed: 1_level_1
pure,-2.055189
foot,-2.055189
failed,-1.864134
decade,-1.692284
route,-1.692284
leg,-1.627745
minds,-1.627745
squad,-1.627745
league,-1.627745
gained,-1.558752


In [12]:
print("Top %s most %s-biased words" % (n,subgroup1))
# The frame is sorted ascending, so its tail holds the largest
# (subgroup1-leaning) scores; re-sort to list the strongest first.
# tail(n) is used instead of [-n:] so that n == 0 gives an empty table
# rather than the whole frame.
pmi_bias_filtered.tail(n).sort_values(by=[0], ascending=False)

Top 50 most woman-biased words


Unnamed: 0_level_0,0
word,Unnamed: 1_level_1
quit,2.872065
representation,2.689743
vitamins,2.689743
swedish,2.689743
complement,2.689743
flights,2.4666
mai,2.4666
childbirth,2.4666
miraculously,2.4666
alice,2.4666
