In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# This needs to be defined by the drop down
subgroup = "woman"

In [3]:
data = load_dataset("c4", "en", split= "train", streaming = True)

In [4]:
grab_n = 5000
# For streaming data
print('Note: Just taking the first %s instances.' % grab_n)
data_head = data.take(grab_n)
df = pd.DataFrame(data_head)
# If not streaming, use:
#df = pd.json_normalize(data)

Note: Just taking the first 5000 instances.


In [5]:
def count_vocab_frequencies(df):
    """
    Based on an input pandas DataFrame with a 'text' column, 
    this function will count the occurrences of all words
    with a frequency higher than 'cutoff' and will return another DataFrame
    with the rows corresponding to the different vocabulary words
    and the column to the count count of that word.
    """
    # Move this up as a constant in larger code.
    batch_size = 10
    
    # We do this to calculate per-word statistics
    df['text'] = df['text'].str.lower()
    # Regex for pulling out single words
    cvec = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b", lowercase=True)
    
    # We also do this because we need to have the tokenization per sentence 
    # so that we can look at co-occurrences of words across sentences for nPMI calculation
    sent_tokenizer = cvec.build_tokenizer()
    df['tokenized'] = df.text.apply(sent_tokenizer)
    
    # Fast calculation of single word counts
    cvec.fit(df.text)
    document_matrix = cvec.transform(df.text)
    batches = np.linspace(0, df.shape[0], batch_size).astype(int)
    i = 0
    tf = []
    while i < len(batches) - 1:
        batch_result = np.sum(document_matrix[batches[i]:batches[i+1]].toarray(), axis=0)
        tf.append(batch_result)
        i += 1
    term_freq_df = pd.DataFrame([np.sum(tf, axis=0)], columns=cvec.get_feature_names()).transpose()
    
    # Now organize everything into the dataframes
    term_freq_df.columns = ['count']
    term_freq_df.index.name = 'word'
    sorted_term_freq_df = pd.DataFrame(term_freq_df.sort_values(by='count', ascending=False)['count'])
    return sorted_term_freq_df, df

In [6]:
term_df, df = count_vocab_frequencies(df)
# p(word).  Note that multiple occurrences of a word in a sentence increases its probability.
term_df['proportion'] = term_df['count']/float(sum(term_df['count']))
# Sanity check
print(term_df.head())
print(term_df.tail())

      count  proportion
word                   
the   97414    0.051017
and   55730    0.029186
to    53855    0.028205
of    46970    0.024599
a     42078    0.022037
               count    proportion
word                              
intraflora         1  5.237121e-07
intramural         1  5.237121e-07
intramurals        1  5.237121e-07
intramuscular      1  5.237121e-07
ﬂoors              1  5.237121e-07


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Makes a sparse vector (shape: # sentences x # words),
# with the count of each word per sentence.
mlb = MultiLabelBinarizer()
df_mlb = pd.DataFrame(mlb.fit_transform(df['tokenized']))
# Index of the subgroup word in the sparse vector
subgroup_idx = np.where(mlb.classes_ == subgroup)[0][0]
# Dataframe for the subgroup (with counts)
df_subgroup = df_mlb.iloc[:, subgroup_idx]
# Create cooccurence matrix for the given subgroup and all other words.
# Note it also includes the word itself, so that count should be subtracted 
# (the word will always co-occur with itself)
df_coo = pd.DataFrame(df_mlb.T.dot(df_subgroup))

In [8]:
# PMI(x;y) = h(y) - h(y|x)
#          = h(subgroup) - h(subgroup|word)
#          = log p(subgroup|word) - log p(subgroup))

# log p(subgroup)
subgroup_prob = np.log(term_df.loc[subgroup]['proportion'])
# Apply a function to all words to calculate log p(subgroup|word)
# The word is indexed by mlb.classes_ ; 
# we pull out the word using the index and then get its count using our main term_df
# x[1] is the count of the word, given the subgroup
pmi_df = pd.DataFrame(df_coo.apply(lambda x: np.log(x[1]/term_df.loc[mlb.classes_[x.index]]['count']) - subgroup_prob))

In [9]:
# If all went well, this will be correlated with high/low frequency words
# Until normalizing
print(pmi_df.sort_values(by=[0])[:50])

              0
word           
the   -1.368207
and   -0.809756
to    -0.775532
of    -0.638746
a     -0.528762
in    -0.305339
is     0.078404
for    0.141734
you    0.271072
that   0.312754
it     0.428533
with   0.452654
i      0.466381
on     0.578802
s      0.703117
are    0.735654
as     0.764770
this   0.797710
be     0.807518
your   0.903788
we     0.977421
or     0.993300
have   1.046176
at     1.052741
from   1.082889
can    1.146943
by     1.157152
was    1.162812
will   1.177365
not    1.317952
an     1.329010
all    1.430064
but    1.446546
they   1.498949
our    1.527146
if     1.535350
has    1.544000
their  1.640482
more   1.656415
my     1.668962
so     1.676049
one    1.692344
t      1.698938
which  1.811306
about  1.835530
there  1.850043
also   1.870251
what   1.871298
when   1.874184
up     1.875236
