# 6. Filtering 1 - bi-gram selection

#### This script loads the bi-grams for the POS and Bleached datasets and selects the most relevant ones, so as to avoid too much sparseness in the data

### Imports

In [1]:
import pandas as pd
import numpy as np

### Definitions

In [2]:
# Input files: the complete bi-gram tables that this notebook reads.
pos_all_ngram_filename, bleached_all_ngram_filename = (
    'data/POS-ngram-all.csv',
    'data/Bleached-ngram-all.csv',
)

# Output files: the selected (Word1, Word2) pairs that this notebook writes.
pos_selected_ngram_filename, bleached_selected_ngram_filename = (
    'data/POS-ngram-selected.csv',
    'data/Bleached-ngram-selected.csv',
)

### Load POS bigrams

In [3]:
# Read the complete POS bi-gram table (one row per bi-gram, with its
# male and female frequency) into the working frame.
df = pd.read_csv(filepath_or_buffer=pos_all_ngram_filename)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale
0,RB,",",938.759242,922.3376
1,",",NN,683.792886,676.379152
2,NN,VBD,1187.500786,1297.592865
3,VBD,RB,913.690476,1039.620713
4,RB,CC,227.243882,258.157108


### Get information on what is considered as 'low frequency'

In [4]:
# Combined (male + female) frequency of every bi-gram, as a plain numpy array.
total_count = np.asarray(df['FrequencyMale'].add(df['FrequencyFemale']))

# Report how many bi-grams survive each candidate frequency cut-off.
for threshold in (1, 10, 20, 50, 100, 1000):
    n_above = (total_count > threshold).sum()
    print('Counts more than {0}: {1}'.format(threshold, n_above))

Counts more than 1: 1295
Counts more than 10: 926
Counts more than 20: 778
Counts more than 50: 618
Counts more than 100: 486
Counts more than 1000: 138


We will consider only those bi-grams whose combined (male + female) frequency is more than 100

### Remove rows (bi-grams) with low frequency

In [5]:
# Keep only the bi-grams (rows) whose combined frequency exceeds the threshold.
# The mask is computed once and applied to both the frame and the totals,
# so the two stay aligned row-for-row.
min_total_count = 100
is_frequent = total_count > min_total_count

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding a column to it does not raise pandas' SettingWithCopyWarning.
df = df[is_frequent].copy()

total_count = total_count[is_frequent]

### Calculate the entropy column

In [6]:
# Share of each bi-gram's combined frequency that comes from each gender.
male_prob = df['FrequencyMale'] / total_count
female_prob = df['FrequencyFemale'] / total_count

# Binary entropy: -(p_m * log2(p_m) + p_f * log2(p_f)).
# In floating point 0 * log2(0) evaluates to NaN, whereas the entropy convention
# is that such a term contributes 0. Without this guard a bi-gram used by only
# one gender (the most discriminative case) would get a NaN entropy and be
# sorted last. Substituting 1 for an exactly-zero probability inside the log
# makes the term 0 * log2(1) == 0; all other values are left untouched.
male_term = male_prob * np.log2(male_prob.where(male_prob != 0, 1.0))
female_term = female_prob * np.log2(female_prob.where(female_prob != 0, 1.0))

# assign() returns a new frame with the extra column instead of writing into
# a frame that may be a slice of another one.
df = df.assign(Entropy=-(male_term + female_term))

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,RB,",",938.759242,922.3376,0.999944
1,",",NN,683.792886,676.379152,0.999979
2,NN,VBD,1187.500786,1297.592865,0.998584
3,VBD,RB,913.690476,1039.620713,0.997
4,RB,CC,227.243882,258.157108,0.997072


### Take the 100 bi-grams with the lowest entropy (i.e. those which discriminate the most)

In [7]:
# Order the bi-grams from lowest to highest entropy, renumber the rows from 0,
# and keep the first 100 of them.
df = (
    df
    .sort_values(by='Entropy')
    .reset_index(drop=True)
    .head(100)
)

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,VB,VBP,65.662749,112.051981,0.950275
1,.,.,1394.374925,2198.362706,0.963569
2,NNP,NNPS,74.00891,48.092282,0.967253
3,JJ,VBP,342.405636,514.73061,0.970643
4,NNP,``,211.640477,142.708995,0.972528


### Save to new CSV

In [8]:
# Only the bi-gram itself (its two tokens) is written out; the frequency and
# entropy columns are dropped, and the row index is not saved.
selected_bigrams = df[['Word1', 'Word2']]
selected_bigrams.to_csv(pos_selected_ngram_filename, index=False)

## Now we do the same thing with the Bleached bi-grams

### Load Bleached bigrams

In [9]:
# Read the complete Bleached bi-gram table (one row per bi-gram, with its
# male and female frequency) into the working frame, replacing the POS one.
df = pd.read_csv(filepath_or_buffer=bleached_all_ngram_filename)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale
0,CVCC,",",699.771789,673.078455
1,",",VCVCCVCV,6.806572,8.150372
2,VCVCCVCV,CVC,14.710548,18.759674
3,CVC,VC,1381.987574,1386.200353
4,VC,VCC,459.352684,448.490169


### Get information on what is considered as 'low frequency'

In [10]:
# Combined (male + female) frequency of every bi-gram, as a plain numpy array.
total_count = np.asarray(df['FrequencyMale'].add(df['FrequencyFemale']))

# Report how many bi-grams survive each candidate frequency cut-off.
for threshold in (1, 10, 20, 50, 100, 1000):
    n_above = (total_count > threshold).sum()
    print('Counts more than {0}: {1}'.format(threshold, n_above))

Counts more than 1: 22238
Counts more than 10: 4643
Counts more than 20: 2808
Counts more than 50: 1414
Counts more than 100: 776
Counts more than 1000: 74


We will consider only those bi-grams whose combined (male + female) frequency is more than 100

### Remove rows (bi-grams) with low frequency

In [11]:
# Keep only the bi-grams (rows) whose combined frequency exceeds the threshold.
# The mask is computed once and applied to both the frame and the totals,
# so the two stay aligned row-for-row.
min_total_count = 100
is_frequent = total_count > min_total_count

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding a column to it does not raise pandas' SettingWithCopyWarning.
df = df[is_frequent].copy()

total_count = total_count[is_frequent]

### Calculate the entropy column

In [12]:
# Share of each bi-gram's combined frequency that comes from each gender.
male_prob = df['FrequencyMale'] / total_count
female_prob = df['FrequencyFemale'] / total_count

# Binary entropy: -(p_m * log2(p_m) + p_f * log2(p_f)).
# In floating point 0 * log2(0) evaluates to NaN, whereas the entropy convention
# is that such a term contributes 0. Without this guard a bi-gram used by only
# one gender (the most discriminative case) would get a NaN entropy and be
# sorted last. Substituting 1 for an exactly-zero probability inside the log
# makes the term 0 * log2(1) == 0; all other values are left untouched.
male_term = male_prob * np.log2(male_prob.where(male_prob != 0, 1.0))
female_term = female_prob * np.log2(female_prob.where(female_prob != 0, 1.0))

# assign() returns a new frame with the extra column instead of writing into
# a frame that may be a slice of another one.
df = df.assign(Entropy=-(male_term + female_term))

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,CVCC,",",699.771789,673.078455,0.999727
3,CVC,VC,1381.987574,1386.200353,0.999998
4,VC,VCC,459.352684,448.490169,0.999897
5,VCC,CVVCC,89.299599,93.04335,0.999696
6,CVVCC,CCVC,162.613082,160.766179,0.999976


### Take the 100 bi-grams with the lowest entropy (i.e. those which discriminate the most)

In [13]:
# Order the bi-grams from lowest to highest entropy, renumber the rows from 0,
# and keep the first 100 of them.
df = (
    df
    .sort_values(by='Entropy')
    .reset_index(drop=True)
    .head(100)
)

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,CVV,?,56.567883,111.408437,0.921685
1,VCCCVCC,CVCCVC,139.235562,71.851794,0.92519
2,--,--,213.582195,110.926343,0.926558
3,VCCCVCC,CVCCVCC,83.445923,43.544732,0.927564
4,!,V,155.411963,294.23801,0.930103


### Save to new CSV

In [14]:
# Only the bi-gram itself (its two tokens) is written out; the frequency and
# entropy columns are dropped, and the row index is not saved.
selected_bigrams = df[['Word1', 'Word2']]
selected_bigrams.to_csv(bleached_selected_ngram_filename, index=False)