# 8. Filtering 2 - Function Word selection

#### This script loads the Function Words dataset and selects the most relevant entries, so that the resulting data is not too sparse

### Imports

In [1]:
import pandas as pd
import numpy as np

### Definitions

In [2]:
# Input: CSV holding the extracted frequency table.
# NOTE(review): the name says "FunctionWords" while the output below is a
# "Bleached-ngram" file -- confirm this is the intended input path.
all_fw_filename = "data/FunctionWords-extracted.csv"

# Output: text file that receives the selected entries.
selected_fw_filename = "data/Bleached-ngram-selected.txt"

### Load Bleached bigrams

In [3]:
# Read the extracted frequency table from the CSV at `all_fw_filename`.
df = pd.read_csv(all_fw_filename)

# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale
0,CVCC,",",699.771789,673.078455
1,",",VCVCCVCV,6.806572,8.150372
2,VCVCCVCV,CVC,14.710548,18.759674
3,CVC,VC,1381.987574,1386.200353
4,VC,VCC,459.352684,448.490169


### Get information on what is considered as 'low frequency'

In [4]:
# Combined frequency of every row (male + female), held as a NumPy array.
combined = df['FrequencyMale'] + df['FrequencyFemale']
total_count = np.asarray(combined)

# Report how many rows lie strictly above each candidate cut-off.
for threshold in (1, 10, 20, 50, 100, 1000, 10000):
    n_above = (total_count > threshold).sum()
    print('Counts more than {0}: {1}'.format(threshold, n_above))

Counts more than 1: 22238
Counts more than 10: 4643
Counts more than 20: 2808
Counts more than 50: 1414
Counts more than 100: 776
Counts more than 1000: 74


We will keep only the entries whose total count is greater than 100.

### Remove rows with low frequency

In [5]:
# Boolean mask of the rows whose combined frequency exceeds the cut-off.
# It is computed once so the table and the count array are filtered by
# exactly the same condition and stay aligned row for row.
keep_mask = total_count > 100

# .copy() makes the filtered table an independent DataFrame, so adding a
# column to it afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[keep_mask].copy()

total_count = total_count[keep_mask]

### Calculate the entropy column

In [6]:
# Share of each row's combined frequency contributed by each group.
male_prob = df['FrequencyMale'] / total_count
female_prob = df['FrequencyFemale'] / total_count

# Binary Shannon entropy in bits: -(p_m * log2(p_m) + p_f * log2(p_f)).
# A probability of exactly zero contributes zero (convention 0 * log2(0) = 0).
# Evaluated naively it is 0 * -inf = NaN, and sort_values() places NaN last,
# so a row used by only one group would look like the least discriminative.
# errstate only silences the log2(0) warnings for the masked-out entries.
with np.errstate(divide='ignore', invalid='ignore'):
    male_term = np.where(male_prob == 0, 0.0, male_prob * np.log2(male_prob))
    female_term = np.where(female_prob == 0, 0.0, female_prob * np.log2(female_prob))

# Subtracting from 0.0 (rather than negating) avoids storing -0.0.
df['Entropy'] = 0.0 - (male_term + female_term)

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,CVCC,",",699.771789,673.078455,0.999727
3,CVC,VC,1381.987574,1386.200353,0.999998
4,VC,VCC,459.352684,448.490169,0.999897
5,VCC,CVVCC,89.299599,93.04335,0.999696
6,CVVCC,CCVC,162.613082,160.766179,0.999976


### Take the 100 entries with the lowest entropy (i.e. those that discriminate the most)

In [7]:
# Order the rows by ascending entropy, so the lowest-entropy rows come first.
ranked = df.sort_values(by='Entropy')

# Keep the first 100 rows of that ordering and renumber them from 0.
df = ranked.head(100).reset_index(drop=True)

df.head()

Unnamed: 0,Word1,Word2,FrequencyMale,FrequencyFemale,Entropy
0,CVV,?,56.567883,111.408437,0.921685
1,VCCCVCC,CVCCVC,139.235562,71.851794,0.92519
2,--,--,213.582195,110.926343,0.926558
3,VCCCVCC,CVCCVCC,83.445923,43.544732,0.927564
4,!,V,155.411963,294.23801,0.930103


### Save to new text file

In [8]:
# Write the selected entries to `selected_fw_filename`, one per line, without
# header or index. The table has no 'Word' column (its word columns are
# 'Word1' and 'Word2'), so selecting ['Word'] raised a KeyError; both halves
# of each bigram are written instead.
# NOTE(review): output lines are comma-separated "Word1,Word2" -- confirm this
# is the format the downstream reader expects.
df[['Word1', 'Word2']].to_csv(selected_fw_filename, header=False, index=False)