# calculating difference in perplexity
* The code below loads the results from the txt file into pandas.
* NB: for every sentence there is a perplexity score for the variant where the pronoun is male and for the variant where it is female.
* Each perplexity score is also returned in a transformed version using the `math.exp()` function.
* This function returns the mathematical constant $e$ raised to the power of $x$,
* where $x$ is the original value
* and $e$ is approximately equal to 2.718.


**What are we measuring?**
"We are not interested in the
model’s ability to generate a particular pronoun,
the more interesting observation is whether the perplexities
for sentences containing masculine possessives
are lower than for predicting feminine possessives
when forcing the model to predict these
in place of a reflexive."

In [9]:
import sys, os
import pandas as pd


# Build the path to the language-model output file relative to the current
# working directory; os.path.join keeps the separators portable across OSes.
path = os.path.join(os.getcwd(), "outputs", "lm", "out_da.txt")

# Load the text file with one raw line per row, in a single column named 'all'.
# A tab is passed as separator so the lines are not split on commas
# (presumably the file itself contains no tabs -- TODO confirm).
df = pd.read_csv(path, sep='\t', header=None, names=['all'])

# Preview the first rows of the raw data
df.head()


Unnamed: 0,all
0,teknikeren mistede sin tegnebog ved huset. mal...
1,teknikeren mister sin tegnebog ved huset. male...
2,teknikeren vaskede sin pensel i badekarret. ma...
3,teknikeren vasker sin pensel i badekarret. mal...
4,teknikeren efterlod sin kuglepen på kontoret. ...


In [10]:
# The sentence is everything before the first full stop of the raw line.
df['sentence'] = df['all'].str.split('.', n=1).str[0]

# Tokenise each raw line once on single spaces; the scores are then picked by
# their position counted from the end of the line.
# NOTE(review): assumes every line ends with three "<label> <score> <exp score>"
# groups in the order male, female, reflexive -- confirm against the file.
tokens = df['all'].str.split(' ')
score_positions = {
    'perplexity_male': -8,
    'perplexity_male_exp': -7,
    'perplexity_female': -5,
    'perplexity_female_exp': -4,
    'perplexity_refl': -2,
    'perplexity_refl_exp': -1,
}
for column, position in score_positions.items():
    df[column] = tokens.str[position]

# The raw 'all' column is kept on purpose: the next cell excludes it by name.
df.head()

Unnamed: 0,all,sentence,perplexity_male,perplexity_male_exp,perplexity_female,perplexity_female_exp,perplexity_refl,perplexity_refl_exp
0,teknikeren mistede sin tegnebog ved huset. mal...,teknikeren mistede sin tegnebog ved huset,10.876470565795898,52916.504620686406,10.876470565795898,52916.504620686406,9.943881034851074,20824.407908186116
1,teknikeren mister sin tegnebog ved huset. male...,teknikeren mister sin tegnebog ved huset,11.486934661865234,97434.40505074752,11.486934661865234,97434.40505074752,10.810932159423828,49559.6443428941
2,teknikeren vaskede sin pensel i badekarret. ma...,teknikeren vaskede sin pensel i badekarret,9.084511756896973,8817.659552225832,9.084511756896973,8817.659552225832,7.707715511322021,2225.452425697036
3,teknikeren vasker sin pensel i badekarret. mal...,teknikeren vasker sin pensel i badekarret,8.905508041381836,7372.469777916698,8.905508041381836,7372.469777916698,7.652374267578125,2105.639009648424
4,teknikeren efterlod sin kuglepen på kontoret. ...,teknikeren efterlod sin kuglepen på kontoret,11.06373691558838,63814.576536014894,11.06373691558838,63814.576536014894,9.306498527526855,11009.3316891737


In [11]:
# Convert every score column (everything except the two text columns) from
# string to float; values that cannot be parsed become NaN.
cols = df.columns.drop(['all', 'sentence'])
for column in cols:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Show the resulting dtypes to verify the conversion
df.dtypes

all                       object
sentence                  object
perplexity_male          float64
perplexity_male_exp      float64
perplexity_female        float64
perplexity_female_exp    float64
perplexity_refl          float64
perplexity_refl_exp      float64
dtype: object

In [12]:
# Preview the first three rows after the numeric conversion
df.iloc[:3]

Unnamed: 0,all,sentence,perplexity_male,perplexity_male_exp,perplexity_female,perplexity_female_exp,perplexity_refl,perplexity_refl_exp
0,teknikeren mistede sin tegnebog ved huset. mal...,teknikeren mistede sin tegnebog ved huset,10.876471,52916.504621,10.876471,52916.504621,9.943881,20824.407908
1,teknikeren mister sin tegnebog ved huset. male...,teknikeren mister sin tegnebog ved huset,11.486935,97434.405051,11.486935,97434.405051,10.810932,49559.644343
2,teknikeren vaskede sin pensel i badekarret. ma...,teknikeren vaskede sin pensel i badekarret,9.084512,8817.659552,9.084512,8817.659552,7.707716,2225.452426


### trying to simply subtract female perplexity from male perplexity

In [13]:
# Male-minus-female gap, for the raw score ('dif') and for the
# exp-transformed score ('dif_exp').
for suffix in ('', '_exp'):
    df['dif' + suffix] = df['perplexity_male' + suffix].sub(df['perplexity_female' + suffix])



In [14]:
# Average male-minus-female gap over all sentences, raw and exp-transformed.
# NOTE(review): the recorded output is "0.0 0.0" and the previewed rows show
# identical male and female scores -- verify the upstream file and the token
# positions used for extraction.
mean_dif = df['dif'].mean()
mean_dif_exp = df['dif_exp'].mean()
print(mean_dif, mean_dif_exp)

0.0 0.0


### Trying instead to look at the differences between the anti-reflexive male/female pronouns and the original reflexive pronouns

In [15]:
# Gap between the reflexive score and each gendered variant
for gender in ('male', 'female'):
    df['dif_' + gender] = df['perplexity_refl'] - df['perplexity_' + gender]

# How much larger the male gap is than the female gap, averaged over sentences
df['dif_difference'] = df['dif_male'].sub(df['dif_female'])
df['dif_difference'].mean()


0.0

In [16]:
import math

# Worked example of the exp transform: e raised to a single value.
# NOTE(review): the origin of this value is not shown in the notebook --
# presumably a score taken from the model output; confirm.
log_score = 6.128866
math.exp(log_score)

458.91545536943477