# Retrieving Gender Information in Nouns & Adjectives using Correlation
This notebook attempts to isolate the word-embedding dimensions that encode **gender information** (masculine/feminine) in a sample of word embeddings for **NOUNS**, for **ADJECTIVES**, and for **both** combined.  

## 0. Data Loading

In [1]:
import pandas as pd

# Load the word-embedding tables: the first CSV column becomes the index and
# the "number" column is discarded.
all_n_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0)
all_n_we = all_n_we.drop("number", axis=1)

all_a_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0)
all_a_we = all_a_we.drop("number", axis=1)

# Combined table: noun rows first, adjective rows after.
all_na_we = pd.concat((all_n_we, all_a_we))

# Normalization: every embedding dimension is divided by its own range
# (max - min), independently for each table. After scaling, each dimension
# spans exactly one unit; values lie within [-1, 1] whenever the dimension
# takes both negative and positive values.
dimension_columns = [str(i) for i in range(512)]
for df in [all_n_we, all_a_we, all_na_we]:
    span = df[dimension_columns].max() - df[dimension_columns].min()
    # A constant dimension has a zero range: leave it unscaled rather than
    # dividing by zero.
    span = span.replace(0, 1)
    # Assign positionally (plain array) so row labels are never re-aligned.
    df[dimension_columns] = (df[dimension_columns] / span).to_numpy()


## 1. Correlation study

In [2]:
# Convert the gender information into labels: 1 for "masculine", 0 for
# anything else. An already-converted label 1 is also mapped to 1, so running
# this cell a second time leaves the labels intact instead of turning every
# row into 0.
for frame in (all_n_we, all_a_we, all_na_we):
    frame["gender"] = frame["gender"].apply(
        lambda g: 1 if g in (1, "masculine") else 0
    )

In [3]:
# Preview the first rows of the noun table (embedding dimensions + gender label).
all_n_we.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,gender
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abaisse,0.220459,-0.185174,0.045101,0.178714,-0.108041,-0.185223,-0.038133,-0.035881,0.206875,0.197348,...,0.185762,-0.084412,0.023871,-0.052849,-0.267209,-0.440458,-0.227911,-0.021792,-0.070034,0
abandonnée,0.186562,0.062755,0.030736,-0.143579,0.006462,-0.319613,-0.188934,-0.008268,-0.178685,-0.003653,...,0.023211,-0.053564,-0.200903,-0.081681,-0.337104,-0.22609,-0.244174,0.193163,-0.026614,0
abattue,0.260729,0.044799,0.100262,-0.003613,-0.014958,-0.248013,-0.062373,0.08956,0.103828,0.065361,...,0.038298,-0.067715,-0.088245,-0.185171,-0.42553,-0.397078,-0.164545,-0.047245,-0.010448,0
abbaye,0.205826,-0.21593,0.101201,0.006262,0.232492,-0.271341,-0.203044,-0.136419,-0.081058,0.065823,...,-0.279662,0.199704,0.1465,-0.078226,-0.315869,-0.32919,0.158065,-0.134252,-0.219602,0
abdominale,0.211081,-0.013678,0.240263,0.032897,0.017872,-0.184198,-0.119463,-0.36088,0.434171,0.266727,...,0.266714,-0.035179,-0.078115,-0.001525,-0.120634,-0.262442,-0.000265,0.19006,-0.191832,0


Maximum correlation for NOUNS

In [4]:
import numpy as np

# Pearson correlation between each of the 512 embedding dimensions and the
# binary gender label, computed over the noun table in a single call.
dimension_columns = [str(i) for i in range(512)]
gender_list = all_n_we[dimension_columns].corrwith(all_n_we["gender"]).tolist()

# Built from a plain list so the frame gets the default integer index:
# row i holds the correlation of dimension i.
gender_corr_df_n = pd.DataFrame({"correlation_with_gender": gender_list})
gender_corr_df_n["sign"] = np.sign(gender_corr_df_n["correlation_with_gender"])
gender_corr_df_n["abs"] = gender_corr_df_n["correlation_with_gender"].abs()

In [5]:
# Dimensions having the highest abs correlation with gender (strongest first).
# nlargest ranks undefined (NaN) correlations last, whereas sorting and slicing
# the tail would list them as the "top" dimensions.
gender_corr_df_n.nlargest(10, "abs")

Unnamed: 0,correlation_with_gender,sign,abs
100,-0.201022,-1.0,0.201022
195,-0.1983,-1.0,0.1983
316,0.19295,1.0,0.19295
245,0.181808,1.0,0.181808
507,0.179582,1.0,0.179582
192,-0.169688,-1.0,0.169688
377,0.160333,1.0,0.160333
121,0.159328,1.0,0.159328
117,-0.15573,-1.0,0.15573
403,-0.153749,-1.0,0.153749


In [8]:
# Extend the dimension list stored in noun.csv with the indices of the 10 noun
# dimensions whose absolute correlation with gender is highest (strongest
# first), then write the extended list back to the same file.
# NOTE(review): every run of this cell appends 10 more entries to the file;
# confirm that this accumulation is intended.
stored_noun_dims = pd.read_csv('../Data/Dimensions/GG/noun.csv', index_col=0)
w1 = list(stored_noun_dims.iloc[:, 0].values)

top_noun_dims = gender_corr_df_n.sort_values("abs")[-10:][::-1].index
w1.extend(top_noun_dims)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/noun.csv')

Maximum correlation for ADJECTIVES

In [9]:
import numpy as np

# Pearson correlation between each of the 512 embedding dimensions and the
# binary gender label, computed over the adjective table in a single call.
dimension_columns = [str(i) for i in range(512)]
gender_list = all_a_we[dimension_columns].corrwith(all_a_we["gender"]).tolist()

# Built from a plain list so the frame gets the default integer index:
# row i holds the correlation of dimension i.
gender_corr_df_a = pd.DataFrame({"correlation_with_gender": gender_list})
gender_corr_df_a["sign"] = np.sign(gender_corr_df_a["correlation_with_gender"])
gender_corr_df_a["abs"] = gender_corr_df_a["correlation_with_gender"].abs()

In [10]:
# Dimensions having the highest abs correlation with gender (strongest first).
# nlargest ranks undefined (NaN) correlations last, whereas sorting and slicing
# the tail would list them as the "top" dimensions.
gender_corr_df_a.nlargest(10, "abs")

Unnamed: 0,correlation_with_gender,sign,abs
466,0.347641,1.0,0.347641
439,-0.306494,-1.0,0.306494
250,0.306199,1.0,0.306199
503,-0.305685,-1.0,0.305685
133,-0.296369,-1.0,0.296369
245,0.287354,1.0,0.287354
234,0.283759,1.0,0.283759
432,0.278664,1.0,0.278664
181,-0.262315,-1.0,0.262315
121,0.254556,1.0,0.254556


In [11]:
# Extend the dimension list stored in adj.csv with the indices of the 10
# adjective dimensions whose absolute correlation with gender is highest
# (strongest first), then write the extended list back to the same file.
# NOTE(review): every run of this cell appends 10 more entries to the file;
# confirm that this accumulation is intended.
stored_adj_dims = pd.read_csv('../Data/Dimensions/GG/adj.csv', index_col=0)
w1 = list(stored_adj_dims.iloc[:, 0].values)

top_adj_dims = gender_corr_df_a.sort_values("abs")[-10:][::-1].index
w1.extend(top_adj_dims)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/adj.csv')

Maximum correlation for NOUNS and ADJECTIVES combined

In [12]:
# Pearson correlation between each of the 512 embedding dimensions and the
# binary gender label, computed over the combined noun + adjective table in a
# single call.
dimension_columns = [str(i) for i in range(512)]
gender_list = all_na_we[dimension_columns].corrwith(all_na_we["gender"]).tolist()

# Built from a plain list so the frame gets the default integer index:
# row i holds the correlation of dimension i.
gender_corr_df_na = pd.DataFrame({"correlation_with_gender": gender_list})
gender_corr_df_na["sign"] = np.sign(gender_corr_df_na["correlation_with_gender"])
gender_corr_df_na["abs"] = gender_corr_df_na["correlation_with_gender"].abs()

In [13]:
# Dimensions having the highest abs correlation with gender (strongest first).
# nlargest ranks undefined (NaN) correlations last, whereas sorting and slicing
# the tail would list them as the "top" dimensions.
gender_corr_df_na.nlargest(10, "abs")

Unnamed: 0,correlation_with_gender,sign,abs
245,0.216266,1.0,0.216266
192,-0.199829,-1.0,0.199829
507,0.194352,1.0,0.194352
121,0.192432,1.0,0.192432
5,0.176665,1.0,0.176665
195,-0.174759,-1.0,0.174759
250,0.17259,1.0,0.17259
377,0.170827,1.0,0.170827
439,-0.168431,-1.0,0.168431
133,-0.163216,-1.0,0.163216


In [14]:
# Extend the dimension list stored in both.csv with the indices of the 10
# dimensions of the combined table whose absolute correlation with gender is
# highest (strongest first), then write the extended list back to the same file.
# NOTE(review): every run of this cell appends 10 more entries to the file;
# confirm that this accumulation is intended.
stored_both_dims = pd.read_csv('../Data/Dimensions/GG/both.csv', index_col=0)
w1 = list(stored_both_dims.iloc[:, 0].values)

top_both_dims = gender_corr_df_na.sort_values("abs")[-10:][::-1].index
w1.extend(top_both_dims)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/both.csv')