# Retrieving Number Information in Nouns & Adjectives using Correlation
This notebook attempts to isolate the word-embedding dimensions that encode **number information** (singular/plural), using a sample of word embeddings for **NOUNS**, for **ADJECTIVES**, and for **both** combined.  

## 0. Data Loading

In [3]:
import pandas as pd

# WE loading: embeddings are read with the first CSV column as index; the
# "gender" column is dropped because only the "number" column is used below.
all_noun_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0).drop(columns=["gender"])
all_adj_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0).drop(columns=["gender"])
all_n_a_we = pd.concat([all_noun_we, all_adj_we])

# Normalization: every embedding dimension (columns "0" .. "511") is divided by
# its own range (max - min), separately for each frame. Values are scaled, not
# shifted, so they land in [-1, 1] whenever the column's min <= 0 <= max.
# Computed column-wise with pandas instead of sorting a Python copy of each
# column just to read off its extremes.
dim_cols = [str(i) for i in range(512)]
for df in [all_noun_we, all_adj_we, all_n_a_we]:
    dim_range = df[dim_cols].max() - df[dim_cols].min()
    df[dim_cols] = df[dim_cols] / dim_range


## 1. Correlation study

In [4]:
# Class balancing for nouns. According to the original author's note the data
# holds 8600 singular but only 4638 plural nouns, so the first 4638 rows of
# each number value are kept.
# NOTE(review): this keeps the *first* rows rather than a random sample, so
# the retained majority-class words depend on the row order of the CSV.
sg_n_we = all_noun_we.loc[all_noun_we["number"].eq("singular")].iloc[:4638]
pl_n_we = all_noun_we.loc[all_noun_we["number"].eq("plural")].iloc[:4638]
all_noun_we = pd.concat([sg_n_we, pl_n_we])

# Same balancing for adjectives, capped at 2360 rows per number value.
sg_a_we = all_adj_we.loc[all_adj_we["number"].eq("singular")].iloc[:2360]
pl_a_we = all_adj_we.loc[all_adj_we["number"].eq("plural")].iloc[:2360]
all_adj_we = pd.concat([sg_a_we, pl_a_we])

# Combined noun + adjective sample, rebuilt from the two balanced frames.
# NOTE(review): this replaces the all_n_a_we that was normalised as a whole in
# the loading cell; the rows here carry the per-frame (noun-only /
# adjective-only) scaling instead. Confirm this is intended.
all_n_a_we = pd.concat([all_noun_we, all_adj_we])

In [5]:


# Convert the number information into integer labels: 1 = singular, 0 = anything else
# (only "plural" remains after the balancing step above).
# The value 1 is accepted alongside "singular" so that executing this cell a
# second time keeps the labels intact; comparing against "singular" alone
# would turn every already-encoded label into 0 on re-execution.
all_noun_we["number"] = all_noun_we["number"].isin(["singular", 1]).astype("int64")
all_adj_we["number"] = all_adj_we["number"].isin(["singular", 1]).astype("int64")
all_n_a_we["number"] = all_n_a_we["number"].isin(["singular", 1]).astype("int64")

NOUNS

In [6]:
import numpy as np

# Pearson correlation between each embedding dimension ("0" .. "511") and the
# 0/1 number label, computed on the noun sample.
# The coefficient is read with a single positional lookup, .iloc[0, -1]
# (row 0 = the dimension, last column = "number"). The previous
# `.iloc[0][-1]` used an integer key on a string-labelled Series, a positional
# fallback that pandas has deprecated.
number_list = [
    all_noun_we[[str(i), "number"]].corr().iloc[0, -1]
    for i in range(512)
]

# One row per dimension (row label = dimension number), with the sign and
# the absolute value of the correlation for ranking.
number_corr_df_n = pd.DataFrame({"correlation_with_number": number_list})
number_corr_df_n["sign"] = np.sign(number_corr_df_n["correlation_with_number"])
number_corr_df_n["abs"] = number_corr_df_n["correlation_with_number"].abs()
# Dimensions having the highest abs correlation with number
number_corr_df_n.sort_values("abs")[-10:][::-1]



Unnamed: 0,correlation_with_number,sign,abs
310,0.480787,1.0,0.480787
81,-0.347984,-1.0,0.347984
288,-0.346962,-1.0,0.346962
250,-0.318835,-1.0,0.318835
507,-0.312207,-1.0,0.312207
278,-0.307268,-1.0,0.307268
285,0.306895,1.0,0.306895
54,0.306434,1.0,0.306434
172,0.290268,1.0,0.290268
25,-0.283359,-1.0,0.283359


In [7]:
# Read the dimensions already stored in noun.csv (first data column), append
# the ten dimensions with the highest absolute correlation with number for
# nouns, and write the extended list back to the same file.
# NOTE(review): the file is both input and output, so every execution of this
# cell appends another ten entries.
w1 = list(pd.read_csv('../Data/Dimensions/GN/noun.csv', index_col=0).iloc[:, 0].values)
w1 += list(number_corr_df_n.sort_values("abs").iloc[-10:].iloc[::-1].index)
pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/noun.csv')

ADJECTIVES

In [246]:
# Display the balanced adjective sample; "number" now holds the 0/1 labels.
all_adj_we

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,number
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abaissé,-0.042944,-0.121588,0.253221,-0.018416,-0.112224,-0.165065,0.064048,0.061435,0.149173,0.250338,...,0.122576,-0.151062,0.030704,-0.000267,-0.215452,-0.496838,-0.143455,0.109168,-0.033071,1
abandonné,0.189704,0.042860,0.038961,-0.143346,-0.009533,-0.411331,-0.126751,0.057869,-0.134712,-0.081770,...,-0.152498,-0.022323,-0.239252,-0.003259,-0.327105,-0.280536,-0.291624,0.131377,-0.261618,1
abattu,0.192408,-0.098454,0.197049,-0.005491,-0.074786,-0.149410,0.077398,0.110115,0.185678,0.083753,...,-0.079404,-0.251517,-0.138261,-0.156556,-0.228935,-0.390281,-0.197627,-0.109574,-0.167557,1
aboli,0.163258,-0.044586,-0.099954,0.103791,-0.071420,0.022540,0.111578,0.115080,-0.141150,0.034744,...,-0.020278,-0.026828,0.017501,0.117476,-0.256992,-0.205790,-0.294415,-0.241071,-0.205843,1
abondant,0.128294,-0.309477,0.021504,0.137204,0.412875,-0.257000,-0.016623,-0.150615,-0.092315,0.112267,...,-0.037446,0.085360,-0.010056,-0.303157,-0.131555,-0.196813,-0.067730,-0.119372,-0.116787,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
évacuées,0.115877,-0.011350,0.035997,0.028827,0.059654,-0.337283,-0.030697,0.137083,0.076271,0.090022,...,0.168681,-0.053678,0.043680,-0.281998,-0.324693,-0.468845,-0.198282,0.158278,0.024740,0
évaluées,0.337002,0.047981,0.129326,0.179921,-0.014418,-0.253125,0.060555,0.195933,-0.015158,0.037804,...,0.329961,-0.086322,0.026354,-0.021133,-0.143766,-0.024202,-0.336136,0.200033,-0.124185,0
éventuelles,-0.216523,-0.101876,-0.100012,0.012300,-0.029418,-0.180825,-0.049696,-0.213698,0.252523,-0.072447,...,0.024550,-0.129838,-0.264718,-0.002919,-0.006518,-0.436265,-0.135772,0.223473,-0.011749,0
évidentes,0.196965,-0.005879,0.190735,0.057461,-0.010534,-0.176981,-0.238832,0.102737,0.010954,-0.315445,...,0.171216,0.141069,-0.035503,0.041717,-0.085636,-0.089069,-0.380767,0.139658,-0.223387,0


In [8]:
import numpy as np

# Pearson correlation between each embedding dimension ("0" .. "511") and the
# 0/1 number label, computed on the adjective sample.
# The coefficient is read with a single positional lookup, .iloc[0, -1]
# (row 0 = the dimension, last column = "number"). The previous
# `.iloc[0][-1]` used an integer key on a string-labelled Series, a positional
# fallback that pandas has deprecated.
number_list = [
    all_adj_we[[str(i), "number"]].corr().iloc[0, -1]
    for i in range(512)
]

# One row per dimension (row label = dimension number), with the sign and
# the absolute value of the correlation for ranking.
number_corr_df_a = pd.DataFrame({"correlation_with_number": number_list})
number_corr_df_a["sign"] = np.sign(number_corr_df_a["correlation_with_number"])
number_corr_df_a["abs"] = number_corr_df_a["correlation_with_number"].abs()

In [9]:
# Ten adjective dimensions with the largest absolute correlation with number,
# strongest first (ascending sort, last ten rows, then reversed).
number_corr_df_a.sort_values("abs").iloc[-10:].iloc[::-1]

Unnamed: 0,correlation_with_number,sign,abs
310,0.410988,1.0,0.410988
54,0.399206,1.0,0.399206
192,-0.362905,-1.0,0.362905
384,-0.353143,-1.0,0.353143
274,0.35048,1.0,0.35048
84,-0.343129,-1.0,0.343129
56,-0.341046,-1.0,0.341046
318,-0.323895,-1.0,0.323895
285,0.31613,1.0,0.31613
25,-0.308397,-1.0,0.308397


In [16]:
# Read the dimensions already stored in adj.csv (first data column), append
# the ten dimensions with the highest absolute correlation with number for
# adjectives, and write the extended list back to the same file.
# NOTE(review): the file is both input and output, so every execution of this
# cell appends another ten entries.
w1 = list(pd.read_csv('../Data/Dimensions/GN/adj.csv', index_col=0).iloc[:, 0].values)
w1 += list(number_corr_df_a.sort_values("abs").iloc[-10:].iloc[::-1].index)
pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/adj.csv')

NOUNS AND ADJECTIVES

In [11]:
# Pearson correlation between each embedding dimension ("0" .. "511") and the
# 0/1 number label, computed on the combined noun + adjective sample.
# The coefficient is read with a single positional lookup, .iloc[0, -1]
# (row 0 = the dimension, last column = "number"). The previous
# `.iloc[0][-1]` used an integer key on a string-labelled Series, a positional
# fallback that pandas has deprecated.
number_list = [
    all_n_a_we[[str(i), "number"]].corr().iloc[0, -1]
    for i in range(512)
]

# One row per dimension (row label = dimension number), with the sign and
# the absolute value of the correlation for ranking.
# `np` is the numpy alias imported in an earlier cell.
number_corr_df_na = pd.DataFrame({"correlation_with_number": number_list})
number_corr_df_na["sign"] = np.sign(number_corr_df_na["correlation_with_number"])
number_corr_df_na["abs"] = number_corr_df_na["correlation_with_number"].abs()

In [12]:
# Ten dimensions with the largest absolute correlation with number on the
# combined sample, strongest first (ascending sort, last ten rows, reversed).
number_corr_df_na.sort_values("abs").iloc[-10:].iloc[::-1]

Unnamed: 0,correlation_with_number,sign,abs
310,0.45349,1.0,0.45349
54,0.337249,1.0,0.337249
81,-0.33104,-1.0,0.33104
288,-0.326647,-1.0,0.326647
285,0.309096,1.0,0.309096
278,-0.299422,-1.0,0.299422
384,-0.293536,-1.0,0.293536
25,-0.290862,-1.0,0.290862
172,0.264594,1.0,0.264594
311,0.255739,1.0,0.255739


In [17]:
# Read the dimensions already stored in both.csv (first data column), append
# the ten dimensions with the highest absolute correlation with number for the
# combined noun + adjective sample, and write the list back to the same file.
# NOTE(review): the file is both input and output, so every execution of this
# cell appends another ten entries.
w1 = list(pd.read_csv('../Data/Dimensions/GN/both.csv', index_col=0).iloc[:, 0].values)
w1 += list(number_corr_df_na.sort_values("abs").iloc[-10:].iloc[::-1].index)
pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/both.csv')