# Retrieving meaningful dimensions using the AABBCC score

We find that the AABBCC score and the correlation rank overlap only for the top ~50 dimensions. The overall distributions of their values are also similar.

In [49]:
import pandas as pd
import numpy as np

In [50]:
# Load one table per part of speech from the FlauBERT_WE data directory.
# The first CSV column is used as the row index (presumably the word itself,
# since the index is later treated as the word list -- confirm against the data).
nouns = pd.read_csv("../Data/FlauBERT_WE/all_nouns_we.csv", index_col=0)
adjs = pd.read_csv("../Data/FlauBERT_WE/all_adjectives_we.csv", index_col=0)
verbs = pd.read_csv("../Data/FlauBERT_WE/all_verb_we.csv", index_col=0)

In [51]:
# Binary-encode the grammatical labels in place:
#   number: 'singular'  -> 0, anything else -> 1
#   gender: 'masculine' -> 0, anything else -> 1
# NOTE: the encoding is not idempotent -- running it a second time maps every
# row to 1, because the already-encoded 0/1 values no longer match the strings.
nouns['number'] = nouns['number'].ne('singular').astype('int64')
nouns['gender'] = nouns['gender'].ne('masculine').astype('int64')

adjs['number'] = adjs['number'].ne('singular').astype('int64')
adjs['gender'] = adjs['gender'].ne('masculine').astype('int64')

In [52]:
# Min-max scale every column to [0, 1], separately for each part-of-speech table.
# A column that is constant within a table yields NaN (0 / 0).
normalized_nouns = (nouns - nouns.min()).div(nouns.max() - nouns.min())
normalized_adjs = (adjs - adjs.min()).div(adjs.max() - adjs.min())
normalized_verbs = (verbs - verbs.min()).div(verbs.max() - verbs.min())

# Append one-hot part-of-speech flags (column order: verb, adj, noun).
normalized_nouns = normalized_nouns.assign(verb=0, adj=0, noun=1)
normalized_adjs = normalized_adjs.assign(verb=0, adj=1, noun=0)
normalized_verbs = normalized_verbs.assign(verb=1, adj=0, noun=0)

In [53]:
def get_sequence(df:pd.DataFrame, dim:int, feature:str):
    """Return the values of column `feature`, ordered by ascending column `dim`.

    The dimension is looked up by its string name (`str(dim)`), so `df` must
    have a column labelled e.g. "0" for `dim=0`.
    """
    dim_column = str(dim)
    ordered_rows = df.sort_values(by=dim_column)
    return list(ordered_rows[feature])

def get_score_for_sequence(seq:list):
    """Score how strongly equal values cluster into consecutive runs.

    The sequence is split into maximal runs of consecutive equal values and
    each run of length L contributes 1 + 2 + ... + L = L * (L + 1) / 2.
    Long homogeneous runs (e.g. AAABBB) therefore score higher than
    alternating ones (e.g. ABABAB).

    Args:
        seq: values to scan, in order. Elements are compared with `==`.

    Returns:
        The integer score; 0 for an empty sequence.

    Values that do not compare equal to themselves (NaN) each form a run of
    length one instead of stalling the scan.
    """
    score = 0
    run_length = 0    # length of the run currently being scanned
    run_value = None  # value the current run started with

    for value in seq:
        if run_length and value == run_value:
            run_length += 1
        else:
            # The previous run (if any) just ended: add its triangular number.
            score += run_length * (run_length + 1) // 2
            run_value = value
            run_length = 1

    # Account for the final run, which no later element terminates.
    score += run_length * (run_length + 1) // 2

    return score


def aabbcc(df, feature, top_dim_count, return_scores=False, dim_count=512):
    """Rank embedding dimensions by how well sorting on them groups `feature`.

    For every dimension `d` in `range(dim_count)`, the rows of `df` are sorted
    by column `str(d)` (via `get_sequence`) and the resulting order of
    `feature` values is scored with `get_score_for_sequence`. Dimensions whose
    ordering produces longer runs of identical feature values score higher.

    Args:
        df: table holding one column per dimension, named "0", "1", ..., and
            the `feature` column.
        feature: name of the label column to evaluate.
        top_dim_count: number of best-scoring dimensions to return.
        return_scores: if True, return `(dimension, score)` pairs instead of
            bare dimension indices.
        dim_count: number of dimensions to evaluate; defaults to 512, the
            value that was previously hard-coded.

    Returns:
        The `top_dim_count` best dimensions, highest score first. Ties keep
        ascending dimension order because the sort is stable.
    """
    scores = [
        get_score_for_sequence(get_sequence(df, dim, feature))
        for dim in range(dim_count)
    ]

    top_scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:top_dim_count]

    return top_scores if return_scores else [x[0] for x in top_scores]

## 1. Gender  
In this notebook, we implement and use a new metric to measure how meaningful a word-embedding (WE) dimension is for a classification task.

### 1.1 Gender of nouns

In [55]:
# Ten highest-scoring dimensions for the gender label of nouns.
gn_n = aabbcc(normalized_nouns, 'gender', top_dim_count=10)

In [56]:
# Read the dimension list already stored in GG/noun.csv (first data column),
# append the dimensions in gn_n, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GG/noun.csv', index_col=0).iloc[:, 0].values)

w1.extend(gn_n)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/noun.csv')

### 1.2 Gender of adjs

In [57]:
# Ten highest-scoring dimensions for the gender label of adjectives.
gn_a = aabbcc(normalized_adjs, 'gender', top_dim_count=10)

In [58]:
# Read the dimension list already stored in GG/adj.csv (first data column),
# append the dimensions in gn_a, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GG/adj.csv', index_col=0).iloc[:, 0].values)

w1.extend(gn_a)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/adj.csv')

### 1.3 Gender of nouns and adjs combined

In [59]:
# Ten highest-scoring dimensions for gender over nouns and adjectives stacked
# into one table (each table was normalized separately beforehand).
gn_na = aabbcc(pd.concat([normalized_nouns, normalized_adjs]), 'gender', top_dim_count=10)

In [60]:
# Read the dimension list already stored in GG/both.csv (first data column),
# append the dimensions in gn_na, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GG/both.csv', index_col=0).iloc[:, 0].values)

w1.extend(gn_na)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/both.csv')

## 2. Plurality 

### 2.1 Plurality of nouns 

In [61]:
# Ten highest-scoring dimensions for the number (singular vs. other) label of nouns.
pl_n = aabbcc(normalized_nouns, 'number', top_dim_count=10)

In [62]:
# Read the dimension list already stored in GN/noun.csv (first data column),
# append the dimensions in pl_n, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GN/noun.csv', index_col=0).iloc[:, 0].values)

w1.extend(pl_n)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/noun.csv')

### 2.2 Plurality of adjs

In [63]:
# Ten highest-scoring dimensions for the number (singular vs. other) label of adjectives.
pl_a = aabbcc(normalized_adjs, 'number', top_dim_count=10)

In [64]:
# Read the dimension list already stored in GN/adj.csv (first data column),
# append the dimensions in pl_a, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GN/adj.csv', index_col=0).iloc[:, 0].values)

w1.extend(pl_a)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/adj.csv')

### 2.3 Plurality of nouns and adjs

In [65]:
# Ten highest-scoring dimensions for number over nouns and adjectives stacked
# into one table (each table was normalized separately beforehand).
pl_na = aabbcc(pd.concat([normalized_nouns, normalized_adjs]), 'number', top_dim_count=10)

In [66]:
# Read the dimension list already stored in GN/both.csv (first data column),
# append the dimensions in pl_na, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/GN/both.csv', index_col=0).iloc[:, 0].values)

w1.extend(pl_na)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/both.csv')

## 3. PoS

First, remove words with ambiguous part of speech:

In [67]:
# Stack the three part-of-speech tables and discard the number/gender labels,
# which are not used for the PoS experiments.
all_norm = pd.concat(
    [normalized_nouns, normalized_adjs, normalized_verbs]
).drop(['number', 'gender'], axis=1)

# A word whose index label occurs more than once belongs to several parts of
# speech; keep only the labels that occur exactly once.
word, count = np.unique(all_norm.index, return_counts=True)
unique_words = [label for label, occurrences in zip(word, count) if occurrences == 1]

unique_norm = all_norm[all_norm.index.isin(unique_words)]

### 3.1 NOUN vs not-NOUN

In [68]:
# Ten highest-scoring dimensions for the noun / not-noun flag, over words with a single part of speech.
pos_n = aabbcc(unique_norm, 'noun', top_dim_count=10)

In [69]:
# Read the dimension list already stored in PoS/noun.csv (first data column),
# append the dimensions in pos_n, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/PoS/noun.csv', index_col=0).iloc[:, 0].values)

w1.extend(pos_n)

pd.DataFrame(w1).to_csv('../Data/Dimensions/PoS/noun.csv')

### 3.2 VERB vs not-VERB

In [70]:
# Ten highest-scoring dimensions for the verb / not-verb flag, over words with a single part of speech.
pos_v = aabbcc(unique_norm, 'verb', top_dim_count=10)

In [71]:
# Read the dimension list already stored in PoS/verb.csv (first data column),
# append the dimensions in pos_v, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/PoS/verb.csv', index_col=0).iloc[:, 0].values)

w1.extend(pos_v)

pd.DataFrame(w1).to_csv('../Data/Dimensions/PoS/verb.csv')

### 3.3 ADJ vs not-ADJ

In [72]:
# Ten highest-scoring dimensions for the adjective / not-adjective flag, over words with a single part of speech.
pos_a = aabbcc(unique_norm, 'adj', top_dim_count=10)

In [73]:
# Read the dimension list already stored in PoS/adj.csv (first data column),
# append the dimensions in pos_a, and overwrite the file.
# NOTE(review): the file is read and rewritten in place, so re-running this
# cell appends the same dimensions again.
w1 = list(pd.read_csv('../Data/Dimensions/PoS/adj.csv', index_col=0).iloc[:, 0].values)

w1.extend(pos_a)

pd.DataFrame(w1).to_csv('../Data/Dimensions/PoS/adj.csv')