# Retrieving meaningful dimensions using the AABBCC score

We find that the AABBCC score and the correlation rank overlap only for the top ~50 dimensions. The overall distributions of their values are also similar.

In [30]:
import pandas as pd
import numpy as np

In [15]:
# Read the three per-part-of-speech CSV tables stored under Data/FlauBERT_WE;
# the first CSV column becomes the row index of each frame.
nouns, adjs, verbs = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../Data/FlauBERT_WE/all_nouns_we.csv",
        "../Data/FlauBERT_WE/all_adjectives_we.csv",
        "../Data/FlauBERT_WE/all_verb_we.csv",
    )
)

In [39]:
# Binary-encode the grammatical labels in place:
#   number: 'singular'  -> 0, anything else (including missing values) -> 1
#   gender: 'masculine' -> 0, anything else (including missing values) -> 1
# An already-encoded 0 is also mapped to 0, so running this cell a second time
# leaves the columns unchanged instead of turning every label into 1.
nouns['number'] = nouns['number'].apply(lambda x: 0 if x in ('singular', 0) else 1)
nouns['gender'] = nouns['gender'].apply(lambda x: 0 if x in ('masculine', 0) else 1)

adjs['number'] = adjs['number'].apply(lambda x: 0 if x in ('singular', 0) else 1)
adjs['gender'] = adjs['gender'].apply(lambda x: 0 if x in ('masculine', 0) else 1)

In [43]:
# Min-max rescale every column of each table ((x - min) / (max - min), computed
# per column and per table), then append three part-of-speech indicator
# columns: exactly one of 'verb', 'adj', 'noun' is 1 for every row of a table.
normalized_nouns = (
    (nouns - nouns.min()) / (nouns.max() - nouns.min())
).assign(verb=0, adj=0, noun=1)

normalized_adjs = (
    (adjs - adjs.min()) / (adjs.max() - adjs.min())
).assign(verb=0, adj=1, noun=0)

normalized_verbs = (
    (verbs - verbs.min()) / (verbs.max() - verbs.min())
).assign(verb=1, adj=0, noun=0)

In [4]:
def get_sequence(df:pd.DataFrame, dim:int, feature:str):
    """Return the `feature` column as a list, ordered by one dimension.

    The rows of `df` are sorted in ascending order of the column whose name is
    `str(dim)`, and the values of the `feature` column are returned in that
    row order.
    """
    dim_column = str(dim)
    ordered_rows = df.sort_values(by=dim_column)
    return list(ordered_rows[feature])

def get_score_for_sequence(seq:list):
    """Score how strongly equal values cluster together in `seq`.

    The sequence is split into maximal runs of consecutive values that are
    equal to the first value of the run. A run of length n contributes
    1 + 2 + ... + n = n * (n + 1) / 2, so long runs are rewarded
    quadratically and a perfectly clustered sequence gets the highest score.

    Parameters
    ----------
    seq : list
        Values to scan in order (any iterable works).

    Returns
    -------
    int
        Sum of the per-run contributions; 0 for an empty sequence.
    """
    score = 0
    run_value = None  # first value of the run currently being counted
    run_length = 0    # 0 means "no run started yet"

    for item in seq:
        if run_length and item == run_value:
            run_length += 1
        else:
            # Close the previous run (contributes 0 if none was open) and start
            # a new one. Every element is consumed exactly once, so a value
            # that is not equal to itself (e.g. NaN) simply forms a run of
            # length 1 instead of stalling the scan.
            score += run_length * (run_length + 1) // 2
            run_value = item
            run_length = 1

    # Account for the run that was still open when the input ended.
    score += run_length * (run_length + 1) // 2
    return score


def aabbcc(df, feature, top_dim_count, return_scores=False, dim_count=512):
    """Rank embedding dimensions by their AABBCC score for a label column.

    For every dimension index in `range(dim_count)`, the rows of `df` are
    ordered by the column named after that index, the resulting sequence of
    `feature` values is scored with `get_score_for_sequence`, and the
    dimensions with the highest scores are returned.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding one column per dimension (named '0', '1', ...) plus the
        `feature` column.
    feature : str
        Name of the label column to evaluate.
    top_dim_count : int
        Number of best-scoring dimensions to return.
    return_scores : bool, default False
        If True, return `(dimension, score)` pairs instead of dimension
        indices only.
    dim_count : int, default 512
        Number of dimension columns to scan, starting at column '0'.

    Returns
    -------
    list
        The `top_dim_count` dimensions in descending score order; ties keep
        ascending dimension order.
    """
    scores = [
        get_score_for_sequence(get_sequence(df, dim, feature))
        for dim in range(dim_count)
    ]

    # sorted() is stable, also with reverse=True, so equal scores stay in
    # ascending dimension order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    top_scores = ranked[:top_dim_count]

    return top_scores if return_scores else [dim for dim, _ in top_scores]

## 1. Gender  
In this notebook, we implement and use a new metric, the AABBCC score, to measure how meaningful a word-embedding (WE) dimension is for a classification task.

### 1.1 Gender of nouns

In [20]:
# Ten dimensions with the highest AABBCC score for the gender of nouns.
aabbcc(df=normalized_nouns, feature='gender', top_dim_count=10)

[100, 192, 403, 245, 377, 316, 195, 121, 507, 202]

### 1.2 Gender of adjs

In [22]:
# Ten dimensions with the highest AABBCC score for the gender of adjectives.
aabbcc(df=normalized_adjs, feature='gender', top_dim_count=10)

[466, 250, 245, 256, 439, 133, 432, 192, 503, 121]

### 1.3 Gender of nouns and adjs combined

In [24]:
# Ten dimensions with the highest AABBCC score for gender when the noun and
# adjective tables are stacked into a single frame.
aabbcc(
    df=pd.concat([normalized_nouns, normalized_adjs]),
    feature='gender',
    top_dim_count=10,
)

[432, 213, 249, 57, 364, 325, 331, 137, 82, 272]

## 2. Plurality 

### 2.1 Plurality of nouns 

In [25]:
# Ten dimensions with the highest AABBCC score for the number of nouns.
aabbcc(df=normalized_nouns, feature='number', top_dim_count=10)

[310, 54, 288, 285, 278, 384, 81, 25, 250, 172]

### 2.2 Plurality of adjs

In [26]:
# Ten dimensions with the highest AABBCC score for the number of adjectives.
aabbcc(df=normalized_adjs, feature='number', top_dim_count=10)

[310, 25, 54, 278, 84, 274, 384, 285, 81, 311]

### 2.3 Plurality of nouns and adjs

In [27]:
# Ten dimensions with the highest AABBCC score for number when the noun and
# adjective tables are stacked into a single frame.
aabbcc(
    df=pd.concat([normalized_nouns, normalized_adjs]),
    feature='number',
    top_dim_count=10,
)

[310, 54, 57, 278, 25, 384, 420, 213, 325, 81]

## 3. PoS

First, remove words with an ambiguous part of speech, i.e. words that appear more than once across the noun, adjective and verb tables:

In [44]:
# Stack the three normalized tables and drop the 'number' and 'gender' columns.
all_norm = pd.concat(
    [normalized_nouns, normalized_adjs, normalized_verbs]
).drop(columns=['number', 'gender'])

# Count how often each index value occurs in the stacked table and keep the
# values that occur exactly once.
word, count = np.unique(all_norm.index, return_counts=True)
unique_words = [w for w, occurrences in zip(word, count) if occurrences == 1]

# Rows whose index value is unique across the stacked table.
unique_norm = all_norm[all_norm.index.isin(unique_words)]

### 3.1 NOUN vs not-NOUN

In [46]:
# Ten dimensions with the highest AABBCC score for the noun indicator.
aabbcc(df=unique_norm, feature='noun', top_dim_count=10)

[185, 127, 341, 346, 209, 299, 96, 159, 89, 4]

### 3.2 VERB vs not-VERB

In [47]:
# Ten dimensions with the highest AABBCC score for the verb indicator.
aabbcc(df=unique_norm, feature='verb', top_dim_count=10)

[341, 162, 89, 185, 346, 299, 127, 182, 4, 504]

### 3.3 ADJ vs not-ADJ

In [48]:
# Ten dimensions with the highest AABBCC score for the adjective indicator.
aabbcc(df=unique_norm, feature='adj', top_dim_count=10)

[318, 310, 64, 487, 47, 69, 291, 365, 425, 182]