# Evaluate NER performances

In [1]:
# delete everything about occupation

## TODO:

- build the ground-truth dataset
- build the textually close dataset
- build the lax close dataset

In [1]:
import sys
sys.path.insert(1, '../src')

# from character_extraction import *
from embeddings import *

In [2]:
def _select_ground_truth(embeddings_type):
    '''Return the ground-truth dataframe that matches `embeddings_type`.

    The dataframes are notebook globals and are only looked up inside the matching branch,
    so only the one that is actually needed has to be loaded.

    Raises
    ------
    ValueError
        If `embeddings_type` is not 'first_version', 'textually_close' or 'lax'
    '''
    if embeddings_type == 'first_version':
        return ground_truth_data_df
    if embeddings_type == 'textually_close':
        return textually_close_ent_ground_truth_df
    if embeddings_type == 'lax':
        return lax_ent_ground_truth_df
    raise ValueError(
        "embeddings_type must be 'first_version', 'textually_close' or 'lax', "
        f"got {embeddings_type!r}")


def _labelled_vectors(embeddings, truth_df, column, first_version):
    '''Pair each embedded entity found in `truth_df` with its ground-truth `column` value.

    Returns
    -------
    vectors : np.ndarray
        One row per matched entity, holding its embedding vector
    labels : np.ndarray
        The ground-truth value of each matched entity, in the same order as `vectors`
    '''
    # Map each name to the value of its FIRST row (same row `.values[0]` used to pick).
    # Built once instead of rescanning the name column for every embedding key.
    value_by_name = {}
    for name, value in zip(truth_df['name'].tolist(), truth_df[column].tolist()):
        value_by_name.setdefault(name, value)

    vectors, labels = [], []
    for key in embeddings:
        # 'first_version' keys are matched case-insensitively; the lower-cased key must be
        # used for the lookup too, otherwise capitalised keys pass the filter and then
        # find no row.
        name = key.lower() if first_version else key
        if name not in value_by_name:
            continue
        # 'first_version' stores the vector directly, the other types under 'MASK'.
        vector = embeddings[key] if first_version else embeddings[key]['MASK']
        vectors.append(np.array(vector))
        labels.append(value_by_name[name])

    if not vectors:
        raise ValueError(f"no embedding key matches a 'name' of the ground truth ({column!r})")

    # Going through a Series keeps pandas' dtype inference for the labels.
    return np.array(vectors), pd.Series(labels).to_numpy()


def _intrinsic_scores(vectors, labels, silhouette_labels=None):
    '''Silhouette, Calinski-Harabasz and Davies-Bouldin scores of `vectors` under `labels`.'''
    if silhouette_labels is None:
        silhouette_labels = labels
    return [
        sklearn.metrics.silhouette_score(vectors, silhouette_labels,
                                         metric='euclidean', random_state=0),
        sklearn.metrics.calinski_harabasz_score(vectors, labels),
        sklearn.metrics.davies_bouldin_score(vectors, labels),
    ]


def _agreement_scores(reference_labels, predicted_labels):
    '''Rand, adjusted Rand, mutual info and adjusted mutual info between two labelings.'''
    return [
        sklearn.metrics.rand_score(reference_labels, predicted_labels),
        sklearn.metrics.adjusted_rand_score(reference_labels, predicted_labels),
        sklearn.metrics.mutual_info_score(reference_labels, predicted_labels),
        sklearn.metrics.adjusted_mutual_info_score(reference_labels, predicted_labels,
                                                   average_method='arithmetic'),
    ]


def _kmeans_labels(matrix, n_clusters):
    '''Fit a seeded KMeans on `matrix` and return the cluster index of every row.'''
    return KMeans(n_clusters=n_clusters, random_state=0).fit(matrix).predict(matrix)


def get_clustering_metrics(embeddings, embeddings_type,
                           n_entity_clusters=45, n_appearance_clusters=17):
    '''Given embeddings, and their ground truth data type, computes several clustering performance
    metrics. The right `ground_truth_data_df`, `textually_close_ent_ground_truth_df` or 
    `lax_ent_ground_truth_df` should have been loaded into memory before calling this function.

    Parameters
    ----------
    embeddings : dictionary
        The dictionary containing each entity and their associated embedding vector
    embeddings_type : str
        The matching ground truth data type for the given embeddings (either 'first_version',
        'textually_close' or 'lax')
    n_entity_clusters : int, optional
        Number of KMeans clusters for the 'same_entityness' axis (default 45, obtained by
        the elbow method)
    n_appearance_clusters : int, optional
        Number of KMeans clusters for the 'first_appearance' axis (default 17, chosen with
        the elbow method)

    Returns
    -------
    same_entityness : list
        A list containing the performance metrics with regards to the 'same_entityness' axis
    gender : list
        A list containing the performance metrics with regards to the 'gender' axis
    first_appearance : list
        A list containing the performance metrics with regards to the 'first_appearance' axis

    Each list holds, in order: silhouette, Calinski-Harabasz, Davies-Bouldin, Rand,
    adjusted Rand, mutual info and adjusted mutual info scores.

    Raises
    ------
    ValueError
        If `embeddings_type` is unknown, or if no embedding key matches the ground truth
    '''
    # Fails fast on an unknown type, before any dataframe is touched.
    truth_df = _select_ground_truth(embeddings_type)
    first_version = embeddings_type == 'first_version'

    # SAME ENTITY-NESS
    vectors, entity_ids = _labelled_vectors(embeddings, truth_df, 'entity_ID', first_version)
    same_entityness = _intrinsic_scores(vectors, entity_ids)
    same_entityness += _agreement_scores(entity_ids,
                                         _kmeans_labels(vectors, n_entity_clusters))

    # GENDER
    vectors, genders = _labelled_vectors(embeddings, truth_df, 'gender', first_version)
    # NOTE(review): the silhouette is scored on a binary 'M' / not-'M' indicator while the
    # other metrics use the raw gender labels; kept as-is, confirm this is intended.
    is_male = np.array([label == 'M' for label in genders]).astype(int)
    gender = _intrinsic_scores(vectors, genders, silhouette_labels=is_male)
    # two genders in PG literature (men and women)
    gender += _agreement_scores(genders, _kmeans_labels(vectors, 2))

    # FIRST APPEARANCE
    vectors, appearances = _labelled_vectors(embeddings, truth_df, 'first appearance',
                                             first_version)
    # The reference partition is a KMeans over the scalar first-appearance values; it is
    # computed once and reused for every metric.
    appearance_clusters = _kmeans_labels(appearances.reshape(-1, 1), n_appearance_clusters)
    first_appearance = _intrinsic_scores(vectors, appearance_clusters)
    first_appearance += _agreement_scores(appearance_clusters,
                                          _kmeans_labels(vectors, n_appearance_clusters))

    return same_entityness, gender, first_appearance

In [58]:
def print_clustering_metrics(embeddings, embeddings_type):
    '''Print, as a text table, the clustering metrics returned by `get_clustering_metrics`.

    One row is printed per metric and one column per evaluation axis (same entity-ness,
    gender, first appearance). The right `ground_truth_data_df`,
    `textually_close_ent_ground_truth_df` or `lax_ent_ground_truth_df` should have been
    loaded into memory before calling this function.

    Parameters
    ----------
    embeddings : dictionary
        The dictionary containing each entity and their associated embedding vector
    embeddings_type : str
        The matching ground truth data type for the given embeddings (either 'first_version',
        'textually_close' or 'lax')
    '''
    horizontal_rule = '--------------------------------------------------------------------------------------------'
    # First cell of each row, in the order of the lists returned by `get_clustering_metrics`.
    row_headers = (
        '| Silhouette Score           |',
        '| Calinski Harabasz Score    |',
        '| Davies Bouldin Score       |',
        '| Rand Score                 |',
        '| Adjusted Rand Score        |',
        '| Mutual Info Score          |',
        '| Adjusted Mutual Info Score |',
    )

    # All metrics are computed before anything is printed.
    same_entityness, gender, first_appearance = get_clustering_metrics(embeddings, embeddings_type)

    print(horizontal_rule)
    print('|                            | Same Entity-ness |  Gender  | First Appearance |')
    print(horizontal_rule)
    for position, row_header in enumerate(row_headers):
        entity_cell = f'{same_entityness[position]:8.5f}'
        gender_cell = f'{gender[position]:8.5f}'
        appearance_cell = f'{first_appearance[position]:8.5f}'
        print(f'{row_header}     {entity_cell}     | {gender_cell} |  {appearance_cell}     |')
    print(horizontal_rule)

In [None]:
# Scratch copy of the 'same entity-ness' pairing step of `get_clustering_metrics`.
# Expects `embeddings`, `embeddings_type` and the matching ground-truth dataframe to
# already exist in the kernel.
same_entityness = []

if embeddings_type == 'first_version':
    # Names are collected once instead of rebuilding the list for every key.
    known_names = set(ground_truth_data_df['name'].tolist())
    # Keys are matched case-insensitively, so the lookup must use the lower-cased key as
    # well; looking up the raw key finds no row for capitalised keys.
    mask_embs_entity = [(k, 
                         embeddings[k], 
                         ground_truth_data_df[ground_truth_data_df['name'] == k.lower()]['entity_ID'].values[0]) 
                        for k in embeddings 
                        if k.lower() in known_names]
elif embeddings_type == 'textually_close':
    known_names = set(textually_close_ent_ground_truth_df['name'].tolist())
    mask_embs_entity = [(k, 
                         embeddings[k]['MASK'], 
                         textually_close_ent_ground_truth_df[textually_close_ent_ground_truth_df['name'] == k]['entity_ID'].values[0]) 
                        for k in embeddings 
                        if k in known_names]
elif embeddings_type == 'lax':
    known_names = set(lax_ent_ground_truth_df['name'].tolist())
    mask_embs_entity = [(k, 
                         embeddings[k]['MASK'], 
                         lax_ent_ground_truth_df[lax_ent_ground_truth_df['name'] == k]['entity_ID'].values[0]) 
                        for k in embeddings 
                        if k in known_names]
else:
    # Previously an unknown type silently left `mask_embs_entity` undefined (or stale).
    raise ValueError(
        "embeddings_type must be 'first_version', 'textually_close' or 'lax', "
        f"got {embeddings_type!r}")

In [2]:
# Build `embeddings_dict` with the 'flaubert/flaubert_base_cased' checkpoint.
# NOTE(review): '798-8' is presumably a book/corpus identifier — confirm against
# `french_word_embeddings`.
# The recorded run below was interrupted (KeyboardInterrupt) after 13 of 7929 iterations,
# so this cell is slow to execute in full.
embeddings_dict = french_word_embeddings('flaubert/flaubert_base_cased', '798-8')

Some weights of the model checkpoint at flaubert/flaubert_base_cased were not used when initializing FlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|▏                                                                                                                    | 13/7929 [00:32<5:28:00,  2.49s/it]


KeyboardInterrupt: 

# ATTENTION: check how the torch tensors are written to the CSV file and how they are read back

In [12]:
import csv

# Persist `embeddings_dict` as a two-column CSV (key, value).
# newline='' is what the csv module requires for files handed to csv.writer; without it
# extra blank rows can appear on platforms that translate line endings.
with open('word_embeddings.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['key','value'])
    for k,v in embeddings_dict.items() :
        # Store the vector as a plain Python list: the str() of a numpy array wraps over
        # several lines, rounds the values and elides long arrays with '...', so it cannot
        # be read back reliably.
        w.writerow([k,v.numpy().tolist()])

In [13]:
# Reload `embeddings_dict` from the CSV written above (columns: key, value).
embeddings_dict = {}
with open('word_embeddings.csv', newline='') as pscfile:
    reader = csv.DictReader(pscfile)
    for row in reader:
        # `row['value']` is text, so wrapping it directly in numpy.array yields a string
        # array that torch.from_numpy rejects. Parse the numbers instead: dropping
        # brackets and commas accepts both a list literal ("[0.1, 0.2]") and the
        # whitespace-separated numpy rendering ("[0.1 0.2]").
        number_tokens = (row['value']
                         .replace('[', ' ')
                         .replace(']', ' ')
                         .replace(',', ' ')
                         .split())
        # NOTE(review): vectors are rebuilt flat and as float32 — assumes the stored
        # embeddings are 1-D float32 tensors; confirm against `french_word_embeddings`.
        vector = numpy.array([float(token) for token in number_tokens], dtype=numpy.float32)
        embeddings_dict[row['key']] = torch.from_numpy(vector)

In [9]:
# Derive the per-entity embeddings passed to the clustering metrics from the word
# embeddings in `embeddings_dict`.
# NOTE(review): '798-8' is presumably the same book/corpus identifier used to build
# `embeddings_dict` — confirm against `get_entities_embeddings`.
embeddings = get_entities_embeddings('798-8', embeddings_dict)

In [None]:
# Evaluate `embeddings` against the 'first_version' ground truth and print the table.
embeddings_type = 'first_version'

# NOTE(review): the label says Word2Vec Skip-gram, but the cells above build
# `embeddings_dict` with a FlauBERT checkpoint — confirm which embeddings are evaluated.
print('Word2Vec Embeddings - Skip-gram')
print_clustering_metrics(embeddings, embeddings_type)