In [1]:
import sys
# Extend the module search path so the project helper modules imported in the
# next cell (`evaluation`, `preparation`) can be resolved — presumably they
# live in ../Util; the path is relative to the notebook's working directory.
sys.path.append('../Util')

In [2]:
from evaluation import aabcc, sig_props, correlation, lr, perceptron, kmeans_1dim, \
                     score_comparison, run_tests, report, dimensions_report, repeated_dimensions, \
                    kmeans_multi_dim
from preparation import prepare_dataset, read_datasets

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns

In [4]:
# Models under comparison: each entry pairs the full model name with the
# short label used to identify that model in the reports.
models = [
    {'name': full_name, 'label': short_label}
    for full_name, short_label in (
        ('flaubert/flaubert_small_cased', 'flau_small_c'),
        ('flaubert/flaubert_base_uncased', 'flau_base_u'),
        ('flaubert/flaubert_base_cased', 'flau_base_c'),
        ('flaubert/flaubert_large_cased', 'flau_large_c'),
        ('camembert/camembert-base', 'cam_base'),
    )
]

In [5]:
# Short labels, in the same order as `models`; they identify each model in every report.
labels = [model_spec['label'] for model_spec in models]

# Nouns

In [6]:
# Load the word-embedding datasets for the listed model labels; the result is
# iterated once per model further down.
we_with_features = read_datasets(
    path='../Data',
    model_labels=labels,
    file_name='all_unique_pos_we.csv',
)

In [7]:
# Grammatical feature under study; passed to prepare_dataset as `feature_name`.
feature = 'Number'
# Passed to prepare_dataset as `feature_col_count` — presumably the number of
# non-embedding feature columns in each dataset; confirm against prepare_dataset.
feature_col_count = 6

In [8]:
# Distinct `Number` values among the nouns of the first model's dataset.
we_with_features[0].loc[lambda frame: frame.POS == 'NOUN', 'Number'].unique()

array(['invariable', 'singular', 'plural'], dtype=object)

We can also see that `Number` can contain `invariable` values. Since our tests are tailored to binary features, we will exclude these words from the datasets prior to normalization.

In [9]:
# Per-model accumulators, filled by the preparation loop in the next cell.
normalized_dims, feature_vectors = [], []

In [10]:
# Keep only nouns with a binary `Number` value (drop 'invariable') for every
# model, then collect the prepared dimensions and feature vector per model.
for model_df in we_with_features:
    binary_noun_mask = (model_df.POS == 'NOUN') & (model_df.Number != 'invariable')
    dims, f_vec = prepare_dataset(
        model_df[binary_noun_mask],
        feature_col_count=feature_col_count,
        feature_name=feature,
    )
    normalized_dims.append(dims)
    feature_vectors.append(f_vec)

Now we will run our 6 tests (aabcc, sig-props, correlation, logistic regression weights, perceptron weights, Kmeans clustering on one dimension) for each of the models.

In [11]:
# The six tests to run; kmeans_1dim is kept last because later cells read its
# results through index -1 of the run_tests output.
tests = [aabcc, sig_props, correlation, lr, perceptron, kmeans_1dim]

In [12]:
# Run every test on every model's noun dataset, printing progress along the way.
all_res = run_tests(
    tests,
    normalized_dims,
    feature_vectors,
    labels,
    report_progress=True,
)

Currently running: AABCC
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: SIG_PROPS
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: CORRELATION
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: LR
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: PERCEPTRON
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: KMEANS_1DIM
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done


Now we can compare the distribution of test results for each test and each model:

In [13]:
# Summarise the distribution of the noun test results (min, max, mean and
# percentiles) for each test/model pair; the bare name on the last line
# renders the resulting table.
report_df = report(all_res, tests, labels)
report_df

Unnamed: 0,Min,Max,Mean,25%,50%,75%,95%,97.7%,99.9%
AABCC_flau_small_c,22244.0,48568.0,24216.335938,23369.25,23797.5,24547.25,26736.55,28568.515,42061.948
AABCC_flau_base_u,28853.0,32313.0,30457.335938,30078.0,30423.0,30860.75,31388.85,31665.513,32247.038
AABCC_flau_base_c,22386.0,34012.0,23757.272135,23247.5,23592.0,24049.0,25181.35,25618.055,30441.615
AABCC_flau_large_c,22133.0,99703.0,25131.213867,23454.25,23987.0,25218.5,30207.2,34123.363,83129.946
AABCC_cam_base,21741.0,26388.0,23776.31901,23285.75,23741.5,24167.25,24937.6,25330.103,26206.988
SIG_PROPS_flau_small_c,7.8e-05,0.106517,0.02343,0.009008,0.020049,0.033445,0.055108,0.063523,0.101663
SIG_PROPS_flau_base_u,7e-06,0.031809,0.008123,0.003207,0.006719,0.012186,0.01977,0.022096,0.030548
SIG_PROPS_flau_base_c,3.5e-05,0.10563,0.015027,0.005839,0.012248,0.021794,0.03759,0.0449,0.08404
SIG_PROPS_flau_large_c,4.9e-05,0.148887,0.029722,0.012866,0.024545,0.040563,0.073748,0.086076,0.141795
SIG_PROPS_cam_base,4e-06,0.02249,0.005,0.001949,0.004032,0.007126,0.012639,0.0144,0.021805


In [14]:
# Percentile threshold handed to dimensions_report to select the
# top-scoring dimensions of each test (nouns and adjectives alike).
percentile = 99

In [15]:
# For every test and model, list the dimensions selected at the `percentile`
# threshold (noun experiment); the last line renders the table.
dimensions_df = dimensions_report(all_res, tests, labels, percentile)
dimensions_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base
AABCC,"[310, 54, 288, 243, 285, 81]","[234, 419, 316, 538, 687, 191, 27, 56]","[238, 594, 176, 343, 180, 466, 509, 125]","[296, 691, 1022, 556, 576, 641, 591, 238, 243,...","[233, 384, 547, 41, 42, 572, 339, 147]"
SIG_PROPS,"[310, 54, 288, 278, 285, 81]","[687, 384, 27, 11, 133, 626, 538, 77]","[238, 594, 672, 176, 82, 117, 180, 466]","[691, 576, 296, 556, 238, 641, 591, 1022, 703,...","[24, 100, 374, 309, 119, 714, 485, 57]"
CORRELATION,"[310, 54, 285, 278, 288, 81]","[687, 384, 11, 27, 133, 538, 431, 81]","[238, 594, 176, 466, 325, 125, 467, 180]","[576, 691, 641, 296, 556, 238, 591, 703, 1022,...","[24, 374, 485, 294, 595, 758, 57, 404]"
LR,"[310, 158, 54, 359, 208, 182]","[626, 402, 191, 473, 316, 252, 293, 356]","[205, 594, 77, 580, 292, 274, 717, 191]","[576, 352, 851, 691, 387, 486, 103, 137, 185, ...","[58, 327, 345, 580, 374, 118, 661, 275]"
PERCEPTRON,"[158, 54, 310, 285, 172, 420]","[626, 687, 11, 402, 56, 706, 431, 27]","[238, 176, 672, 594, 127, 466, 268, 148]","[576, 691, 556, 634, 370, 508, 921, 373, 1015,...","[340, 409, 227, 628, 517, 705, 359, 762]"
KMEANS_1DIM,"[310, 54, 243, 285, 288, 278]","[384, 27, 81, 687, 431, 23, 96, 127]","[238, 682, 176, 127, 509, 34, 610, 620]","[691, 576, 641, 296, 703, 238, 556, 591, 1022,...","[309, 100, 714, 467, 272, 348, 717, 577]"


In [16]:
# Group each model's selected dimensions by the number of tests that selected
# them. Judging by the rendered output, column 'k tests' appears to hold the
# dimensions picked by at least k tests — confirm against repeated_dimensions.
repeated_dimensions_df = repeated_dimensions(dimensions_df, labels)

In [17]:
# Render the per-model table of repeated dimensions (noun experiment).
repeated_dimensions_df

Unnamed: 0,1 test,2 tests,3 tests,4 tests,5 tests,6 tests
flau_small_c,"[54, 81, 158, 172, 182, 208, 243, 278, 285, 28...","[54, 81, 158, 243, 278, 285, 288, 310]","[54, 81, 278, 285, 288, 310]","[54, 285, 288, 310]","[54, 285, 310]","[54, 310]"
flau_base_u,"[11, 23, 27, 56, 77, 81, 96, 127, 133, 191, 23...","[11, 27, 56, 81, 133, 191, 316, 384, 402, 431,...","[11, 27, 384, 431, 538, 626, 687]","[27, 687]","[27, 687]",[]
flau_base_c,"[34, 77, 82, 117, 125, 127, 148, 176, 180, 191...","[125, 127, 176, 180, 238, 466, 509, 594, 672]","[176, 180, 238, 466, 594]","[176, 238, 466, 594]","[176, 238, 594]",[]
flau_large_c,"[103, 137, 185, 238, 243, 296, 352, 370, 373, ...","[238, 296, 370, 373, 505, 556, 576, 591, 641, ...","[238, 296, 505, 556, 576, 591, 641, 691, 703, ...","[238, 296, 556, 576, 591, 641, 691, 703, 1022]","[556, 576, 691]","[576, 691]"
cam_base,"[24, 41, 42, 57, 58, 100, 118, 119, 147, 227, ...","[24, 57, 100, 309, 374, 485, 714]",[374],[],[],[]


In [18]:
# Empty score table for the noun experiment: one column for all dimensions,
# one for the best single dimension, and one per "k tests" dimension set.
ari_scores_df = pd.DataFrame(
    columns=['All_dims', '1_dim_best'] + [f'{k}_test_dims' for k in range(1, 7)]
)

In [19]:
# Fill one row of clustering scores per model (noun experiment).
for i, model in enumerate(labels):
    dims_i, target_i = normalized_dims[i], feature_vectors[i]
    n_dims = len(all_res[0][i])
    row = {
        # Clustering on every dimension of the model.
        'All_dims': kmeans_multi_dim(dims_i, list(range(n_dims)), target_i),
        # First entry of the last test's (kmeans_1dim) results for this model;
        # presumably the score of the best single dimension — confirm.
        '1_dim_best': all_res[-1][i][0][1],
    }
    repeated = repeated_dimensions_df.loc[model]
    for k in range(1, 7):
        source_col = '1 test' if k == 1 else f'{k} tests'
        row[f'{k}_test_dims'] = kmeans_multi_dim(dims_i, repeated[source_col], target_i)
    ari_scores_df.loc[model] = row

In [20]:
# Render the per-model ARI score table for nouns.
ari_scores_df

Unnamed: 0,All_dims,1_dim_best,1_test_dims,2_test_dims,3_test_dims,4_test_dims,5_test_dims,6_test_dims
flau_small_c,0.004134,0.096621,0.151337,0.179701,0.258801,0.197642,0.15724,0.120284
flau_base_u,0.003068,0.018775,0.00015,0.002399,0.011314,0.023658,0.023658,
flau_base_c,0.000692,0.060955,0.014,0.114666,0.108591,0.125442,0.102629,
flau_large_c,0.018905,0.201408,0.646613,0.609155,0.534539,0.514189,0.316251,0.350003
cam_base,0.011367,0.011555,0.011367,0.011367,-0.009612,,,


# Adjectives

We can now repeat the experiment for Adjectives.

In [21]:
# Reset the per-model accumulators for the adjective experiment.
# NOTE(review): this rebinds the names used by the noun cells above, so those
# cells must not be re-run after this point without restarting from the top.
normalized_dims, feature_vectors = [], []

In [22]:
# Keep only adjectives with a binary `Number` value (drop 'invariable') for
# every model, then collect the prepared dimensions and feature vector.
for model_df in we_with_features:
    binary_adj_mask = (model_df.POS == 'ADJ') & (model_df.Number != 'invariable')
    dims, f_vec = prepare_dataset(
        model_df[binary_adj_mask],
        feature_col_count=feature_col_count,
        feature_name=feature,
    )
    normalized_dims.append(dims)
    feature_vectors.append(f_vec)

In [23]:
# Run every test on every model's adjective dataset, printing progress along the way.
all_res_adj = run_tests(
    tests,
    normalized_dims,
    feature_vectors,
    labels,
    report_progress=True,
)

Currently running: AABCC
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: SIG_PROPS
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: CORRELATION
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: LR
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: PERCEPTRON
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: KMEANS_1DIM
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done


In [24]:
# Summarise the distribution of the adjective test results (min, max, mean and
# percentiles) for each test/model pair; the last line renders the table.
report_adj_df = report(all_res_adj, tests, labels)
report_adj_df

Unnamed: 0,Min,Max,Mean,25%,50%,75%,95%,97.7%,99.9%
AABCC_flau_small_c,5205.0,8481.0,5867.908203,5658.5,5806.5,5975.75,6466.6,6667.82,7891.306
AABCC_flau_base_u,6242.0,7375.0,6773.348958,6652.75,6763.0,6880.5,7082.0,7155.154,7316.708
AABCC_flau_base_c,5307.0,7936.0,5801.888021,5653.75,5771.5,5906.5,6205.9,6343.872,7613.86
AABCC_flau_large_c,5276.0,20939.0,6030.698242,5688.75,5859.5,6085.75,7015.05,7684.448,15593.919
AABCC_cam_base,5902.0,9829.0,6942.095052,6624.0,6887.0,7195.25,7776.0,8108.129,9056.631
SIG_PROPS_flau_small_c,0.000271,0.112395,0.027404,0.011423,0.023514,0.039546,0.065579,0.075119,0.101266
SIG_PROPS_flau_base_u,1.3e-05,0.042709,0.008828,0.003423,0.007321,0.012847,0.02142,0.025335,0.036916
SIG_PROPS_flau_base_c,3.9e-05,0.100737,0.017824,0.006774,0.014787,0.025342,0.044941,0.049363,0.094704
SIG_PROPS_flau_large_c,5.9e-05,0.16405,0.03192,0.012833,0.027471,0.04543,0.076536,0.090656,0.156243
SIG_PROPS_cam_base,5.4e-05,0.092014,0.020008,0.007883,0.016227,0.027982,0.049259,0.061104,0.09102


In [25]:
# For every test and model, list the dimensions selected at the `percentile`
# threshold (adjective experiment); the last line renders the table.
dimensions_adj_df = dimensions_report(all_res_adj, tests, labels, percentile)
dimensions_adj_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base
AABCC,"[310, 360, 54, 81, 250, 477]","[629, 282, 608, 477, 268, 516, 186, 329]","[594, 238, 163, 180, 467, 199, 127, 23]","[576, 556, 296, 641, 1022, 691, 634, 161, 928,...","[567, 118, 419, 188, 39, 442, 267, 362]"
SIG_PROPS,"[310, 54, 250, 81, 477, 360]","[687, 127, 635, 81, 218, 145, 580, 221]","[238, 594, 180, 82, 117, 467, 199, 672]","[576, 691, 641, 556, 296, 1022, 161, 921, 783,...","[100, 272, 309, 459, 714, 3, 467, 153]"
CORRELATION,"[310, 54, 250, 360, 477, 384]","[687, 127, 635, 499, 81, 524, 221, 218]","[238, 594, 180, 467, 34, 163, 176, 253]","[576, 556, 641, 296, 691, 1022, 161, 634, 921,...","[233, 153, 459, 643, 3, 536, 602, 512]"
LR,"[310, 54, 359, 384, 250, 285]","[687, 706, 252, 763, 598, 635, 27, 97]","[36, 597, 507, 82, 292, 253, 594, 290]","[556, 1022, 638, 641, 576, 787, 250, 452, 237,...","[268, 459, 235, 369, 655, 587, 323, 482]"
PERCEPTRON,"[310, 54, 285, 445, 477, 172]","[687, 706, 580, 198, 11, 81, 221, 29]","[238, 176, 290, 127, 169, 594, 467, 509]","[556, 641, 576, 691, 1022, 161, 928, 787, 591,...","[173, 139, 436, 542, 38, 637, 435, 478]"
KMEANS_1DIM,"[310, 54, 250, 360, 477, 81]","[687, 127, 85, 499, 72, 700, 635, 615]","[238, 176, 127, 467, 675, 264, 495, 175]","[556, 576, 691, 1022, 641, 296, 161, 921, 591,...","[741, 211, 472, 709, 675, 169, 208, 615]"


In [26]:
# Group each model's selected adjective dimensions by the number of tests that
# selected them. Judging by the rendered output, column 'k tests' appears to
# hold the dimensions picked by at least k tests — confirm against
# repeated_dimensions. The last line renders the table.
repeated_dimensions_adj_df = repeated_dimensions(dimensions_adj_df, labels)
repeated_dimensions_adj_df

Unnamed: 0,1 test,2 tests,3 tests,4 tests,5 tests,6 tests
flau_small_c,"[54, 81, 172, 250, 285, 310, 359, 360, 384, 44...","[54, 81, 250, 285, 310, 360, 384, 477]","[54, 81, 250, 310, 360, 477]","[54, 250, 310, 360, 477]","[54, 250, 310, 477]","[54, 310]"
flau_base_u,"[11, 27, 29, 72, 81, 85, 97, 127, 145, 186, 19...","[81, 127, 218, 221, 499, 580, 635, 687, 706]","[81, 127, 221, 635, 687]","[635, 687]",[687],[]
flau_base_c,"[23, 34, 36, 82, 117, 127, 163, 169, 175, 176,...","[82, 127, 163, 176, 180, 199, 238, 253, 290, 4...","[127, 176, 180, 238, 467, 594]","[238, 467, 594]","[238, 467, 594]",[]
flau_large_c,"[147, 161, 237, 243, 250, 296, 330, 352, 452, ...","[161, 243, 296, 556, 576, 591, 634, 641, 691, ...","[161, 296, 556, 576, 591, 634, 641, 691, 783, ...","[161, 296, 556, 576, 641, 691, 1022]","[161, 556, 576, 641, 691, 1022]","[556, 576, 641, 1022]"
cam_base,"[3, 38, 39, 100, 118, 139, 153, 169, 173, 188,...","[3, 153, 459]",[459],[],[],[]


In [27]:
# Empty score table for the adjective experiment: one column for all
# dimensions, one for the best single dimension, and one per "k tests" set.
ari_scores_adj_df = pd.DataFrame(
    columns=['All_dims', '1_dim_best'] + [f'{k}_test_dims' for k in range(1, 7)]
)

In [28]:
# Fill one row of clustering scores per model for the adjective experiment.
# Every input read here belongs to the adjective run (`all_res_adj`,
# `repeated_dimensions_adj_df`, and the adjective `normalized_dims` /
# `feature_vectors` built above).
for i, model in enumerate(labels):
    dims_i, target_i = normalized_dims[i], feature_vectors[i]
    # Take the dimension count from the adjective results. The original read
    # `all_res` (the noun results), which made this cell silently depend on
    # the noun experiment having been run with matching shapes.
    n_dims = len(all_res_adj[0][i])
    repeated = repeated_dimensions_adj_df.loc[model]
    ari_scores_adj_df.loc[model] = {
        # Clustering on every dimension of the model.
        'All_dims': kmeans_multi_dim(dims_i, list(range(n_dims)), target_i),
        # First entry of the last test's (kmeans_1dim) adjective results for
        # this model; presumably the score of the best single dimension — confirm.
        '1_dim_best': all_res_adj[-1][i][0][1],
        # Clustering restricted to the dimensions listed in each "k tests" column.
        '1_test_dims': kmeans_multi_dim(dims_i, repeated['1 test'], target_i),
        '2_test_dims': kmeans_multi_dim(dims_i, repeated['2 tests'], target_i),
        '3_test_dims': kmeans_multi_dim(dims_i, repeated['3 tests'], target_i),
        '4_test_dims': kmeans_multi_dim(dims_i, repeated['4 tests'], target_i),
        '5_test_dims': kmeans_multi_dim(dims_i, repeated['5 tests'], target_i),
        '6_test_dims': kmeans_multi_dim(dims_i, repeated['6 tests'], target_i),
    }

In [29]:
# Render the per-model ARI score table for adjectives.
ari_scores_adj_df

Unnamed: 0,All_dims,1_dim_best,1_test_dims,2_test_dims,3_test_dims,4_test_dims,5_test_dims,6_test_dims
flau_small_c,0.002865,0.115765,0.408764,0.223059,0.379892,0.163161,0.218003,0.142799
flau_base_u,-0.00318,0.02733,0.024735,-0.003837,0.039666,0.030349,0.02733,
flau_base_c,0.002051,0.098172,0.016122,0.026081,0.127452,0.132977,0.132977,
flau_large_c,-0.002011,0.25373,0.693412,0.627352,0.627302,0.585543,0.589016,0.51171
cam_base,-0.026407,0.055561,-0.026407,-0.025258,-0.005929,,,


For all FlauBERT models, we see an improvement in clustering quality when using the dimensions that appear in the 99th percentile of test results. On FlauBERT-large, the ARI metric reaches 0.6+, which can be a sign of good clustering.

However, for CamemBERT the highest ARI scores are seen on one dimension only and they stay pretty low.