In [1]:
import sys
sys.path.append('../Util')

In [2]:
from evaluation import aabcc, sig_props, correlation, lr, perceptron, kmeans_1dim, \
                     score_comparison, run_tests, report, dimensions_report, repeated_dimensions, \
                    kmeans_multi_dim
from preparation import prepare_dataset, read_datasets

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns

In [4]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    }
    
]

In [5]:
labels = [m['label'] for m in models]

# Nouns

In [6]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [7]:
feature_col_count = 6
feature = 'POS'

In [8]:
normalized_dims = []
feature_vectors = []

In [9]:
for we in we_with_features:
    we_copy = we.copy()
    we_copy['POS'] = we_copy.POS.apply(lambda x: int(x == 'NOUN'))
    dims, f_vec = prepare_dataset(we_copy, feature_col_count=feature_col_count, feature_name=feature)
    normalized_dims.append(dims)
    feature_vectors.append(f_vec)

Now we will run our 6 tests (aabcc, sig-props, correlation, logistic regression weights, perceptron weights, Kmeans clustering on one dimension) for each of the models.

In [10]:
tests = [
    aabcc,
    sig_props,
    correlation,
    lr,
    perceptron,
    kmeans_1dim
]

In [11]:
all_res = run_tests(tests, normalized_dims, feature_vectors, labels, report_progress=True)

Currently running: AABCC
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: SIG_PROPS
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: CORRELATION
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: LR
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: PERCEPTRON
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: KMEANS_1DIM
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done


Now we can compare the distribution of test results for each test and each model:

In [12]:
report_df = report(all_res, tests, labels)
report_df

Unnamed: 0,Min,Max,Mean,25%,50%,75%,95%,97.7%,99.9%
AABCC_flau_small_c,31422.0,62224.0,34461.472656,32757.5,33478.0,34893.5,40001.8,44363.585,56543.213
AABCC_flau_base_u,42112.0,53989.0,44938.778646,44023.0,44736.5,45526.0,47476.5,48809.799,52685.1
AABCC_flau_base_c,31706.0,35133.0,32890.979167,32576.5,32829.5,33158.0,33844.6,34120.154,34758.704
AABCC_flau_large_c,31720.0,49187.0,34037.014648,32757.5,33335.0,34390.75,38543.7,40315.845,48337.384
AABCC_cam_base,21163.0,22948.0,21962.825521,21745.75,21955.0,22163.0,22464.55,22580.077,22884.339
SIG_PROPS_flau_small_c,0.000167,0.122715,0.029502,0.01208,0.025333,0.043028,0.073075,0.087666,0.116735
SIG_PROPS_flau_base_u,3.9e-05,0.089002,0.015234,0.005653,0.012029,0.021488,0.039344,0.047686,0.073597
SIG_PROPS_flau_base_c,5.3e-05,0.039942,0.01219,0.00491,0.010753,0.01772,0.029714,0.033731,0.038877
SIG_PROPS_flau_large_c,4.5e-05,0.093186,0.024106,0.009389,0.020202,0.034535,0.060578,0.06862,0.091916
SIG_PROPS_cam_base,3.1e-05,0.021332,0.00557,0.002352,0.004721,0.007926,0.013689,0.016404,0.021023


In [13]:
percentile = 99

In [14]:
dimensions_df = dimensions_report(all_res, tests, labels, percentile)
dimensions_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base
AABCC,"[159, 346, 31, 305, 212, 409]","[178, 405, 107, 565, 298, 217, 299, 390]","[234, 75, 87, 119, 80, 366, 382, 705]","[437, 57, 207, 458, 988, 800, 914, 831, 966, 4...","[574, 315, 436, 291, 517, 88, 69, 626]"
SIG_PROPS,"[159, 346, 212, 504, 480, 401]","[405, 153, 565, 626, 89, 720, 170, 178]","[55, 345, 608, 687, 50, 76, 367, 426]","[207, 988, 437, 458, 354, 800, 914, 966, 57, 6...","[602, 536, 217, 579, 176, 100, 243, 694]"
CORRELATION,"[159, 346, 305, 409, 212, 504]","[107, 299, 170, 390, 89, 178, 649, 217]","[234, 366, 76, 152, 705, 688, 87, 608]","[437, 458, 207, 988, 800, 57, 479, 354, 914, 5...","[48, 602, 436, 517, 695, 360, 176, 475]"
LR,"[485, 132, 275, 176, 378, 339]","[405, 203, 238, 763, 383, 159, 385, 499]","[47, 57, 650, 248, 345, 476, 572, 754]","[82, 614, 968, 525, 1016, 482, 642, 978, 363, ...","[525, 271, 602, 436, 697, 696, 206, 486]"
PERCEPTRON,"[378, 159, 409, 305, 303, 462]","[405, 383, 170, 299, 672, 390, 238, 435]","[454, 564, 417, 106, 162, 345, 87, 55]","[437, 88, 44, 988, 1016, 549, 914, 413, 961, 2...","[723, 517, 436, 573, 251, 271, 695, 726]"
KMEANS_1DIM,"[159, 346, 305, 480, 458, 212]","[565, 626, 89, 720, 299, 390, 298, 748]","[80, 87, 720, 38, 564, 417, 745, 366]","[437, 458, 57, 207, 988, 800, 956, 479, 694, 9...","[251, 174, 475, 250, 48, 320, 357, 166]"


In [15]:
repeated_dimensions_df = repeated_dimensions(dimensions_df, labels)

In [16]:
repeated_dimensions_df

Unnamed: 0,1 test,2 tests,3 tests,4 tests,5 tests,6 tests
flau_small_c,"[31, 132, 159, 176, 212, 275, 303, 305, 339, 3...","[159, 212, 305, 346, 378, 409, 480, 504]","[159, 212, 305, 346, 409]","[159, 212, 305, 346]",[159],[]
flau_base_u,"[89, 107, 153, 159, 170, 178, 203, 217, 238, 2...","[89, 107, 170, 178, 217, 238, 298, 299, 383, 3...","[89, 170, 178, 299, 390, 405, 565]","[299, 390, 405]",[],[]
flau_base_c,"[38, 47, 50, 55, 57, 75, 76, 80, 87, 106, 119,...","[55, 76, 80, 87, 234, 345, 366, 417, 564, 608,...","[87, 345, 366]",[87],[],[]
flau_large_c,"[44, 57, 82, 88, 207, 253, 354, 363, 413, 437,...","[44, 57, 207, 354, 437, 458, 479, 800, 914, 95...","[57, 207, 437, 458, 479, 800, 914, 966, 988]","[57, 207, 437, 458, 800, 914, 988]","[207, 437, 458, 988]",[]
cam_base,"[48, 69, 88, 100, 166, 174, 176, 206, 217, 243...","[48, 176, 251, 271, 436, 475, 517, 602, 695]","[436, 517, 602]",[436],[],[]


In [17]:
ari_scores_df = pd.DataFrame(columns = ['All_dims', '1_dim_best', 
                                        '1_test_dims', '2_test_dims',
                                        '3_test_dims', '4_test_dims',
                                        '5_test_dims', '6_test_dims'
                                       ])

In [18]:
for i in range(len(labels)):
    model = labels[i]
    n_dims = len(all_res[0][i])
    ari_scores_df.loc[model] = {
        'All_dims' : kmeans_multi_dim(normalized_dims[i], list(range(n_dims)), feature_vectors[i]), 
        '1_dim_best': all_res[-1][i][0][1], 
        '1_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['1 test'], feature_vectors[i]), 
        '2_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['2 tests'], feature_vectors[i]),
        '3_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['3 tests'], feature_vectors[i]),
        '4_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['4 tests'], feature_vectors[i]),
        '5_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['5 tests'], feature_vectors[i]),
        '6_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_df.loc[model]['6 tests'], feature_vectors[i]),
    }

In [19]:
ari_scores_df

Unnamed: 0,All_dims,1_dim_best,1_test_dims,2_test_dims,3_test_dims,4_test_dims,5_test_dims,6_test_dims
flau_small_c,0.204495,0.137337,0.353523,0.317189,0.267117,0.240862,0.137337,
flau_base_u,0.058136,0.052776,0.056432,0.056031,0.055691,0.019856,,
flau_base_c,0.003036,0.018629,0.007435,0.006505,0.00651,0.01471,,
flau_large_c,0.218357,0.103452,0.405854,0.347859,0.3571,0.314346,0.251932,
cam_base,-0.004846,0.012438,-0.004846,-0.003285,-0.000429,-0.002846,,


# Adjectives

We can now repeat the experiment for Adjectives.

In [20]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [21]:
normalized_dims = []
feature_vectors = []

In [22]:
for we in we_with_features:
    we_copy = we.copy()
    we_copy['POS'] = we_copy.POS.apply(lambda x: int(x == 'ADJ'))
    dims, f_vec = prepare_dataset(we_copy, feature_col_count=feature_col_count, feature_name=feature)
    normalized_dims.append(dims)
    feature_vectors.append(f_vec)

In [23]:
all_res_adj = run_tests(tests, normalized_dims, feature_vectors, labels, report_progress=True)

Currently running: AABCC
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: SIG_PROPS
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: CORRELATION
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: LR
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: PERCEPTRON
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: KMEANS_1DIM
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done


In [24]:
report_adj_df = report(all_res_adj, tests, labels)
report_adj_df

Unnamed: 0,Min,Max,Mean,25%,50%,75%,95%,97.7%,99.9%
AABCC_flau_small_c,67517.0,116682.0,74219.800781,70414.75,72196.0,76050.0,85736.35,91324.681,109479.455
AABCC_flau_base_u,86652.0,102300.0,91842.483073,90369.25,91592.0,93039.5,95614.5,97137.745,102173.445
AABCC_flau_base_c,66346.0,75657.0,70441.923177,69441.5,70422.0,71351.5,72812.8,73520.361,75584.902
AABCC_flau_large_c,66222.0,148059.0,74493.316406,70467.5,72306.0,75965.75,88761.95,94135.615,115856.3
AABCC_cam_base,46417.0,54038.0,49939.473958,49083.0,49961.0,50756.25,51921.25,52642.436,53715.86
SIG_PROPS_flau_small_c,9e-06,0.091713,0.02646,0.011059,0.020236,0.03912,0.06564,0.072256,0.090863
SIG_PROPS_flau_base_u,1.2e-05,0.047029,0.01139,0.004669,0.009695,0.015912,0.02807,0.031708,0.044465
SIG_PROPS_flau_base_c,1e-06,0.026337,0.007266,0.002938,0.006483,0.010718,0.017617,0.020575,0.026227
SIG_PROPS_flau_large_c,5e-05,0.105988,0.023894,0.008887,0.019882,0.034717,0.058813,0.068721,0.10247
SIG_PROPS_cam_base,9e-06,0.061796,0.011837,0.004805,0.009778,0.016549,0.030752,0.036127,0.060482


In [25]:
dimensions_adj_df = dimensions_report(all_res_adj, tests, labels, percentile)
dimensions_adj_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base
AABCC,"[478, 50, 100, 250, 387, 439]","[203, 423, 719, 671, 247, 526, 172, 320]","[260, 191, 407, 603, 349, 336, 498, 363]","[741, 458, 890, 92, 436, 617, 953, 346, 797, 6...","[341, 559, 68, 633, 97, 194, 620, 758]"
SIG_PROPS,"[478, 158, 464, 224, 220, 281]","[320, 746, 565, 153, 43, 572, 172, 449]","[581, 185, 336, 301, 727, 485, 571, 331]","[741, 436, 890, 948, 92, 354, 458, 617, 1005, ...","[100, 714, 309, 119, 272, 467, 430, 3]"
CORRELATION,"[478, 464, 220, 50, 250, 439]","[320, 746, 43, 423, 449, 719, 203, 275]","[581, 234, 485, 336, 571, 330, 77, 185]","[741, 436, 652, 92, 458, 792, 948, 890, 1005, ...","[473, 356, 309, 764, 119, 125, 714, 3]"
LR,"[100, 510, 260, 138, 485, 315]","[391, 495, 91, 379, 203, 159, 691, 752]","[724, 754, 492, 273, 27, 670, 178, 485]","[82, 80, 298, 978, 292, 423, 196, 304, 1006, 9...","[622, 674, 529, 323, 750, 683, 187, 437]"
PERCEPTRON,"[451, 25, 136, 21, 174, 259]","[327, 216, 703, 272, 158, 305, 56, 255]","[108, 570, 599, 188, 46, 513, 652, 190]","[286, 100, 6, 461, 991, 848, 497, 709, 295, 50...","[470, 297, 84, 623, 36, 303, 743, 587]"
KMEANS_1DIM,"[478, 250, 464, 220, 158, 314]","[565, 423, 719, 526, 203, 752, 643, 374]","[351, 555, 557, 571, 764, 491, 431, 567]","[741, 436, 346, 877, 953, 618, 792, 787, 1005,...","[119, 100, 714, 309, 52, 272, 717, 467]"


In [26]:
repeated_dimensions_adj_df = repeated_dimensions(dimensions_adj_df, labels)
repeated_dimensions_adj_df

Unnamed: 0,1 test,2 tests,3 tests,4 tests,5 tests,6 tests
flau_small_c,"[21, 25, 50, 100, 136, 138, 158, 174, 220, 224...","[50, 100, 158, 220, 250, 439, 464, 478]","[220, 250, 464, 478]",[478],[],[]
flau_base_u,"[43, 56, 91, 153, 158, 159, 172, 203, 216, 247...","[43, 172, 203, 320, 423, 449, 526, 565, 719, 7...","[203, 320, 423, 719]",[203],[],[]
flau_base_c,"[27, 46, 77, 108, 178, 185, 188, 190, 191, 234...","[185, 336, 485, 571, 581]","[336, 485, 571]",[],[],[]
flau_large_c,"[6, 50, 80, 82, 92, 100, 196, 286, 292, 295, 2...","[92, 346, 436, 458, 617, 652, 685, 741, 787, 7...","[92, 436, 458, 741, 890, 1005]","[436, 458, 741]",[],[]
cam_base,"[3, 36, 52, 68, 84, 97, 100, 119, 125, 187, 19...","[3, 100, 119, 272, 309, 467, 714]","[119, 309, 714]",[],[],[]


In [27]:
ari_scores_adj_df = pd.DataFrame(columns = ['All_dims', '1_dim_best', 
                                        '1_test_dims', '2_test_dims',
                                        '3_test_dims', '4_test_dims',
                                        '5_test_dims', '6_test_dims'
                                       ])

In [28]:
for i in range(len(labels)):
    model = labels[i]
    n_dims = len(all_res[0][i])
    ari_scores_adj_df.loc[model] = {
        'All_dims' : kmeans_multi_dim(normalized_dims[i], list(range(n_dims)), feature_vectors[i]), 
        '1_dim_best': all_res_adj[-1][i][0][1], 
        '1_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['1 test'], feature_vectors[i]), 
        '2_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['2 tests'], feature_vectors[i]),
        '3_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['3 tests'], feature_vectors[i]),
        '4_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['4 tests'], feature_vectors[i]),
        '5_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['5 tests'], feature_vectors[i]),
        '6_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_adj_df.loc[model]['6 tests'], feature_vectors[i]),
    }

In [29]:
ari_scores_adj_df

Unnamed: 0,All_dims,1_dim_best,1_test_dims,2_test_dims,3_test_dims,4_test_dims,5_test_dims,6_test_dims
flau_small_c,0.002331,0.047637,0.229939,0.143326,0.152073,0.047637,,
flau_base_u,0.042316,0.041566,0.041396,0.043724,-0.01293,0.01996,,
flau_base_c,-0.001864,0.016548,-0.00076,-0.007756,-0.003591,,,
flau_large_c,0.008652,0.063562,0.229324,0.217511,0.154221,0.118495,,
cam_base,0.038898,0.039739,0.038898,0.038898,0.038898,,,


# Verb

In [30]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [31]:
normalized_dims = []
feature_vectors = []

In [32]:
for we in we_with_features:
    we_copy = we.copy()
    we_copy['POS'] = we_copy.POS.apply(lambda x: int(x == 'VERB'))
    dims, f_vec = prepare_dataset(we_copy, feature_col_count=feature_col_count, feature_name=feature)
    normalized_dims.append(dims)
    feature_vectors.append(f_vec)

In [33]:
all_res_verb = run_tests(tests, normalized_dims, feature_vectors, labels, report_progress=True)

Currently running: AABCC
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: SIG_PROPS
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: CORRELATION
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: LR
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: PERCEPTRON
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done
Currently running: KMEANS_1DIM
		Model: flau_small_c
		Model: flau_base_u
		Model: flau_base_c
		Model: flau_large_c
		Model: cam_base
Done


In [34]:
report_verb_df = report(all_res_verb, tests, labels)
report_verb_df

Unnamed: 0,Min,Max,Mean,25%,50%,75%,95%,97.7%,99.9%
AABCC_flau_small_c,56618.0,432205.0,73529.255859,61191.25,66039.0,73983.5,106147.85,145883.126,349786.832
AABCC_flau_base_u,77283.0,119847.0,84454.726562,81676.0,83441.5,85818.25,91737.45,96849.361,118383.564
AABCC_flau_base_c,55855.0,76388.0,60633.950521,59266.0,60272.5,61502.0,64367.65,65500.513,74314.032
AABCC_flau_large_c,56563.0,184790.0,66699.136719,60450.75,62853.0,68127.25,85199.5,100355.377,159673.152
AABCC_cam_base,40245.0,46731.0,43079.81901,42316.0,43040.5,43796.25,45008.95,45518.925,46550.755
SIG_PROPS_flau_small_c,6e-05,0.15667,0.036321,0.014243,0.030042,0.049572,0.091745,0.108142,0.155204
SIG_PROPS_flau_base_u,4.4e-05,0.101841,0.019183,0.008115,0.016329,0.026877,0.049064,0.056502,0.088556
SIG_PROPS_flau_base_c,3e-05,0.063043,0.01498,0.00621,0.013132,0.021634,0.036272,0.040554,0.05915
SIG_PROPS_flau_large_c,1.5e-05,0.114607,0.027534,0.011064,0.023146,0.039744,0.067482,0.078372,0.105655
SIG_PROPS_cam_base,1.6e-05,0.082911,0.015429,0.00585,0.012136,0.02202,0.039316,0.047554,0.080253


In [35]:
dimensions_verb_df = dimensions_report(all_res_verb, tests, labels, percentile)
dimensions_verb_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base
AABCC,"[480, 182, 401, 159, 310, 192]","[299, 107, 390, 649, 217, 383, 224, 121]","[87, 705, 80, 688, 285, 152, 472, 443]","[800, 473, 925, 172, 932, 437, 207, 793, 988, ...","[185, 433, 589, 303, 438, 334, 102, 405]"
SIG_PROPS,"[310, 480, 401, 159, 504, 89]","[405, 89, 720, 626, 390, 121, 383, 748]","[50, 688, 705, 55, 152, 472, 687, 87]","[932, 925, 886, 800, 927, 793, 343, 479, 988, ...","[100, 714, 309, 467, 217, 579, 272, 430]"
CORRELATION,"[310, 480, 401, 159, 89, 504]","[390, 299, 107, 89, 748, 649, 217, 121]","[688, 87, 705, 152, 80, 472, 50, 76]","[932, 925, 800, 479, 853, 333, 927, 793, 886, ...","[243, 694, 217, 486, 53, 579, 740, 524]"
LR,"[378, 56, 434, 192, 154, 432]","[405, 146, 565, 726, 170, 763, 465, 476]","[492, 57, 248, 294, 650, 136, 451, 403]","[777, 793, 490, 803, 591, 249, 909, 398, 846, ...","[285, 546, 436, 549, 90, 575, 238, 754]"
PERCEPTRON,"[192, 480, 89, 318, 56, 504]","[533, 341, 417, 523, 483, 659, 402, 238]","[169, 164, 723, 753, 346, 107, 157, 3]","[89, 848, 817, 296, 475, 423, 36, 465, 62, 204...","[266, 523, 135, 501, 741, 755, 451, 183]"
KMEANS_1DIM,"[310, 480, 89, 159, 401, 29]","[565, 390, 299, 383, 720, 626, 744, 589]","[417, 412, 87, 80, 481, 385, 66, 324]","[932, 886, 479, 437, 2, 800, 253, 605, 925, 66...","[357, 174, 21, 224, 75, 475, 320, 251]"


In [36]:
repeated_dimensions_verb_df = repeated_dimensions(dimensions_adj_df, labels)
repeated_dimensions_verb_df

Unnamed: 0,1 test,2 tests,3 tests,4 tests,5 tests,6 tests
flau_small_c,"[21, 25, 50, 100, 136, 138, 158, 174, 220, 224...","[50, 100, 158, 220, 250, 439, 464, 478]","[220, 250, 464, 478]",[478],[],[]
flau_base_u,"[43, 56, 91, 153, 158, 159, 172, 203, 216, 247...","[43, 172, 203, 320, 423, 449, 526, 565, 719, 7...","[203, 320, 423, 719]",[203],[],[]
flau_base_c,"[27, 46, 77, 108, 178, 185, 188, 190, 191, 234...","[185, 336, 485, 571, 581]","[336, 485, 571]",[],[],[]
flau_large_c,"[6, 50, 80, 82, 92, 100, 196, 286, 292, 295, 2...","[92, 346, 436, 458, 617, 652, 685, 741, 787, 7...","[92, 436, 458, 741, 890, 1005]","[436, 458, 741]",[],[]
cam_base,"[3, 36, 52, 68, 84, 97, 100, 119, 125, 187, 19...","[3, 100, 119, 272, 309, 467, 714]","[119, 309, 714]",[],[],[]


In [37]:
ari_scores_verb_df = pd.DataFrame(columns = ['All_dims', '1_dim_best', 
                                        '1_test_dims', '2_test_dims',
                                        '3_test_dims', '4_test_dims',
                                        '5_test_dims', '6_test_dims'
                                       ])

In [38]:
for i in range(len(labels)):
    model = labels[i]
    n_dims = len(all_res[0][i])
    ari_scores_verb_df.loc[model] = {
        'All_dims' : kmeans_multi_dim(normalized_dims[i], list(range(n_dims)), feature_vectors[i]), 
        '1_dim_best': all_res_verb[-1][i][0][1], 
        '1_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['1 test'], feature_vectors[i]), 
        '2_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['2 tests'], feature_vectors[i]),
        '3_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['3 tests'], feature_vectors[i]),
        '4_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['4 tests'], feature_vectors[i]),
        '5_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['5 tests'], feature_vectors[i]),
        '6_test_dims': kmeans_multi_dim(normalized_dims[i], repeated_dimensions_verb_df.loc[model]['6 tests'], feature_vectors[i]),
    }

In [39]:
ari_scores_verb_df

Unnamed: 0,All_dims,1_dim_best,1_test_dims,2_test_dims,3_test_dims,4_test_dims,5_test_dims,6_test_dims
flau_small_c,0.216043,0.134604,-0.017419,0.009879,-0.010338,0.000523,,
flau_base_u,0.092836,0.078872,0.083638,0.080721,3.9e-05,-0.015872,,
flau_base_c,-0.003136,0.022624,-0.00014,-0.005372,-0.003969,,,
flau_large_c,0.080811,0.061557,-0.010414,-0.013342,-0.007812,-0.005329,,
cam_base,-0.039838,0.037875,-0.039838,-0.039838,-0.039838,,,
