# KMeans to test multiple dimensions at a time (number)

## 0. Data loading: noun and adjective datasets

In [1]:
import pandas as pd

# Load the noun and adjective embedding tables (WE). The first CSV column
# becomes the index; the "gender" column is dropped right after loading.
all_n_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0)
all_n_we = all_n_we.drop(columns="gender")

all_a_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0)
all_a_we = all_a_we.drop(columns="gender")

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

## Nouns only

In [3]:
# Encode the "number" column as a binary label: 1 for "singular", 0 for any other value.
all_n_we['number'] = all_n_we['number'].eq("singular").astype("int64")

# Min-max scale every column (the binary label included) to the [0, 1] range.
noun_min = all_n_we.min()
noun_range = all_n_we.max() - noun_min
normalized_noun = (all_n_we - noun_min) / noun_range

In [34]:
# Score every dimension on its own: split its values into two KMeans clusters
# and measure how well that split agrees with the "number" labels.
dim_list_1 = [str(d) for d in range(512)]  # column names of all dimensions
number_labels = normalized_noun["number"]

ari = []
for column in dim_list_1:
    # KMeans needs a 2-D input, so the single column is reshaped to (n_samples, 1)
    features = normalized_noun[column].to_numpy().reshape(-1, 1)
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(features)

    # Adjusted Rand Index: the closer to 1, the better
    ari.append(adjusted_rand_score(number_labels, km.labels_))

# One row per dimension, indexed by the dimension's column name
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [35]:
# Rank all dimensions by ARI, best first. `top10_1` holds the full ranking;
# only its first ten rows are kept as names and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.head(10)

Unnamed: 0,ARI
310,0.113546
54,0.071276
285,0.056902
288,0.052711
278,0.051007
81,0.049033
243,0.042733
359,0.042109
172,0.038981
182,0.036312


In [36]:
# Append the ten best-ranked dimensions to the list already stored on disk.
# NOTE: the file is read, extended and rewritten in place, so running this
# cell again appends the same ten entries a second time.
saved_dims = pd.read_csv('../Data/Dimensions/GN/noun.csv', index_col=0)
w1 = list(saved_dims.iloc[:, 0].values)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/noun.csv')

In [4]:
# Every unordered pair of dimensions (i < j), each as a two-element list of column names
dim_list_2 = [[str(i), str(j)] for i in range(512) for j in range(i + 1, 512)]
n_pairs = len(dim_list_2)
report_every = n_pairs // 10  # progress is printed about every 10 % of the pairs
number_labels = normalized_noun["number"]

ari = []
for idx, pair in enumerate(dim_list_2):
    # Cluster the rows using only the two columns of the current pair
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(normalized_noun[pair])

    # Adjusted Rand Index against the "number" labels: the closer to 1, the better
    ari.append(adjusted_rand_score(number_labels, km.labels_))

    if idx % report_every == 0:
        print("Progress: ", idx / n_pairs * 100, "%")

# One row per pair, indexed by the string form of the pair, e.g. "['278', '310']"
df_clustering_2 = pd.DataFrame({"ARI": ari}, index=list(map(str, dim_list_2)))

Progress:  0.0 %
Progress:  9.999541340508806 %
Progress:  19.99908268101761 %
Progress:  29.99862402152642 %
Progress:  39.99816536203522 %
Progress:  49.99770670254403 %
Progress:  59.99724804305284 %
Progress:  69.99678938356165 %
Progress:  79.99633072407045 %
Progress:  89.99587206457925 %
Progress:  99.99541340508806 %


In [5]:
# Rank all dimension pairs by ARI, best first, and display the ten best.
# `top10_2` itself holds the full ranking, not only ten rows.
top10_2 = df_clustering_2.sort_values("ARI", ascending=False)
top10_2.head(10)

Unnamed: 0,ARI
"['278', '310']",0.163295
"['285', '310']",0.160528
"['310', '359']",0.144767
"['136', '310']",0.139973
"['246', '310']",0.139702
"['54', '310']",0.138497
"['81', '310']",0.138448
"['205', '310']",0.134706
"['25', '310']",0.130391
"['259', '310']",0.129862


## Adjectives only

In [6]:
# Encode the "number" column as a binary label: 1 for "singular", 0 for any other value.
all_a_we['number'] = all_a_we['number'].eq("singular").astype("int64")

# Min-max scale every column (the binary label included) to the [0, 1] range.
adj_min = all_a_we.min()
adj_range = all_a_we.max() - adj_min
normalized_adj = (all_a_we - adj_min) / adj_range

In [38]:
# Score every dimension on its own: split its values into two KMeans clusters
# and measure how well that split agrees with the "number" labels.
dim_list_1 = [str(d) for d in range(512)]  # column names of all dimensions
number_labels = normalized_adj["number"]

ari = []
for column in dim_list_1:
    # KMeans needs a 2-D input, so the single column is reshaped to (n_samples, 1)
    features = normalized_adj[column].to_numpy().reshape(-1, 1)
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(features)

    # Adjusted Rand Index: the closer to 1, the better
    ari.append(adjusted_rand_score(number_labels, km.labels_))

# One row per dimension, indexed by the dimension's column name
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [39]:
# Rank all dimensions by ARI, best first. `top10_1` holds the full ranking;
# only its first ten rows are kept as names and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.head(10)

Unnamed: 0,ARI
310,0.112598
54,0.074463
285,0.061454
278,0.056826
81,0.055338
288,0.051972
25,0.048156
455,0.045349
56,0.044883
495,0.044811


In [40]:
# Append the ten best-ranked dimensions to the list already stored on disk.
# NOTE: the file is read, extended and rewritten in place, so running this
# cell again appends the same ten entries a second time.
saved_dims = pd.read_csv('../Data/Dimensions/GN/adj.csv', index_col=0)
w1 = list(saved_dims.iloc[:, 0].values)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/adj.csv')

In [7]:
# Every unordered pair of dimensions (i < j), each as a two-element list of column names
dim_list_2 = [[str(i), str(j)] for i in range(512) for j in range(i + 1, 512)]
n_pairs = len(dim_list_2)
report_every = n_pairs // 10  # progress is printed about every 10 % of the pairs
number_labels = normalized_adj["number"]

ari = []
for idx, pair in enumerate(dim_list_2):
    # Cluster the rows using only the two columns of the current pair
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(normalized_adj[pair])

    # Adjusted Rand Index against the "number" labels: the closer to 1, the better
    ari.append(adjusted_rand_score(number_labels, km.labels_))

    if idx % report_every == 0:
        print("Progress: ", idx / n_pairs * 100, "%")

# One row per pair, indexed by the string form of the pair, e.g. "['285', '310']"
df_clustering_2 = pd.DataFrame({"ARI": ari}, index=list(map(str, dim_list_2)))

Progress:  0.0 %
Progress:  9.999541340508806 %
Progress:  19.99908268101761 %
Progress:  29.99862402152642 %
Progress:  39.99816536203522 %
Progress:  49.99770670254403 %
Progress:  59.99724804305284 %
Progress:  69.99678938356165 %
Progress:  79.99633072407045 %
Progress:  89.99587206457925 %
Progress:  99.99541340508806 %


In [8]:
# Rank all dimension pairs by ARI, best first, and display the ten best.
# `top10_2` itself holds the full ranking, not only ten rows.
top10_2 = df_clustering_2.sort_values("ARI", ascending=False)
top10_2.head(10)

Unnamed: 0,ARI
"['285', '310']",0.179664
"['278', '310']",0.179162
"['54', '310']",0.178354
"['310', '455']",0.166271
"['25', '310']",0.165012
"['306', '310']",0.15991
"['136', '310']",0.158633
"['310', '470']",0.155161
"['191', '310']",0.15415
"['175', '310']",0.1463


## Nouns and adjectives together

In [41]:
# Stack the adjective rows on top of the noun rows. Each frame was min-max
# scaled on its own beforehand, and the original index labels are kept.
normalized_both = pd.concat([normalized_adj, normalized_noun], axis=0)

In [42]:
# Score every dimension on its own: split its values into two KMeans clusters
# and measure how well that split agrees with the "number" labels.
dim_list_1 = [str(d) for d in range(512)]  # column names of all dimensions
number_labels = normalized_both["number"]

ari = []
for column in dim_list_1:
    # KMeans needs a 2-D input, so the single column is reshaped to (n_samples, 1)
    features = normalized_both[column].to_numpy().reshape(-1, 1)
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(features)

    # Adjusted Rand Index: the closer to 1, the better
    ari.append(adjusted_rand_score(number_labels, km.labels_))

# One row per dimension, indexed by the dimension's column name
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [43]:
# Rank all dimensions by ARI, best first. `top10_1` holds the full ranking;
# only its first ten rows are kept as names and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.head(10)

Unnamed: 0,ARI
310,0.117566
54,0.068252
285,0.055097
278,0.048557
81,0.046628
288,0.043262
25,0.04169
495,0.041609
172,0.037295
359,0.034606


In [44]:
# Append the ten best-ranked dimensions to the list already stored on disk.
# NOTE: the file is read, extended and rewritten in place, so running this
# cell again appends the same ten entries a second time.
saved_dims = pd.read_csv('../Data/Dimensions/GN/both.csv', index_col=0)
w1 = list(saved_dims.iloc[:, 0].values)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/both.csv')