# KMeans to test multiple dimensions at a time (gender)

## 0. Data loading: noun and adjective embedding datasets

In [1]:
import pandas as pd

# Load the noun and adjective embedding tables (WE files from the FlauBERT_WE
# folder). The first CSV column becomes the index and the "number" column is
# discarded, since only gender is studied in this notebook.
all_n_we = (
    pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0)
    .drop(columns=["number"])
)
all_a_we = (
    pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0)
    .drop(columns=["number"])
)

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

## Nouns only 

In [3]:
# Encode gender as a binary label: 1 for "masculine", 0 for any other value.
# The dtype guard keeps this cell safe to re-run: once the column is numeric,
# comparing it with "masculine" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(all_n_we["gender"]):
    all_n_we["gender"] = (all_n_we["gender"] == "masculine").astype("int64")

# Min-max scale every column to [0, 1]. The 0/1 gender column is unchanged by
# this as long as both labels occur in the data.
# NOTE(review): a constant column would give max - min == 0 and produce NaN
# values — confirm that no embedding dimension is constant.
normalized_noun = (all_n_we - all_n_we.min()) / (all_n_we.max() - all_n_we.min())

In [35]:
# Score each embedding dimension on its own: run a 2-cluster KMeans on that
# single column of the nouns and compare the clusters with the gender labels.
dim_list_1 = [str(idx) for idx in range(512)]  # column names "0" .. "511"
ari = []

for dim in dim_list_1:
    # KMeans expects a 2-D array, so the single column is reshaped to (n, 1).
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
        normalized_noun[dim].to_numpy().reshape(-1, 1)
    )

    # Adjusted Rand Index between clusters and gender: the closer to 1, the better.
    ari.append(adjusted_rand_score(normalized_noun["gender"], km.labels_))

# One row per dimension, indexed by the dimension name.
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [36]:
# Rank the dimensions from highest to lowest ARI. Despite its name, `top10_1`
# holds the full ranking; only its first ten rows are extracted and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.iloc[:10]

Unnamed: 0,ARI
100,0.028524
195,0.023761
316,0.021304
245,0.020889
507,0.019404
192,0.017843
403,0.017202
121,0.016885
377,0.015596
202,0.015122


In [37]:
# Append the ten best noun dimensions to the list already stored in
# GG/noun.csv and write the extended list back to the same file.
# NOTE: the file is read, extended and overwritten in place, so running this
# cell again appends the same ten dimensions a second time.
w1 = list(
    pd.read_csv('../Data/Dimensions/GG/noun.csv', index_col=0).iloc[:, 0].values
)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/noun.csv')

In [9]:
# Score every unordered pair of distinct dimensions (i < j): run a 2-cluster
# KMeans on the two columns of the nouns and compare the clusters with gender.
dim_list_2 = [[str(i), str(j)] for i in range(512) for j in range(i + 1, 512)]
print("Total number of combinations: ", len(dim_list_2))
ari = []

for i, dim in enumerate(dim_list_2):
    # `dim` is a list of two column names, so the selection is already 2-D.
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(normalized_noun[dim])

    # Adjusted Rand Index between clusters and gender: the closer to 1, the better.
    ari.append(adjusted_rand_score(normalized_noun["gender"], km.labels_))

    # Report progress about every tenth of the way through the pairs.
    if i % (len(dim_list_2) // 10) == 0:
        print("Progress: ", i/len(dim_list_2)*100, "%")

# One row per pair, indexed by the string form of the pair, e.g. "['0', '1']".
df_clustering_2 = pd.DataFrame(
    {"ARI": ari}, index=[str(pair) for pair in dim_list_2]
)

Total number of combinations:  130816
Progress:  0.0 %
Progress:  9.999541340508806 %
Progress:  19.99908268101761 %
Progress:  29.99862402152642 %
Progress:  39.99816536203522 %
Progress:  49.99770670254403 %
Progress:  59.99724804305284 %
Progress:  69.99678938356165 %
Progress:  79.99633072407045 %
Progress:  89.99587206457925 %
Progress:  99.99541340508806 %


In [10]:
# Rank the dimension pairs from highest to lowest ARI and display the ten best.
# `top10_2` itself keeps the full ranking.
top10_2 = df_clustering_2.sort_values("ARI", ascending=False)
top10_2.iloc[:10]

Unnamed: 0,ARI
"['100', '245']",0.046012
"['195', '316']",0.039982
"['100', '316']",0.039428
"['195', '507']",0.038977
"['100', '195']",0.037987
"['245', '316']",0.036835
"['192', '195']",0.036505
"['100', '121']",0.036504
"['100', '377']",0.036421
"['121', '316']",0.036322


## Adjectives only

In [11]:
# Encode gender as a binary label: 1 for "masculine", 0 for any other value.
# The dtype guard keeps this cell safe to re-run: once the column is numeric,
# comparing it with "masculine" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(all_a_we["gender"]):
    all_a_we["gender"] = (all_a_we["gender"] == "masculine").astype("int64")

# Min-max scale every column to [0, 1]. The 0/1 gender column is unchanged by
# this as long as both labels occur in the data.
# NOTE(review): a constant column would give max - min == 0 and produce NaN
# values — confirm that no embedding dimension is constant.
normalized_adj = (all_a_we - all_a_we.min()) / (all_a_we.max() - all_a_we.min())

In [39]:
# Score each embedding dimension on its own: run a 2-cluster KMeans on that
# single column of the adjectives and compare the clusters with the gender labels.
dim_list_1 = [str(idx) for idx in range(512)]  # column names "0" .. "511"
ari = []

for dim in dim_list_1:
    # KMeans expects a 2-D array, so the single column is reshaped to (n, 1).
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
        normalized_adj[dim].to_numpy().reshape(-1, 1)
    )

    # Adjusted Rand Index between clusters and gender: the closer to 1, the better.
    ari.append(adjusted_rand_score(normalized_adj["gender"], km.labels_))

# One row per dimension, indexed by the dimension name.
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [40]:
# Rank the dimensions from highest to lowest ARI. Despite its name, `top10_1`
# holds the full ranking; only its first ten rows are extracted and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.iloc[:10]

Unnamed: 0,ARI
466,0.077949
503,0.070347
250,0.065059
133,0.06114
439,0.058512
38,0.053623
39,0.049656
234,0.049046
432,0.046831
181,0.045418


In [41]:
# Append the ten best adjective dimensions to the list already stored in
# GG/adj.csv and write the extended list back to the same file.
# NOTE: the file is read, extended and overwritten in place, so running this
# cell again appends the same ten dimensions a second time.
w1 = list(
    pd.read_csv('../Data/Dimensions/GG/adj.csv', index_col=0).iloc[:, 0].values
)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/adj.csv')

In [12]:
# Score every unordered pair of distinct dimensions (i < j): run a 2-cluster
# KMeans on the two columns of the adjectives and compare the clusters with gender.
dim_list_2 = [[str(i), str(j)] for i in range(512) for j in range(i + 1, 512)]
print("Total number of combinations: ", len(dim_list_2))
ari = []

for i, dim in enumerate(dim_list_2):
    # `dim` is a list of two column names, so the selection is already 2-D.
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(normalized_adj[dim])

    # Adjusted Rand Index between clusters and gender: the closer to 1, the better.
    ari.append(adjusted_rand_score(normalized_adj["gender"], km.labels_))

    # Report progress about every tenth of the way through the pairs.
    if i % (len(dim_list_2) // 10) == 0:
        print("Progress: ", i/len(dim_list_2)*100, "%")

# One row per pair, indexed by the string form of the pair, e.g. "['0', '1']".
df_clustering_2 = pd.DataFrame(
    {"ARI": ari}, index=[str(pair) for pair in dim_list_2]
)

Total number of combinations:  130816
Progress:  0.0 %
Progress:  9.999541340508806 %
Progress:  19.99908268101761 %
Progress:  29.99862402152642 %
Progress:  39.99816536203522 %
Progress:  49.99770670254403 %
Progress:  59.99724804305284 %
Progress:  69.99678938356165 %
Progress:  79.99633072407045 %
Progress:  89.99587206457925 %
Progress:  99.99541340508806 %


In [13]:
# Rank the dimension pairs from highest to lowest ARI and display the ten best.
# `top10_2` itself keeps the full ranking.
top10_2 = df_clustering_2.sort_values("ARI", ascending=False)
top10_2.iloc[:10]

Unnamed: 0,ARI
"['245', '466']",0.131638
"['439', '466']",0.122705
"['121', '466']",0.116324
"['250', '439']",0.11522
"['250', '466']",0.11497
"['466', '503']",0.114443
"['234', '466']",0.111678
"['206', '466']",0.111366
"['250', '503']",0.108908
"['260', '466']",0.108636


## Nouns and adjectives together

In [42]:
# Stack the adjective rows on top of the noun rows, keeping their original
# index labels. Each frame was min-max scaled separately before being combined.
normalized_both = pd.concat([normalized_adj, normalized_noun], axis=0)

In [43]:
# Score each embedding dimension on its own: run a 2-cluster KMeans on that
# single column of the combined adjective + noun data and compare the clusters
# with the gender labels.
dim_list_1 = [str(idx) for idx in range(512)]  # column names "0" .. "511"
ari = []

for dim in dim_list_1:
    # KMeans expects a 2-D array, so the single column is reshaped to (n, 1).
    km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
        normalized_both[dim].to_numpy().reshape(-1, 1)
    )

    # Adjusted Rand Index between clusters and gender: the closer to 1, the better.
    ari.append(adjusted_rand_score(normalized_both["gender"], km.labels_))

# One row per dimension, indexed by the dimension name.
df_clustering_1 = pd.DataFrame({"ARI": ari}, index=dim_list_1)

In [44]:
# Rank the dimensions from highest to lowest ARI. Despite its name, `top10_1`
# holds the full ranking; only its first ten rows are extracted and displayed.
top10_1 = df_clustering_1.sort_values("ARI", ascending=False)
dim_top10_1 = top10_1.index[:10].tolist()
top10_1.iloc[:10]

Unnamed: 0,ARI
245,0.024772
192,0.024167
439,0.024044
121,0.023915
507,0.023774
250,0.021967
5,0.020017
466,0.018595
181,0.017908
503,0.017869


In [45]:
# Append the ten best dimensions for the combined data to the list already
# stored in GG/both.csv and write the extended list back to the same file.
# NOTE: the file is read, extended and overwritten in place, so running this
# cell again appends the same ten dimensions a second time.
w1 = list(
    pd.read_csv('../Data/Dimensions/GG/both.csv', index_col=0).iloc[:, 0].values
)
w1 += list(top10_1.index[:10])

pd.DataFrame(w1).to_csv('../Data/Dimensions/GG/both.csv')