In [2]:
import itertools

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [3]:
import numpy as np

# Gender

In [4]:
# Load the noun and adjective embedding tables (first CSV column is the index).
# The "number" column is dropped because this section only uses "gender".
all_n_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0
).drop(columns=["number"])
all_a_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0
).drop(columns=["number"])

### 1.1 Noun

In [5]:
# Encode gender as 1 = "masculine", 0 = anything else.
# The guard keeps this cell idempotent: once the column is numeric, a second
# run must not compare the already-encoded 0/1 values with "masculine"
# (that would silently turn every label into 0).
if not pd.api.types.is_numeric_dtype(all_n_we['gender']):
    all_n_we['gender'] = all_n_we['gender'].eq("masculine").astype(int)

# Column-wise min-max scaling of the whole frame (label column included).
normalized_noun = (all_n_we - all_n_we.min()) / (all_n_we.max() - all_n_we.min())

In [5]:
# Noun dimension table stored under Dimensions/GG (first CSV column is the index).
gg_n_dims = pd.read_csv(
    '../Data/Dimensions/GG/noun.csv',
    index_col=0,
)

In [6]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts), skipping the first 10 rows.
# NOTE(review): every other section counts over all rows (`iloc[:, 0]`);
# confirm the 10-row offset is intentional.
dims, index = np.unique(
    gg_n_dims.iloc[10:, 0],
    return_counts=True,
)

In [7]:
# Keep only the dimensions that occur at least 4 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 4]

In [None]:
results = []

# Exhaustive search over every subset of two or more candidate dimensions:
# run 2-means on those columns only and score the clustering against the
# gender labels with the Adjusted Rand Index.
# Starting at size 2 avoids generating the empty and single-element subsets
# just to throw them away.
for el in range(2, len(top_dims) + 1):
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10)
        # Positional column selection: assumes dimension i is stored in
        # column position i of the frame — TODO confirm against the CSV layout.
        km.fit(normalized_noun.iloc[:, list(subset)].values)

        results.append(
            {'Dimensions': subset, 'ARI': adjusted_rand_score(normalized_noun.gender, km.labels_)}
        )
                   


In [9]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
1009,"(100, 117, 192, 195, 245, 316, 377, 403, 507)",0.10582
1012,"(100, 117, 121, 192, 195, 245, 316, 377, 403, ...",0.103278
1010,"(100, 121, 192, 195, 245, 316, 377, 403, 507)",0.10164
980,"(100, 117, 192, 195, 245, 316, 403, 507)",0.100001
1008,"(100, 117, 121, 195, 245, 316, 377, 403, 507)",0.099718
1006,"(100, 117, 121, 192, 195, 316, 377, 403, 507)",0.099537
983,"(100, 117, 192, 245, 316, 377, 403, 507)",0.099328
1004,"(100, 117, 121, 192, 195, 245, 316, 403, 507)",0.099141
1003,"(100, 117, 121, 192, 195, 245, 316, 377, 507)",0.098857
978,"(100, 117, 192, 195, 245, 316, 377, 403)",0.098104


In [8]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the gender labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_noun[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_noun["gender"], km.labels_))

0.006637931210181948


### 1.2 Adjective

In [9]:
# Encode gender as 1 = "masculine", 0 = anything else.
# The guard keeps this cell idempotent: once the column is numeric, a second
# run must not compare the already-encoded 0/1 values with "masculine"
# (that would silently turn every label into 0).
if not pd.api.types.is_numeric_dtype(all_a_we['gender']):
    all_a_we['gender'] = all_a_we['gender'].eq("masculine").astype(int)

# Column-wise min-max scaling of the whole frame (label column included).
normalized_adj = (all_a_we - all_a_we.min()) / (all_a_we.max() - all_a_we.min())

In [11]:
# Adjective dimension table stored under Dimensions/GG (first CSV column is the index).
gg_a_dims = pd.read_csv(
    '../Data/Dimensions/GG/adj.csv',
    index_col=0,
)

In [12]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    gg_a_dims.iloc[:, 0],
    return_counts=True,
)

In [13]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [14]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns of the adjective frame and record the ARI against gender.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            normalized_adj.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(normalized_adj['gender'], km.labels_),
        })
                   

In [15]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
2035,"(88, 121, 133, 181, 234, 245, 250, 432, 439, 4...",0.383582
2032,"(88, 121, 181, 234, 245, 250, 432, 439, 466, 503)",0.383383
2031,"(88, 121, 133, 234, 245, 250, 432, 439, 466, 503)",0.377208
2028,"(88, 121, 133, 181, 234, 245, 432, 439, 466, 503)",0.369856
2026,"(88, 121, 133, 181, 234, 245, 250, 432, 466, 503)",0.369132
2007,"(88, 133, 181, 234, 245, 250, 432, 466, 503)",0.366057
2027,"(88, 121, 133, 181, 234, 245, 250, 439, 466, 503)",0.362342
2033,"(88, 133, 181, 234, 245, 250, 432, 439, 466, 503)",0.359317
2029,"(88, 121, 133, 181, 234, 250, 432, 439, 466, 503)",0.357743
2034,"(121, 133, 181, 234, 245, 250, 432, 439, 466, ...",0.357289


In [10]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the gender labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_adj[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_adj["gender"], km.labels_))

-0.0007625257008206993


### 1.3 Both

In [11]:
# Stack the adjective rows on top of the noun rows.
# NOTE(review): each frame was min-max scaled with its own min/max before
# stacking, so the two halves do not share one scale — confirm this is intended.
normalized_both = pd.concat(
    [normalized_adj, normalized_noun],
    axis=0,
)

In [17]:
# Combined dimension table stored under Dimensions/GG (first CSV column is the index).
gg_both_dims = pd.read_csv(
    '../Data/Dimensions/GG/both.csv',
    index_col=0,
)

In [18]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    gg_both_dims.iloc[:, 0],
    return_counts=True,
)

In [19]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [20]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns of the stacked frame and record the ARI against gender.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            normalized_both.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(normalized_both['gender'], km.labels_),
        })
                   

In [21]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
482,"(5, 192, 245, 250, 377, 439, 507)",0.105149
468,"(5, 121, 192, 245, 250, 439, 507)",0.104191
495,"(5, 121, 192, 195, 245, 377, 439, 507)",0.101126
476,"(5, 121, 245, 250, 377, 439, 507)",0.100043
497,"(5, 121, 192, 245, 250, 377, 439, 507)",0.09875
404,"(5, 121, 245, 250, 439, 507)",0.097469
474,"(5, 121, 195, 245, 377, 439, 507)",0.093823
420,"(5, 192, 245, 377, 439, 507)",0.092206
460,"(5, 121, 192, 195, 245, 377, 507)",0.091722
490,"(121, 195, 245, 250, 377, 439, 507)",0.091506


In [12]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the gender labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_both[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_both["gender"], km.labels_))

0.0027097395914094493


# Number

In [13]:
# Reload the noun and adjective embedding tables (first CSV column is the index).
# The "gender" column is dropped because this section only uses "number".
all_n_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0
).drop(columns=["gender"])
all_a_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0
).drop(columns=["gender"])

### 2.1 Noun

In [14]:
# Encode number as 1 = "plural", 0 = anything else.
# The guard keeps this cell idempotent: once the column is numeric, a second
# run must not compare the already-encoded 0/1 values with "plural"
# (that would silently turn every label into 0).
if not pd.api.types.is_numeric_dtype(all_n_we['number']):
    all_n_we['number'] = all_n_we['number'].eq("plural").astype(int)

# Column-wise min-max scaling of the whole frame (label column included).
normalized_noun = (all_n_we - all_n_we.min()) / (all_n_we.max() - all_n_we.min())

In [24]:
# Noun dimension table stored under Dimensions/GN (first CSV column is the index).
gn_n_dims = pd.read_csv(
    '../Data/Dimensions/GN/noun.csv',
    index_col=0,
)

In [25]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    gn_n_dims.iloc[:, 0],
    return_counts=True,
)

In [26]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [27]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns of the noun frame and record the ARI against number.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            normalized_noun.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(normalized_noun['number'], km.labels_),
        })
                   

In [28]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
1852,"(54, 81, 172, 250, 285, 310, 359, 384)",0.384787
2031,"(54, 81, 172, 250, 278, 285, 288, 310, 359, 384)",0.362285
1948,"(81, 172, 250, 278, 285, 310, 359, 384)",0.360178
1992,"(54, 81, 172, 250, 278, 285, 288, 359, 384)",0.35643
1722,"(81, 172, 250, 278, 285, 310, 359)",0.350085
1592,"(54, 81, 250, 285, 310, 359, 384)",0.348472
2004,"(54, 81, 250, 278, 285, 288, 310, 359, 384)",0.346126
1995,"(54, 81, 172, 250, 285, 288, 310, 359, 384)",0.34376
2021,"(81, 172, 250, 278, 285, 288, 310, 359, 384)",0.34121
1528,"(54, 81, 172, 250, 310, 359, 384)",0.339356


In [15]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the number labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_noun[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_noun["number"], km.labels_))

-0.00029180220646910106


### 2.2 Adjective

In [16]:
# Encode number as 1 = "plural", 0 = anything else.
# The guard keeps this cell idempotent: once the column is numeric, a second
# run must not compare the already-encoded 0/1 values with "plural"
# (that would silently turn every label into 0).
if not pd.api.types.is_numeric_dtype(all_a_we['number']):
    all_a_we['number'] = all_a_we['number'].eq("plural").astype(int)

# Column-wise min-max scaling of the whole frame (label column included).
normalized_adj = (all_a_we - all_a_we.min()) / (all_a_we.max() - all_a_we.min())

In [31]:
# Adjective dimension table stored under Dimensions/GN (first CSV column is the index).
gn_a_dims = pd.read_csv(
    '../Data/Dimensions/GN/adj.csv',
    index_col=0,
)

In [32]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    gn_a_dims.iloc[:, 0],
    return_counts=True,
)

In [33]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [34]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns of the adjective frame and record the ARI against number.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            normalized_adj.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(normalized_adj['number'], km.labels_),
        })
                   

In [35]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
2002,"(25, 54, 81, 84, 274, 285, 310, 384, 455)",0.401257
2001,"(25, 54, 81, 84, 192, 285, 310, 384, 455)",0.384589
1983,"(25, 54, 56, 81, 84, 285, 310, 384, 455)",0.380817
1367,"(54, 81, 274, 310, 384, 455)",0.379764
2032,"(25, 54, 81, 84, 192, 274, 285, 310, 384, 455)",0.379674
2029,"(25, 54, 56, 81, 84, 274, 285, 310, 384, 455)",0.378222
2019,"(54, 56, 81, 84, 274, 285, 310, 384, 455)",0.371236
2028,"(25, 54, 56, 81, 84, 192, 285, 310, 384, 455)",0.370854
1874,"(25, 54, 81, 84, 285, 310, 384, 455)",0.369973
1885,"(25, 54, 84, 192, 285, 310, 384, 455)",0.369515


In [17]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the number labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_adj[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_adj["number"], km.labels_))

0.0026840175415539847


### 2.3 Both

In [18]:
# Stack the adjective rows on top of the noun rows.
# NOTE(review): each frame was min-max scaled with its own min/max before
# stacking, so the two halves do not share one scale — confirm this is intended.
normalized_both = pd.concat(
    [normalized_adj, normalized_noun],
    axis=0,
)

In [81]:
# Combined dimension table stored under Dimensions/GN (first CSV column is the index).
gn_both_dims = pd.read_csv(
    '../Data/Dimensions/GN/both.csv',
    index_col=0,
)

In [82]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    gn_both_dims.iloc[:, 0],
    return_counts=True,
)

In [83]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [84]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns of the stacked frame and record the ARI against number.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            normalized_both.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(normalized_both['number'], km.labels_),
        })
                   

In [85]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
2028,"(25, 54, 81, 172, 278, 285, 310, 359, 384, 495)",0.34887
1988,"(25, 54, 81, 172, 285, 310, 359, 384, 495)",0.343821
1977,"(25, 54, 81, 172, 278, 285, 310, 384, 495)",0.334831
1976,"(25, 54, 81, 172, 278, 285, 310, 359, 495)",0.334644
1832,"(25, 54, 81, 172, 285, 310, 384, 495)",0.330281
2018,"(54, 81, 172, 278, 285, 310, 359, 384, 495)",0.329896
1975,"(25, 54, 81, 172, 278, 285, 310, 359, 384)",0.328441
1830,"(25, 54, 81, 172, 285, 310, 359, 384)",0.328357
2035,"(25, 54, 81, 172, 278, 285, 288, 310, 359, 384...",0.326129
1831,"(25, 54, 81, 172, 285, 310, 359, 495)",0.322737


In [19]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the number labels.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    normalized_both[list(map(str, range(512)))]
)
print(adjusted_rand_score(normalized_both["number"], km.labels_))

-0.00028365391100050943


# PoS

In [20]:
# Reload the three embedding tables; the noun and adjective files lose their
# "number"/"gender" columns, the verb file is used as read.
all_n_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0
).drop(columns=["number", "gender"])
all_a_we = pd.read_csv(
    '../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0
).drop(columns=["number", "gender"])
all_v_we = pd.read_csv('../Data/FlauBERT_WE/all_verb_we.csv', index_col=0)

# Binary target: 1 for noun rows, 0 for adjective and verb rows.
all_n_we["noun"] = 1
all_av_we = pd.concat([all_a_we, all_v_we]).assign(noun=0)

# Stack both groups first, then min-max scale every column over the full set.
df = pd.concat([all_av_we, all_n_we])
norm_df = (df - df.min()) / (df.max() - df.min())

# Keep only index entries that occur exactly once, i.e. drop words listed
# under more than one part of speech.
word, count = np.unique(norm_df.index, return_counts=True)
unique_words = [w for w, occurrences in zip(word, count) if occurrences == 1]
un_nouns = norm_df.loc[norm_df.index.isin(unique_words)]

### 3.1 Noun

In [55]:
# Noun dimension table stored under Dimensions/PoS (first CSV column is the index).
pos_n = pd.read_csv(
    '../Data/Dimensions/PoS/noun.csv',
    index_col=0,
)

In [56]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    pos_n.iloc[:, 0],
    return_counts=True,
)

In [57]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [62]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns and record the ARI against the noun / not-noun label.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            un_nouns.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(un_nouns['noun'], km.labels_),
        })
                   

In [63]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
115,"(128, 159, 341, 401, 458, 480)",0.425822
109,"(159, 341, 401, 458, 480)",0.409981
119,"(128, 159, 341, 346, 401, 458, 480)",0.406572
100,"(128, 159, 401, 458, 480)",0.404923
116,"(128, 159, 346, 401, 458, 480)",0.397122
112,"(128, 159, 341, 346, 401, 458)",0.395904
118,"(159, 341, 346, 401, 458, 480)",0.391457
85,"(159, 401, 458, 480)",0.384585
94,"(128, 159, 341, 401, 458)",0.382366
110,"(159, 346, 401, 458, 480)",0.379384


In [21]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the noun / not-noun label.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    un_nouns[list(map(str, range(512)))]
)
print(adjusted_rand_score(un_nouns["noun"], km.labels_))

0.4111876502146539


### 3.2 Verb

In [65]:
# Verb dimension table stored under Dimensions/PoS (first CSV column is the index).
pos_v = pd.read_csv(
    '../Data/Dimensions/PoS/verb.csv',
    index_col=0,
)

In [22]:
# Binary target: 1 for verb rows, 0 for noun and adjective rows.
all_v_we["verb"] = 1
all_na_we = pd.concat([all_n_we, all_a_we]).assign(verb=0)

# Stack both groups, discard the "noun" label that all_n_we still carries,
# then min-max scale every column over the full set.
df = pd.concat([all_na_we, all_v_we]).drop(columns=["noun"])
norm_df = (df - df.min()) / (df.max() - df.min())

# Keep only index entries that occur exactly once, i.e. drop words listed
# under more than one part of speech.
word, count = np.unique(norm_df.index, return_counts=True)
unique_words = [w for w, occurrences in zip(word, count) if occurrences == 1]
un_verb = norm_df.loc[norm_df.index.isin(unique_words)]

In [67]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    pos_v.iloc[:, 0],
    return_counts=True,
)

In [68]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [69]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns and record the ARI against the verb / not-verb label.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            un_verb.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(un_verb['verb'], km.labels_),
        })
                   

In [70]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
1012,"(89, 159, 192, 282, 310, 341, 401, 458, 480, 504)",0.593161
1007,"(89, 159, 192, 282, 341, 401, 458, 480, 504)",0.585072
1003,"(89, 159, 192, 282, 310, 341, 401, 458, 504)",0.584642
1010,"(89, 192, 282, 310, 341, 401, 458, 480, 504)",0.581776
1005,"(89, 159, 192, 282, 310, 341, 458, 480, 504)",0.580965
990,"(89, 192, 282, 341, 401, 458, 480, 504)",0.580141
988,"(89, 192, 282, 310, 341, 458, 480, 504)",0.574751
1009,"(89, 159, 282, 310, 341, 401, 458, 480, 504)",0.574575
961,"(89, 159, 192, 282, 310, 341, 458, 504)",0.574236
968,"(89, 159, 192, 282, 341, 401, 458, 504)",0.571869


In [23]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the verb / not-verb label.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    un_verb[list(map(str, range(512)))]
)
print(adjusted_rand_score(un_verb["verb"], km.labels_))

0.4832314334217509


### 3.3 Adjective

In [24]:
# label 1: ADJ, 0: not ADJ
all_a_we["adj"] = 1
all_nv_we = pd.concat([all_n_we, all_v_we])
all_nv_we["adj"] = 0

# Normalization and concatenation.
# By this point all_n_we still carries the "noun" label and all_v_we the
# "verb" label from the previous two sections, so BOTH must be removed.
# Dropping only "verb" left a "noun" column (1 for nouns, NaN elsewhere) that
# min-max scaling turned into an all-NaN column of the resulting frame.
df = pd.concat([all_nv_we, all_a_we]).drop(columns=["noun", "verb"])
norm_df = (df - df.min()) / (df.max() - df.min())

# Removing words having multiple POS: keep index entries that occur exactly once.
word, count = np.unique(norm_df.index, return_counts=True)
unique_words = [w for w, occurrences in zip(word, count) if occurrences == 1]
un_adj = norm_df[norm_df.index.isin(unique_words)]

In [72]:
# Adjective dimension table stored under Dimensions/PoS (first CSV column is the index).
pos_a = pd.read_csv(
    '../Data/Dimensions/PoS/adj.csv',
    index_col=0,
)

In [73]:
# Distinct values of the first column and how many times each occurs
# (`index` holds the counts).
dims, index = np.unique(
    pos_a.iloc[:, 0],
    return_counts=True,
)

In [74]:
# Keep only the dimensions that occur at least 3 times.
top_dims = [dim for dim, occurrences in zip(dims, index) if occurrences >= 3]

In [75]:
results = []

# Try every combination of two or more candidate dimensions: run 2-means on
# just those columns and record the ARI against the adjective / not-adjective label.
for el in range(len(top_dims) + 1):
    if el < 2:
        continue  # combinations of size 0 or 1 are not evaluated
    for subset in itertools.combinations(top_dims, el):
        km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
            un_adj.iloc[:, list(subset)].values
        )
        results.append({
            'Dimensions': subset,
            'ARI': adjusted_rand_score(un_adj['adj'], km.labels_),
        })
                   

In [76]:
# Ten best-scoring dimension subsets, highest ARI first.
pd.DataFrame(results).sort_values(by='ARI', ascending=False).head(10)

Unnamed: 0,Dimensions,ARI
7,"(158, 220, 464)",0.045668
0,"(158, 220)",0.034791
2,"(158, 464)",0.031803
4,"(220, 464)",0.018842
9,"(220, 310, 464)",0.015401
3,"(220, 310)",0.015228
6,"(158, 220, 310)",0.014161
10,"(158, 220, 310, 464)",0.013177
5,"(310, 464)",0.005958
1,"(158, 310)",0.005214


In [25]:
# Baseline: 2-means on all 512 embedding columns ("0" .. "511"), scored
# against the adjective / not-adjective label.
km = KMeans(n_clusters=2, random_state=42, n_init=10).fit(
    un_adj[list(map(str, range(512)))]
)
print(adjusted_rand_score(un_adj["adj"], km.labels_))

-0.008039470851211553
