In [1]:
from sklearn.linear_model import LogisticRegression 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

## Data preparation

### Load data: NOUN, ADJ, and both

In [2]:
nouns = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0).drop(columns=['gender', 'number'])
nouns['noun'] = 1
nouns['verb'] = 0
nouns['adj'] = 0

verbs = pd.read_csv('../Data/FlauBERT_WE/all_verb_we.csv', index_col=0)
verbs['noun'] = 0
verbs['verb'] = 1
verbs['adj'] = 0

adjs = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0).drop(columns=['gender', 'number'])
adjs['noun'] = 0
adjs['verb'] = 0
adjs['adj'] = 1


data = pd.concat([nouns, adjs, verbs])
data = data.sample(frac=1)

normalized_data = (data - data.min())/(data.max() - data.min()) 

In [3]:
# target : gender
Y_N = np.asarray(normalized_data.noun)
Y_V = np.asarray(normalized_data.verb)
Y_A = np.asarray(normalized_data.adj)

# features : word embeddings dimensions
X = np.asarray(normalized_data.iloc[:, :512])

# split data into train and test sets
X_N_train, X_N_test, Y_N_train, Y_N_test = train_test_split(X, Y_N, test_size=0.2, random_state=42)
X_A_train, X_A_test, Y_A_train, Y_A_test = train_test_split(X, Y_A, test_size=0.2, random_state=42)
X_V_train, X_V_test, Y_V_train, Y_V_test = train_test_split(X, Y_V, test_size=0.2, random_state=42)

names = ['Noun vs Not Noun', 'Adj vs Not Adj', 'Verb vs not Verb']

In [4]:
train_features = [X_N_train, X_A_train, X_V_train]
test_features = [X_N_test, X_A_test, X_V_test]
train_targets = [Y_N_train, Y_A_train, Y_V_train]
test_targets = [Y_N_test, Y_A_test, Y_V_test]

## Train the models

In [6]:
weights = [[], [], []]

In [7]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        clf = LogisticRegression(random_state=y, max_iter=1000)
        clf.fit(train_features[i], train_targets[i])
        
        weights[i].append(clf.coef_[0])

 Training model:  Noun vs Not Noun, run 0
 Training model:  Adj vs Not Adj, run 0
 Training model:  Verb vs not Verb, run 0
 Training model:  Noun vs Not Noun, run 1
 Training model:  Adj vs Not Adj, run 1
 Training model:  Verb vs not Verb, run 1
 Training model:  Noun vs Not Noun, run 2
 Training model:  Adj vs Not Adj, run 2
 Training model:  Verb vs not Verb, run 2
 Training model:  Noun vs Not Noun, run 3
 Training model:  Adj vs Not Adj, run 3
 Training model:  Verb vs not Verb, run 3
 Training model:  Noun vs Not Noun, run 4
 Training model:  Adj vs Not Adj, run 4
 Training model:  Verb vs not Verb, run 4
 Training model:  Noun vs Not Noun, run 5
 Training model:  Adj vs Not Adj, run 5
 Training model:  Verb vs not Verb, run 5
 Training model:  Noun vs Not Noun, run 6
 Training model:  Adj vs Not Adj, run 6
 Training model:  Verb vs not Verb, run 6
 Training model:  Noun vs Not Noun, run 7
 Training model:  Adj vs Not Adj, run 7
 Training model:  Verb vs not Verb, run 7
 Trainin

In [8]:
weights = np.abs(weights)

## Nouns vs non-Nouns

In [9]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [10]:
noun_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[0][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

Average ranking of dimensions after 10 runs for **NOUN** vs **non-NOUN**:

In [16]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

52     0.0
92     1.0
261    2.0
275    3.0
229    4.0
427    5.0
132    6.0
37     7.0
345    8.0
223    9.0
dtype: float64

## Verb vs non-Verbs

In [17]:
verb_weights = pd.DataFrame(columns=list(range(512)))

In [18]:
verb_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[2][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        verb_weights.iloc[r, dims_sorted[i]] = i

In [19]:
verb_weights.iloc[:, :512].mean().sort_values()[:10]

12     0.0
261    1.0
216    2.0
192    3.0
291    4.0
310    5.0
341    6.0
92     7.0
275    8.0
56     9.0
dtype: float64

## Adj vs non-Adjs

In [20]:
adj_weights = pd.DataFrame(columns=list(range(512)))

In [21]:
adj_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[1][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        adj_weights.iloc[r, dims_sorted[i]] = i

In [22]:
adj_weights.iloc[:, :512].mean().sort_values()[:10]

133    0.0
328    1.0
81     2.0
21     3.0
310    4.0
292    5.0
260    6.0
26     7.0
369    8.0
110    9.0
dtype: float64