In [4]:
from sklearn.linear_model import LogisticRegression 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

## Data preparation

### Load data: NOUN, ADJ, and both

In [5]:
df_nouns_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv').drop(columns=["number"])
df_adj_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv').drop(columns = "number")
df_both_we = pd.concat([df_nouns_we, df_adj_we], ignore_index=True)

# target : gender
Y_gd_N = df_nouns_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)
Y_gd_A = df_adj_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)
Y_gd_both = df_both_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)

# features : word embeddings dimensions
X_gd_N = df_nouns_we.drop(columns=["Word", "gender"])
X_gd_A = df_adj_we.drop(columns=["Word", "gender"])
X_gd_both = df_both_we.drop(columns = ["Word", "gender"] )

# normalize data to be between 0 and 1
X_gd_N = (X_gd_N - X_gd_N.min()) / (X_gd_N.max() - X_gd_N.min())
X_gd_A = (X_gd_A - X_gd_A.min()) / (X_gd_A.max() - X_gd_A.min())
X_gd_both = (X_gd_both - X_gd_both.min()) / (X_gd_both.max() - X_gd_both.min())

# split data into train and test sets
X_gd_N_train, X_gd_N_test, Y_gd_N_train, Y_gd_N_test = train_test_split(X_gd_N, Y_gd_N, test_size=0.2, random_state=42)
X_gd_A_train, X_gd_A_test, Y_gd_A_train, Y_gd_A_test = train_test_split(X_gd_A, Y_gd_A, test_size=0.2, random_state=42)
X_gd_both_train, X_gd_both_test, Y_gd_both_train, Y_gd_both_test = train_test_split(X_gd_both, Y_gd_both, test_size=0.2, random_state=42)

In [6]:
train_features = [X_gd_N_train, X_gd_A_train, X_gd_both_train]
test_features = [X_gd_N_test, X_gd_A_test, X_gd_both_test]
train_targets = [Y_gd_N_train, Y_gd_A_train, Y_gd_both_train]
test_targets = [Y_gd_N_test, Y_gd_A_test, Y_gd_both_test]

In [7]:
names = ['Gender: Noun', 'Gender: Adj', 'Gender: Noun + Adj']

## Train the models

In [8]:
weights = [[], [], []]

In [9]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        clf = LogisticRegression(random_state=y, max_iter=1000)
        clf.fit(train_features[i], train_targets[i])
        
        weights[i].append(clf.coef_[0])

 Training model:  Gender: Noun, run 0
 Training model:  Gender: Adj, run 0
 Training model:  Gender: Noun + Adj, run 0
 Training model:  Gender: Noun, run 1
 Training model:  Gender: Adj, run 1
 Training model:  Gender: Noun + Adj, run 1
 Training model:  Gender: Noun, run 2
 Training model:  Gender: Adj, run 2
 Training model:  Gender: Noun + Adj, run 2
 Training model:  Gender: Noun, run 3
 Training model:  Gender: Adj, run 3
 Training model:  Gender: Noun + Adj, run 3
 Training model:  Gender: Noun, run 4
 Training model:  Gender: Adj, run 4
 Training model:  Gender: Noun + Adj, run 4
 Training model:  Gender: Noun, run 5
 Training model:  Gender: Adj, run 5
 Training model:  Gender: Noun + Adj, run 5
 Training model:  Gender: Noun, run 6
 Training model:  Gender: Adj, run 6
 Training model:  Gender: Noun + Adj, run 6
 Training model:  Gender: Noun, run 7
 Training model:  Gender: Adj, run 7
 Training model:  Gender: Noun + Adj, run 7
 Training model:  Gender: Noun, run 8
 Training 

In [10]:
weights = np.abs(weights)

## Gender: Noun weights

In [11]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [12]:
noun_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[0][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

In [13]:
noun_weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,run
0,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,0
1,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,1
2,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,2
3,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,3
4,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,4
5,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,5
6,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,6
7,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,7
8,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,8
9,229,425,449,426,51,104,506,10,47,465,...,127,292,97,293,45,254,200,197,309,9


In [14]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

100    0.0
250    1.0
195    2.0
265    3.0
162    4.0
269    5.0
377    6.0
390    7.0
468    8.0
434    9.0
dtype: float64

## Gender: Adj weights

In [15]:
adj_weights = pd.DataFrame(columns=list(range(512)))

In [16]:
adj_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[1][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        adj_weights.iloc[r, dims_sorted[i]] = i

In [17]:
adj_weights.iloc[:, :512].mean().sort_values()[:10]

250    0.0
121    1.0
162    2.0
390    3.0
88     4.0
89     5.0
432    6.0
177    7.0
175    8.0
245    9.0
dtype: float64

## Gender: both weights

In [18]:
both_weights = pd.DataFrame(columns=list(range(512)))

In [19]:
both_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[2][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        both_weights.iloc[r, dims_sorted[i]] = i

In [20]:
both_weights.iloc[:, :512].mean().sort_values()[:10]

250    0.0
162    1.0
100    2.0
195    3.0
269    4.0
390    5.0
214    6.0
265    7.0
377    8.0
468    9.0
dtype: float64