In [5]:
import torch.nn as nn
import torch 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

## Data preparation

### Load data: NOUN, ADJ, and both

In [31]:
df_nouns_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv').drop(columns=["number"])
df_adj_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv').drop(columns = "number")
df_both_we = pd.concat([df_nouns_we, df_adj_we], ignore_index=True)

# target : gender
Y_gd_N = df_nouns_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)
Y_gd_A = df_adj_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)
Y_gd_both = df_both_we["gender"].apply(lambda x: 1 if x == "masculine" else 0)

# features : word embeddings dimensions
X_gd_N = df_nouns_we.drop(columns=["Word", "gender"])
X_gd_A = df_adj_we.drop(columns=["Word", "gender"])
X_gd_both = df_both_we.drop(columns = ["Word", "gender"] )

# normalize data to be between 0 and 1
X_gd_N = (X_gd_N - X_gd_N.min()) / (X_gd_N.max() - X_gd_N.min())
X_gd_A = (X_gd_A - X_gd_A.min()) / (X_gd_A.max() - X_gd_A.min())
X_gd_both = (X_gd_both - X_gd_both.min()) / (X_gd_both.max() - X_gd_both.min())

# split data into train and test sets
X_gd_N_train, X_gd_N_test, Y_gd_N_train, Y_gd_N_test = train_test_split(X_gd_N, Y_gd_N, test_size=0.2, random_state=42)
X_gd_A_train, X_gd_A_test, Y_gd_A_train, Y_gd_A_test = train_test_split(X_gd_A, Y_gd_A, test_size=0.2, random_state=42)
X_gd_both_train, X_gd_both_test, Y_gd_both_train, Y_gd_both_test = train_test_split(X_gd_both, Y_gd_both, test_size=0.2, random_state=42)

In [32]:
train_features = [X_gd_N_train, X_gd_A_train, X_gd_both_train]
test_features = [X_gd_N_test, X_gd_A_test, X_gd_both_test]
train_targets = [Y_gd_N_train, Y_gd_A_train, Y_gd_both_train]
test_targets = [Y_gd_N_test, Y_gd_A_test, Y_gd_both_test]

In [33]:
# convert to tensors
train_features = [torch.tensor(x.values).float() for x in train_features]
test_features = [torch.tensor(x.values).float() for x in test_features]
train_targets = [torch.tensor(x.values).long() for x in train_targets]
test_targets = [torch.tensor(x.values).long() for x in test_targets]

In [12]:
names = ['Gender: Noun', 'Gender: Adj', 'Gender: Noun + Adj']

### Tensors

In [9]:
# convert to tensors
train_features = [torch.tensor(x).float() for x in train_features]
test_features = [torch.tensor(x).float() for x in test_features]
train_targets = [torch.tensor(x).long() for x in train_targets]
test_targets = [torch.tensor(x).long() for x in test_targets]

  train_features = [torch.tensor(x).float() for x in train_features]
  test_features = [torch.tensor(x).float() for x in test_features]
  train_targets = [torch.tensor(x).long() for x in train_targets]
  test_targets = [torch.tensor(x).long() for x in test_targets]


In [10]:
from torch.utils.data import TensorDataset, DataLoader
# the TensorDataset is a ready to use class to represent your data as list of tensors. 
# Note that input_features and labels must match on the length of the first dimension
train_sets = [TensorDataset(X_train, Y_train) for X_train, Y_train in zip(train_features, train_targets)]
test_sets = [TensorDataset(X_valid, Y_valid) for X_valid, Y_valid in zip(test_features, test_targets)]

# DataLoader shuffles and batches the data and load its in parallel using multiprocessing workers
train_loaders = [DataLoader(train_set, batch_size=32, shuffle=True) for train_set in train_sets]
test_loaders = [DataLoader(test_set, batch_size=32) for test_set in test_sets]

## Train the models

In [13]:
nb_epochs = 20

In [14]:
weights = [[], [], []]

In [15]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
        # define the loss function
        loss_fn = nn.CrossEntropyLoss()
        # define the optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        # put the model in training mode
        model.train()
        for epoch in range(nb_epochs):
            for X_train, Y_train in train_loaders[i]:
                # compute the model output
                Y_pred = model(X_train)
                # calculate loss
                loss = loss_fn(Y_pred, Y_train)
                # reset the gradients
                optimizer.zero_grad()
                # backpropagation
                loss.backward()
                # update model weights
                optimizer.step()

            if epoch % 10 == 0:
                print("--Epoch ", epoch, " Loss : ", loss.item())
        
        weights[i].append(model[0].weight.data.numpy()[1])

 Training model:  Gender: Noun, run 0
--Epoch  0  Loss :  0.6518369913101196
--Epoch  10  Loss :  0.4662286043167114
 Training model:  Gender: Adj, run 0
--Epoch  0  Loss :  0.6257043480873108
--Epoch  10  Loss :  0.41980743408203125
 Training model:  Gender: Noun + Adj, run 0
--Epoch  0  Loss :  0.5905786752700806
--Epoch  10  Loss :  0.43220603466033936
 Training model:  Gender: Noun, run 1
--Epoch  0  Loss :  0.6511135697364807
--Epoch  10  Loss :  0.567952036857605
 Training model:  Gender: Adj, run 1
--Epoch  0  Loss :  0.6755850315093994
--Epoch  10  Loss :  0.47971639037132263
 Training model:  Gender: Noun + Adj, run 1
--Epoch  0  Loss :  0.6628757119178772
--Epoch  10  Loss :  0.3877723515033722
 Training model:  Gender: Noun, run 2
--Epoch  0  Loss :  0.7195064425468445
--Epoch  10  Loss :  0.4591420590877533
 Training model:  Gender: Adj, run 2
--Epoch  0  Loss :  0.5874132513999939
--Epoch  10  Loss :  0.4571899473667145
 Training model:  Gender: Noun + Adj, run 2
--Epoch  

In [16]:
weights = np.abs(weights)

## Gender: Noun weights

In [17]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [18]:
noun_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[0][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

In [19]:
noun_weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,run
0,70,296,424,410,98,18,218,10,191,148,...,135,482,401,463,5,195,462,143,446,0
1,66,303,467,347,92,21,238,13,239,142,...,102,510,381,442,6,152,326,141,480,1
2,100,304,414,478,115,32,256,11,168,177,...,119,461,382,459,6,136,417,92,476,2
3,87,272,457,364,136,24,261,11,195,129,...,109,511,410,476,6,154,446,122,442,3
4,66,263,450,378,93,31,262,6,203,169,...,126,485,338,436,9,151,356,86,425,4
5,61,230,393,420,100,36,254,10,238,178,...,151,442,315,438,6,176,405,136,497,5
6,83,283,428,410,89,24,251,13,193,176,...,133,472,366,438,6,180,447,122,413,6
7,72,245,400,457,139,30,264,8,241,122,...,94,484,345,510,4,141,386,129,418,7
8,91,306,360,393,107,22,246,11,179,140,...,127,501,416,480,5,149,409,100,464,8
9,60,256,436,392,113,21,311,10,177,164,...,123,504,362,424,5,161,400,129,484,9


In [20]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

100    0.0
434    1.7
162    1.9
316    3.2
377    4.3
245    4.9
507    5.8
250    7.8
186    9.1
117    9.3
dtype: float64

## Gender: Adj weights

In [21]:
adj_weights = pd.DataFrame(columns=list(range(512)))

In [22]:
adj_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[1][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        adj_weights.iloc[r, dims_sorted[i]] = i

In [24]:
adj_weights.iloc[:, :512].mean().sort_values()[:10]

466    0.2
250    1.3
245    2.2
439    3.3
5      4.5
181    5.9
133    6.6
177    6.8
88     9.5
503    9.6
dtype: float64

## Gender: both weights

In [25]:
both_weights = pd.DataFrame(columns=list(range(512)))

In [26]:
both_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[2][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        both_weights.iloc[r, dims_sorted[i]] = i

In [27]:
both_weights.iloc[:, :512].mean().sort_values()[:10]

162    0.2
377    0.9
507    3.0
245    3.1
250    3.4
100    5.1
316    5.8
434    6.5
28     8.6
499    8.8
dtype: float64