In [3]:
import torch.nn as nn
import torch 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

## Data preparation

### Load data: NOUN, ADJ, and both

In [8]:
df_nouns_we = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv').drop(columns=["gender"])
df_adj_we = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv').drop(columns = "gender")
df_both_we = pd.concat([df_nouns_we, df_adj_we], ignore_index=True)
# target : number
Y_nb_N = df_nouns_we["number"].apply(lambda x: 1 if x == "singular" else 0)
Y_nb_A = df_adj_we["number"].apply(lambda x: 1 if x == "singular" else 0)
Y_nb_both = df_both_we["number"].apply(lambda x: 1 if x == "singular" else 0)
# features : word embeddings dimensions
X_nb_N = df_nouns_we.drop(columns=["Word", "number"])
X_nb_A = df_adj_we.drop(columns=["Word", "number"])
X_nb_both = df_both_we.drop(columns = ["Word", "number"] )

# normalize data to be between 0 and 1
X_nb_N = (X_nb_N - X_nb_N.min()) / (X_nb_N.max() - X_nb_N.min())
X_nb_A = (X_nb_A - X_nb_A.min()) / (X_nb_A.max() - X_nb_A.min())
X_nb_both = (X_nb_both - X_nb_both.min()) / (X_nb_both.max() - X_nb_both.min())


# split data into train and test sets
X_nb_N_train, X_nb_N_test, Y_nb_N_train, Y_nb_N_test = train_test_split(X_nb_N, Y_nb_N, test_size=0.2, random_state=42)
X_nb_A_train, X_nb_A_test, Y_nb_A_train, Y_nb_A_test = train_test_split(X_nb_A, Y_nb_A, test_size=0.2, random_state=42)
X_nb_both_train, X_nb_both_test, Y_nb_both_train, Y_nb_both_test = train_test_split(X_nb_both, Y_nb_both, test_size=0.2, random_state=42)

In [9]:
train_feature = [X_nb_N_train, X_nb_A_train, X_nb_both_train]
test_feature = [X_nb_N_test, X_nb_A_test, X_nb_both_test]
train_target = [Y_nb_N_train, Y_nb_A_train, Y_nb_both_train]
test_target = [Y_nb_N_test, Y_nb_A_test, Y_nb_both_test]

names = ['Number: Noun', 'Number: Adjs', 'Number: Both']

### Tensors

In [13]:
# convert to tensors
train_feature = [torch.tensor(x.values).float() for x in train_feature]
test_feature = [torch.tensor(x.values).float() for x in test_feature]
train_target = [torch.tensor(x.values).long() for x in train_target]
test_target = [torch.tensor(x.values).long() for x in test_target]

In [15]:
from torch.utils.data import TensorDataset, DataLoader
# the TensorDataset is a ready to use class to represent your data as list of tensors. 
# Note that input_features and labels must match on the length of the first dimension
train_sets = [TensorDataset(X_train, Y_train) for X_train, Y_train in zip(train_feature, train_target)]
test_sets = [TensorDataset(X_valid, Y_valid) for X_valid, Y_valid in zip(test_feature, test_target)]

# DataLoader shuffles and batches the data and load its in parallel using multiprocessing workers
train_loaders = [DataLoader(train_set, batch_size=32, shuffle=True) for train_set in train_sets]
test_loaders = [DataLoader(test_set, batch_size=32) for test_set in test_sets]

## Train the models

In [16]:
nb_epochs = 20

In [17]:
weights = [[], [], []]

In [18]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
        # define the loss function
        loss_fn = nn.CrossEntropyLoss()
        # define the optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        # put the model in training mode
        model.train()
        for epoch in range(nb_epochs):
            for X_train, Y_train in train_loaders[i]:
                # compute the model output
                Y_pred = model(X_train)
                # calculate loss
                loss = loss_fn(Y_pred, Y_train)
                # reset the gradients
                optimizer.zero_grad()
                # backpropagation
                loss.backward()
                # update model weights
                optimizer.step()

            if epoch % 10 == 0:
                print("--Epoch ", epoch, " Loss : ", loss.item())
        
        weights[i].append(model[0].weight.data.numpy()[1])

 Training model:  Number: Noun, run 0
--Epoch  0  Loss :  0.5339772701263428
--Epoch  10  Loss :  0.3435690701007843
 Training model:  Number: Adjs, run 0
--Epoch  0  Loss :  0.6323344111442566
--Epoch  10  Loss :  0.35955438017845154
 Training model:  Number: Both, run 0
--Epoch  0  Loss :  0.5656259655952454
--Epoch  10  Loss :  0.3276616632938385
 Training model:  Number: Noun, run 1
--Epoch  0  Loss :  0.5480201840400696
--Epoch  10  Loss :  0.35430285334587097
 Training model:  Number: Adjs, run 1
--Epoch  0  Loss :  0.6420716643333435
--Epoch  10  Loss :  0.37520483136177063
 Training model:  Number: Both, run 1
--Epoch  0  Loss :  0.6836819052696228
--Epoch  10  Loss :  0.3508302867412567
 Training model:  Number: Noun, run 2
--Epoch  0  Loss :  0.5583521127700806
--Epoch  10  Loss :  0.38170331716537476
 Training model:  Number: Adjs, run 2
--Epoch  0  Loss :  0.5966718792915344
--Epoch  10  Loss :  0.3695962131023407
 Training model:  Number: Both, run 2
--Epoch  0  Loss :  0.

In [19]:
weights = np.abs(weights)

## Number: Nouns

In [20]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [21]:
noun_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[0][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

In [23]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

310     0.0
54      1.0
158     2.0
285     3.0
359     4.6
172     5.3
384     5.9
495     6.3
250     8.8
182    10.1
dtype: float64

In [34]:
w1 = list(pd.read_csv('../Data/Dimensions/GN/noun.csv', index_col=0).iloc[:, 0].values)

w1.extend(noun_weights.iloc[:, :512].mean().sort_values()[:10].index)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/noun.csv')

## Number: Adjs

In [28]:
adj_weights = pd.DataFrame(columns=list(range(512)))

In [29]:
adj_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[1][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        adj_weights.iloc[r, dims_sorted[i]] = i

In [30]:
adj_weights.iloc[:, :512].mean().sort_values()[:10]

310    0.0
285    1.3
54     1.7
384    3.9
455    5.0
495    6.6
200    7.6
360    8.2
192    8.7
25     9.0
dtype: float64

In [35]:
w1 = list(pd.read_csv('../Data/Dimensions/GN/adj.csv', index_col=0).iloc[:, 0].values)

w1.extend(adj_weights.iloc[:, :512].mean().sort_values()[:10].index)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/adj.csv')

## Number: both

In [31]:
both_weights = pd.DataFrame(columns=list(range(512)))

In [32]:
both_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[2][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        both_weights.iloc[r, dims_sorted[i]] = i

In [33]:
both_weights.iloc[:, :512].mean().sort_values()[:10]

310    0.0
54     1.0
158    2.0
285    3.0
359    4.1
384    5.1
172    6.4
495    6.7
250    8.5
200    9.4
dtype: float64

In [36]:
w1 = list(pd.read_csv('../Data/Dimensions/GN/both.csv', index_col=0).iloc[:, 0].values)

w1.extend(both_weights.iloc[:, :512].mean().sort_values()[:10].index)

pd.DataFrame(w1).to_csv('../Data/Dimensions/GN/both.csv')