In [2]:
import torch.nn as nn
import torch 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

# NN Classification (Perceptron) : Noun/ Not-Noun

## Data preparation

### Load data: NOUN, ADJ, and both

In [3]:
nouns = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0).drop(columns=['gender', 'number'])
nouns['noun'] = 1
nouns['verb'] = 0
nouns['adj'] = 0

verbs = pd.read_csv('../Data/FlauBERT_WE/all_verb_we.csv', index_col=0)
verbs['noun'] = 0
verbs['verb'] = 1
verbs['adj'] = 0

adjs = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0).drop(columns=['gender', 'number'])
adjs['noun'] = 0
adjs['verb'] = 0
adjs['adj'] = 1


data = pd.concat([nouns, adjs, verbs])
data = data.sample(frac=1)

normalized_data = (data - data.min())/(data.max() - data.min()) 

In [4]:
# target : gender
Y_N = np.asarray(normalized_data.noun)
Y_V = np.asarray(normalized_data.verb)
Y_A = np.asarray(normalized_data.adj)

# features : word embeddings dimensions
X = np.asarray(normalized_data.iloc[:, :512])

# split data into train and test sets
X_N_train, X_N_test, Y_N_train, Y_N_test = train_test_split(X, Y_N, test_size=0.2, random_state=42)
X_A_train, X_A_test, Y_A_train, Y_A_test = train_test_split(X, Y_A, test_size=0.2, random_state=42)
X_V_train, X_V_test, Y_V_train, Y_V_test = train_test_split(X, Y_V, test_size=0.2, random_state=42)

names = ['Noun vs Not Noun', 'Adj vs Not Adj', 'Verb vs not Verb']

In [5]:
train_features = [X_N_train, X_A_train, X_V_train]
test_features = [X_N_test, X_A_test, X_V_test]
train_targets = [Y_N_train, Y_A_train, Y_V_train]
test_targets = [Y_N_test, Y_A_test, Y_V_test]

### Tensors

In [6]:
# convert to tensors
train_features = [torch.tensor(x).float() for x in train_features]
test_features = [torch.tensor(x).float() for x in test_features]
train_targets = [torch.tensor(x).long() for x in train_targets]
test_targets = [torch.tensor(x).long() for x in test_targets]

In [7]:
from torch.utils.data import TensorDataset, DataLoader
# the TensorDataset is a ready to use class to represent your data as list of tensors. 
# Note that input_features and labels must match on the length of the first dimension
train_sets = [TensorDataset(X_train, Y_train) for X_train, Y_train in zip(train_features, train_targets)]
test_sets = [TensorDataset(X_valid, Y_valid) for X_valid, Y_valid in zip(test_features, test_targets)]

# DataLoader shuffles and batches the data and load its in parallel using multiprocessing workers
train_loaders = [DataLoader(train_set, batch_size=32, shuffle=True) for train_set in train_sets]
test_loaders = [DataLoader(test_set, batch_size=32) for test_set in test_sets]

## Classification

### NN definition

In [197]:
NNlist = []

for i in range(3):
    
    # create a fully connected perceptron with 1 input layer (512 features) and 1 output layer (2 classes)
    model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
    # define the loss function
    loss_fn = nn.CrossEntropyLoss()
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # add the model to the list
    NNlist.append([model, loss_fn, optimizer])

## Train the models

In [74]:
nb_epochs = 20

In [75]:
weights = [[], [], []]

In [76]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
        # define the loss function
        loss_fn = nn.CrossEntropyLoss()
        # define the optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        # put the model in training mode
        model.train()
        for epoch in range(nb_epochs):
            for X_train, Y_train in train_loaders[i]:
                # compute the model output
                Y_pred = model(X_train)
                # calculate loss
                loss = loss_fn(Y_pred, Y_train)
                # reset the gradients
                optimizer.zero_grad()
                # backpropagation
                loss.backward()
                # update model weights
                optimizer.step()

            if epoch % 10 == 0:
                print("--Epoch ", epoch, " Loss : ", loss.item())
        
        weights[i].append(model[0].weight.data.numpy()[1])

 Training model:  Noun vs Not Noun, run 0
--Epoch  0  Loss :  0.5869765877723694
--Epoch  10  Loss :  0.6973314881324768
 Training model:  Adj vs Not Adj, run 0
--Epoch  0  Loss :  0.6257501244544983
--Epoch  10  Loss :  0.5632616877555847
 Training model:  Verb vs not Verb, run 0
--Epoch  0  Loss :  0.6199825406074524
--Epoch  10  Loss :  0.5220359563827515
 Training model:  Noun vs Not Noun, run 1
--Epoch  0  Loss :  0.5855097770690918
--Epoch  10  Loss :  0.5742368102073669
 Training model:  Adj vs Not Adj, run 1
--Epoch  0  Loss :  0.5008059740066528
--Epoch  10  Loss :  0.5632617473602295
 Training model:  Verb vs not Verb, run 1
--Epoch  0  Loss :  0.6043552756309509
--Epoch  10  Loss :  0.4314430356025696
 Training model:  Noun vs Not Noun, run 2
--Epoch  0  Loss :  0.7129372954368591
--Epoch  10  Loss :  0.5634873509407043
 Training model:  Adj vs Not Adj, run 2
--Epoch  0  Loss :  0.5632551908493042
--Epoch  10  Loss :  0.37576207518577576
 Training model:  Verb vs not Verb, r

In [77]:
weights = np.abs(weights)

Noun dimensions ranking after 10 runs:

In [78]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [79]:
noun_weights['run'] = list(range(10))

In [80]:
for r in range(10):
    dims_sorted = np.argsort(weights[0][r])
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

In [81]:
noun_weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,run
0,227,396,478,198,185,63,345,305,42,406,...,132,434,21,317,78,452,474,140,176,0
1,158,366,460,202,231,38,356,277,109,397,...,133,430,33,296,92,412,464,136,100,1
2,230,381,453,261,195,9,311,292,80,383,...,97,437,15,320,87,424,467,212,170,2
3,207,342,480,246,198,0,321,261,112,381,...,57,434,18,341,123,435,478,237,86,3
4,208,347,487,299,213,5,324,327,54,374,...,167,437,89,355,28,411,456,191,113,4
5,142,382,467,173,167,64,337,236,103,388,...,116,417,32,314,106,428,472,219,71,5
6,143,376,460,257,224,17,382,286,64,398,...,95,436,102,342,110,453,466,163,150,6
7,176,344,482,255,230,3,313,267,159,380,...,117,443,62,339,65,421,468,157,168,7
8,207,340,457,286,225,48,373,212,59,395,...,131,406,55,362,122,456,435,241,127,8
9,119,368,478,257,176,87,382,311,90,355,...,102,444,17,335,97,457,477,141,205,9


Average ranking of dimensions after 10 runs:

In [82]:
noun_weights.iloc[:, :512].mean().sort_values(ascending=False)[:10]

159    510.6
409    510.4
305    507.8
275    507.7
378    507.2
387    506.5
260    505.2
465    504.3
462    501.3
374    500.2
dtype: float64

In [83]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

77     19.9
356    20.4
190    23.0
265    24.0
136    25.7
432    27.1
59     27.1
79     27.3
163    28.0
17     28.4
dtype: float64