In [1]:
import torch.nn as nn
import torch 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

# NN Classification (Perceptron) : Noun/ Not-Noun

## Data preparation

### Load data: NOUN, ADJ, and both

In [2]:
nouns = pd.read_csv('../Data/FlauBERT_WE/all_nouns_we.csv', index_col=0).drop(columns=['gender', 'number'])
nouns['noun'] = 1
nouns['verb'] = 0
nouns['adj'] = 0

verbs = pd.read_csv('../Data/FlauBERT_WE/all_verb_we.csv', index_col=0)
verbs['noun'] = 0
verbs['verb'] = 1
verbs['adj'] = 0

adjs = pd.read_csv('../Data/FlauBERT_WE/all_adjectives_we.csv', index_col=0).drop(columns=['gender', 'number'])
adjs['noun'] = 0
adjs['verb'] = 0
adjs['adj'] = 1


data = pd.concat([nouns, adjs, verbs])
data = data.sample(frac=1)

normalized_data = (data - data.min())/(data.max() - data.min()) 

In [3]:
# target : gender
Y_N = np.asarray(normalized_data.noun)
Y_V = np.asarray(normalized_data.verb)
Y_A = np.asarray(normalized_data.adj)

# features : word embeddings dimensions
X = np.asarray(normalized_data.iloc[:, :512])

# split data into train and test sets
X_N_train, X_N_test, Y_N_train, Y_N_test = train_test_split(X, Y_N, test_size=0.2, random_state=42)
X_A_train, X_A_test, Y_A_train, Y_A_test = train_test_split(X, Y_A, test_size=0.2, random_state=42)
X_V_train, X_V_test, Y_V_train, Y_V_test = train_test_split(X, Y_V, test_size=0.2, random_state=42)

names = ['Noun vs Not Noun', 'Adj vs Not Adj', 'Verb vs not Verb']

In [4]:
train_features = [X_N_train, X_A_train, X_V_train]
test_features = [X_N_test, X_A_test, X_V_test]
train_targets = [Y_N_train, Y_A_train, Y_V_train]
test_targets = [Y_N_test, Y_A_test, Y_V_test]

### Tensors

In [27]:
# convert to tensors
train_features = [torch.tensor(x).float() for x in train_features]
test_features = [torch.tensor(x).float() for x in test_features]
train_targets = [torch.tensor(x).long() for x in train_targets]
test_targets = [torch.tensor(x).long() for x in test_targets]

In [28]:
from torch.utils.data import TensorDataset, DataLoader
# the TensorDataset is a ready to use class to represent your data as list of tensors. 
# Note that input_features and labels must match on the length of the first dimension
train_sets = [TensorDataset(X_train, Y_train) for X_train, Y_train in zip(train_features, train_targets)]
test_sets = [TensorDataset(X_valid, Y_valid) for X_valid, Y_valid in zip(test_features, test_targets)]

# DataLoader shuffles and batches the data and load its in parallel using multiprocessing workers
train_loaders = [DataLoader(train_set, batch_size=32, shuffle=True) for train_set in train_sets]
test_loaders = [DataLoader(test_set, batch_size=32) for test_set in test_sets]

## Classification

### NN definition

In [29]:
NNlist = []

for i in range(3):
    
    # create a fully connected perceptron with 1 input layer (512 features) and 1 output layer (2 classes)
    model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
    # define the loss function
    loss_fn = nn.CrossEntropyLoss()
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # add the model to the list
    NNlist.append([model, loss_fn, optimizer])

## Train the models

In [30]:
nb_epochs = 20

In [31]:
weights = [[], [], []]

In [32]:
# train the models 

for y in range(10):
    for i in range(3):
        print(f" Training model:  {names[i]}, run {y}")
        
        model = nn.Sequential(nn.Linear(512, 2), nn.Softmax(dim=1))
        # define the loss function
        loss_fn = nn.CrossEntropyLoss()
        # define the optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        # put the model in training mode
        model.train()
        for epoch in range(nb_epochs):
            for X_train, Y_train in train_loaders[i]:
                # compute the model output
                Y_pred = model(X_train)
                # calculate loss
                loss = loss_fn(Y_pred, Y_train)
                # reset the gradients
                optimizer.zero_grad()
                # backpropagation
                loss.backward()
                # update model weights
                optimizer.step()

            if epoch % 10 == 0:
                print("--Epoch ", epoch, " Loss : ", loss.item())
        
        weights[i].append(model[0].weight.data.numpy()[1])

 Training model:  Noun vs Not Noun, run 0
--Epoch  0  Loss :  0.5742295980453491
--Epoch  10  Loss :  0.5159028768539429
 Training model:  Adj vs Not Adj, run 0
--Epoch  0  Loss :  0.43832382559776306
--Epoch  10  Loss :  0.438262015581131
 Training model:  Verb vs not Verb, run 0
--Epoch  0  Loss :  0.6563544273376465
--Epoch  10  Loss :  0.5942022800445557
 Training model:  Noun vs Not Noun, run 1
--Epoch  0  Loss :  0.5531182885169983
--Epoch  10  Loss :  0.5248103141784668
 Training model:  Adj vs Not Adj, run 1
--Epoch  0  Loss :  0.7506886720657349
--Epoch  10  Loss :  0.5007619261741638
 Training model:  Verb vs not Verb, run 1
--Epoch  0  Loss :  0.536217212677002
--Epoch  10  Loss :  0.5251181125640869
 Training model:  Noun vs Not Noun, run 2
--Epoch  0  Loss :  0.5758458375930786
--Epoch  10  Loss :  0.6964913606643677
 Training model:  Adj vs Not Adj, run 2
--Epoch  0  Loss :  0.5007773041725159
--Epoch  10  Loss :  0.4382621943950653
 Training model:  Verb vs not Verb, run

In [33]:
weights = np.abs(weights)

## Nouns vs non-Nouns

In [43]:
noun_weights = pd.DataFrame(columns=list(range(512)))

In [44]:
noun_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[0][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        noun_weights.iloc[r, dims_sorted[i]] = i

In [45]:
noun_weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,run
0,328,166,54,187,332,485,149,365,389,151,...,448,83,385,145,475,65,33,222,447,0
1,225,131,60,190,308,496,194,293,299,132,...,406,100,347,161,463,71,35,200,364,1
2,278,200,51,238,245,481,172,314,332,178,...,395,85,367,157,463,65,55,210,398,2
3,348,211,39,265,370,450,217,338,349,170,...,423,48,311,169,381,85,57,192,494,3
4,324,150,28,173,302,468,229,310,342,120,...,450,73,314,167,416,67,50,193,385,4
5,321,187,26,181,280,445,227,391,322,170,...,441,62,340,145,448,96,33,213,371,5
6,316,138,64,229,315,496,164,292,374,130,...,393,55,350,123,429,93,71,223,387,6
7,324,171,47,235,311,488,158,294,299,166,...,394,53,285,145,472,74,43,268,403,7
8,348,133,24,235,337,480,201,378,427,146,...,506,80,335,114,471,69,43,215,430,8
9,356,186,59,264,296,509,232,376,368,158,...,396,86,339,164,485,67,48,233,453,9


Average ranking of dimensions after 10 runs for **NOUN** vs **non-NOUN**:

In [51]:
noun_weights.iloc[:, :512].mean().sort_values()[:10]

159     0.4
409     0.6
305     2.2
465     3.8
275     3.9
378     4.4
260     6.1
387     8.0
462     9.4
37     10.0
dtype: float64

## Verb vs non-Verbs

In [47]:
verb_weights = pd.DataFrame(columns=list(range(512)))

In [60]:
verb_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[2][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        verb_weights.iloc[r, dims_sorted[i]] = i

In [61]:
verb_weights.iloc[:, :512].mean().sort_values()[:10]

192    0.0
310    1.6
378    2.0
508    4.2
480    4.8
158    5.0
159    6.7
175    8.1
89     9.1
282    9.3
dtype: float64

## Adj vs non-Adjs

In [63]:
adj_weights = pd.DataFrame(columns=list(range(512)))

In [64]:
adj_weights['run'] = list(range(10))
for r in range(10):
    dims_sorted = [x[0] for x in sorted(enumerate(weights[1][r]), key=lambda x: abs(x[1]), reverse=True)]
    for i in range(len(dims_sorted)):
        adj_weights.iloc[r, dims_sorted[i]] = i

In [65]:
adj_weights.iloc[:, :512].mean().sort_values()[:10]

256    132.8
133    136.1
426    136.3
98     140.0
310    144.5
381    144.7
12     144.7
188    146.1
1      146.8
412    150.8
dtype: float64