In [6]:
from sklearn.linear_model import LogisticRegression 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

## Data preparation

### Load data: NOUN, ADJ, and both

In [2]:
def _load_number_dataset(csv_path):
    """Read one word-embedding CSV and drop its ``gender`` column.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that contains at least the columns ``Word``,
        ``number`` and ``gender`` next to the embedding dimensions.

    Returns
    -------
    pandas.DataFrame
        The table without the ``gender`` column.
    """
    return pd.read_csv(csv_path).drop(columns=["gender"])


def _number_xy(df):
    """Build min-max scaled features and the binary number target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Word`` column, a ``number`` column and one column per
        embedding dimension.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``features``: every column except ``Word``/``number``, rescaled
        column-wise to the [0, 1] interval.
        ``target``: 1 where ``number == "singular"``, 0 otherwise.
    """
    # target : number
    target = (df["number"] == "singular").astype("int64")

    # features : word embeddings dimensions
    features = df.drop(columns=["Word", "number"])

    # normalize data to be between 0 and 1
    col_min = features.min()
    col_range = features.max() - col_min
    # A constant dimension has a zero range; dividing by it would fill the
    # column with NaN and make the later model fit fail. Dividing by 1
    # instead maps such a column to 0.0 and leaves all others unchanged.
    col_range = col_range.replace(0, 1)
    features = (features - col_min) / col_range

    return features, target


df_nouns_we = _load_number_dataset('../Data/FlauBERT_WE/all_nouns_we.csv')
df_adj_we = _load_number_dataset('../Data/FlauBERT_WE/all_adjectives_we.csv')
df_both_we = pd.concat([df_nouns_we, df_adj_we], ignore_index=True)

X_nb_N, Y_nb_N = _number_xy(df_nouns_we)
X_nb_A, Y_nb_A = _number_xy(df_adj_we)
X_nb_both, Y_nb_both = _number_xy(df_both_we)

# NOTE(review): the min/max used for scaling are computed on each full table
# *before* the split below, so held-out rows influence how training rows are
# scaled. If test-set metrics are ever reported from these splits, compute the
# scaling statistics on the training rows only.

# split data into train and test sets
X_nb_N_train, X_nb_N_test, Y_nb_N_train, Y_nb_N_test = train_test_split(X_nb_N, Y_nb_N, test_size=0.2, random_state=42)
X_nb_A_train, X_nb_A_test, Y_nb_A_train, Y_nb_A_test = train_test_split(X_nb_A, Y_nb_A, test_size=0.2, random_state=42)
X_nb_both_train, X_nb_both_test, Y_nb_both_train, Y_nb_both_test = train_test_split(X_nb_both, Y_nb_both, test_size=0.2, random_state=42)

In [3]:
# Every list below holds one entry per experiment, always in the order
# nouns -> adjectives -> both, so index i addresses the same dataset in each.
names = ['Number: Noun', 'Number: Adjs', 'Number: Both']

_splits_by_dataset = [
    (X_nb_N_train, X_nb_N_test, Y_nb_N_train, Y_nb_N_test),
    (X_nb_A_train, X_nb_A_test, Y_nb_A_train, Y_nb_A_test),
    (X_nb_both_train, X_nb_both_test, Y_nb_both_train, Y_nb_both_test),
]

# Transpose the dataset-major tuples into one list per split role.
train_feature, test_feature, train_target, test_target = (
    list(role) for role in zip(*_splits_by_dataset)
)

## Train the models

In [4]:
# One independent, initially empty list of coefficient vectors per dataset
# (nouns, adjectives, both).
weights = [[] for _ in range(3)]

In [8]:
# Start from empty per-dataset lists on every execution of this cell.
# Without the reset, running the cell a second time appends another batch of
# runs to the existing lists, and it fails outright once `weights` has been
# converted to an ndarray by the absolute-value cell further down.
weights = [[] for _ in names]

# NOTE(review): LogisticRegression only uses `random_state` with the "sag",
# "saga" and "liblinear" solvers. With the default solver ("lbfgs") the seed
# has no effect, so the ten runs below are expected to produce identical
# coefficients. If run-to-run variation is wanted, vary something that does
# affect the fit (e.g. the train/test split seed or a stochastic solver).
for y in range(10):
    for i, name in enumerate(names):
        print(f" Training model:  {name}, run {y}")

        clf = LogisticRegression(random_state=y, max_iter=1000)
        clf.fit(train_feature[i], train_target[i])

        # The target is binary, so coef_ holds a single row of
        # per-feature weights; keep that row for this run.
        weights[i].append(clf.coef_[0])

 Training model:  Number: Noun, run 0
 Training model:  Number: Adjs, run 0
 Training model:  Number: Both, run 0
 Training model:  Number: Noun, run 1
 Training model:  Number: Adjs, run 1
 Training model:  Number: Both, run 1
 Training model:  Number: Noun, run 2
 Training model:  Number: Adjs, run 2
 Training model:  Number: Both, run 2
 Training model:  Number: Noun, run 3
 Training model:  Number: Adjs, run 3
 Training model:  Number: Both, run 3
 Training model:  Number: Noun, run 4
 Training model:  Number: Adjs, run 4
 Training model:  Number: Both, run 4
 Training model:  Number: Noun, run 5
 Training model:  Number: Adjs, run 5
 Training model:  Number: Both, run 5
 Training model:  Number: Noun, run 6
 Training model:  Number: Adjs, run 6
 Training model:  Number: Both, run 6
 Training model:  Number: Noun, run 7
 Training model:  Number: Adjs, run 7
 Training model:  Number: Both, run 7
 Training model:  Number: Noun, run 8
 Training model:  Number: Adjs, run 8
 Training mo

In [9]:
# Drop the sign of every coefficient; the nested per-dataset/per-run lists
# are converted to a single ndarray in the process.
weights = np.absolute(np.asarray(weights))

## Number: Nouns

In [10]:
# Empty rank table for the noun model with integer column labels 0..511
# (assumes 512 embedding dimensions - TODO confirm against the feature count).
noun_weights = pd.DataFrame(columns=[dim for dim in range(512)])

In [11]:
# One row per training run; `run` is appended after the rank columns.
noun_weights['run'] = list(range(10))
for run_idx in range(10):
    # Dimension indices ordered from largest to smallest |weight|. The stable
    # sort keeps tied dimensions in index order, exactly like
    # sorted(..., reverse=True) does.
    order = np.argsort(-np.abs(weights[0][run_idx]), kind="stable")

    # Invert the permutation: ranks[d] is the rank of dimension d
    # (0 = largest absolute weight).
    ranks = np.empty(len(order), dtype=int)
    ranks[order] = np.arange(len(order))

    # Write the whole row at once instead of one scalar .iloc write per cell.
    noun_weights.iloc[run_idx, :len(ranks)] = ranks

In [12]:
# Show the noun rank table: one row per run, one column per dimension
# (value = rank of that dimension, 0 = largest absolute weight), plus `run`.
noun_weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,run
0,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,0
1,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,1
2,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,2
3,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,3
4,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,4
5,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,5
6,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,6
7,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,7
8,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,8
9,483,61,429,140,266,342,194,47,175,88,...,447,369,276,150,124,442,500,283,142,9


In [13]:
# Average each dimension's rank over all runs (the :512 slice leaves out the
# trailing `run` column) and show the ten dimensions with the lowest mean rank.
noun_mean_rank = noun_weights.iloc[:, :512].mean()
noun_mean_rank.sort_values().iloc[:10]

310    0.0
54     1.0
208    2.0
384    3.0
359    4.0
158    5.0
81     6.0
285    7.0
182    8.0
172    9.0
dtype: float64

## Number: Adjs

In [14]:
# Empty rank table for the adjective model with integer column labels 0..511
# (assumes 512 embedding dimensions - TODO confirm against the feature count).
adj_weights = pd.DataFrame(columns=[dim for dim in range(512)])

In [15]:
# One row per training run; `run` is appended after the rank columns.
adj_weights['run'] = list(range(10))
for run_idx in range(10):
    # Dimension indices ordered from largest to smallest |weight|. The stable
    # sort keeps tied dimensions in index order, exactly like
    # sorted(..., reverse=True) does.
    order = np.argsort(-np.abs(weights[1][run_idx]), kind="stable")

    # Invert the permutation: ranks[d] is the rank of dimension d
    # (0 = largest absolute weight).
    ranks = np.empty(len(order), dtype=int)
    ranks[order] = np.arange(len(order))

    # Write the whole row at once instead of one scalar .iloc write per cell.
    adj_weights.iloc[run_idx, :len(ranks)] = ranks

In [16]:
# Average each dimension's rank over all runs (the :512 slice leaves out the
# trailing `run` column) and show the ten dimensions with the lowest mean rank.
adj_mean_rank = adj_weights.iloc[:, :512].mean()
adj_mean_rank.sort_values().iloc[:10]

310    0.0
54     1.0
384    2.0
359    3.0
285    4.0
81     5.0
200    6.0
360    7.0
455    8.0
192    9.0
dtype: float64

## Number: both

In [17]:
# Empty rank table for the combined model with integer column labels 0..511
# (assumes 512 embedding dimensions - TODO confirm against the feature count).
both_weights = pd.DataFrame(columns=[dim for dim in range(512)])

In [18]:
# One row per training run; `run` is appended after the rank columns.
both_weights['run'] = list(range(10))
for run_idx in range(10):
    # Dimension indices ordered from largest to smallest |weight|. The stable
    # sort keeps tied dimensions in index order, exactly like
    # sorted(..., reverse=True) does.
    order = np.argsort(-np.abs(weights[2][run_idx]), kind="stable")

    # Invert the permutation: ranks[d] is the rank of dimension d
    # (0 = largest absolute weight).
    ranks = np.empty(len(order), dtype=int)
    ranks[order] = np.arange(len(order))

    # Write the whole row at once instead of one scalar .iloc write per cell.
    both_weights.iloc[run_idx, :len(ranks)] = ranks

In [19]:
# Average each dimension's rank over all runs (the :512 slice leaves out the
# trailing `run` column) and show the ten dimensions with the lowest mean rank.
both_mean_rank = both_weights.iloc[:, :512].mean()
both_mean_rank.sort_values().iloc[:10]

310    0.0
54     1.0
384    2.0
208    3.0
359    4.0
81     5.0
360    6.0
182    7.0
158    8.0
285    9.0
dtype: float64