In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential, Model
from IPython.display import display

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from transformers import TFCamembertModel, CamembertTokenizer, CamembertConfig
config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=False)
camembert = TFCamembertModel.from_pretrained("camembert-base", config=config)
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

Some layers from the model checkpoint at camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFCamembertModel were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta/pooler/dense/bias:0', 'roberta/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [158]:
# TODO faire une base d'eval en plus de train et test

Definition des models

In [2]:
class CustomCamemBERT(tf.keras.Model):
	def __init__(self, camembert, tokenizer):
		super().__init__()
		self.tokenizer = tokenizer
		self.cam = camembert
		self.GAP = GlobalAveragePooling1D()

	def call(self, inputs):
		l = tf.reshape(tf.convert_to_tensor(()), (0, 768))
		for sentence in inputs:
			# print("\nSentence:", sentence)
			encoded_sentence = tf.constant([tokenizer.encode(tokenizer.tokenize(sentence))], dtype=tf.int32)
			# print("Sentence encoded:", encoded_sentence.numpy())
			x = self.cam(encoded_sentence).last_hidden_state
			# print(x)
			x = self.GAP(x)
			# x = tf.reduce_mean(x, axis=1)
			# print("x", x)
			l = tf.concat([l, x], 0)
		# print("l", l)
		return l

# tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
# encoded_sentence = tf.constant([tokenizer.encode(tokenizer.tokenize("J'aime le camembert !"))], dtype=tf.int32)

# print(encoded_sentence)
# camembert(encoded_sentence)

def generator(latent_dim):
	noise = Input(shape=(latent_dim,), dtype=tf.float32)
	label = Input(shape=(1,), dtype=tf.float32)
	
	model = Sequential()
	model.add(Dense(256, input_dim=latent_dim))
	model.add(Dense(768))

	label_embedding = Flatten()(Embedding(2, latent_dim)(label))
	model_input = multiply([noise, label_embedding])

	out = model(model_input)

	return Model([noise, label], out)

def discriminator():
	hidden_rep = Input(shape=(768,), dtype=tf.float32)

	model = Sequential()
	model.add(Dense(256, input_dim=768))
	model.add(Dense(2))
	model.add(Activation("sigmoid"))
	
	out = model(hidden_rep)

	return Model(hidden_rep, out)

NLP_model = CustomCamemBERT(camembert, tokenizer)
NLP_model.trainable = False

# display(model("J'aime le camembert !"))
display(NLP_model(["J'aime pas le camembert !", "J'aime le camembert !"]))

# sup1 = sup[sup[:, 1] == 1, :]
# sup0 = sup[sup[:, 1] == 0, :]


# a = select_real_samples(sup1, 2)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.00872202, -0.08576093,  0.02685546, ..., -0.08810256,
         0.01628952,  0.07871367],
       [ 0.00422955, -0.08892401,  0.02005171, ..., -0.10487113,
        -0.00174052,  0.0599581 ]], dtype=float32)>

1. Préparation des données

In [3]:
sup = pd.read_csv("./supervise.csv")
# display(sup.head())
sup = sup[["text", "label_shufan"]]
# display(sup.loc[sup["label_shufan"] == 1]["text"])
# sup['text'] = sup['text'].apply(lambda t: tokenizer.encode(tokenizer.tokenize(t)))
# Feature extraction
#sup['text'] = sup['text'].apply(lambda t: NLP_model([t]))
features = NLP_model(sup[["text"]].to_numpy().reshape(-1,).tolist())
display(sup)
sup['text'] = features.numpy().tolist()
display(sup)
sup.to_csv("./featured.csv", header=False, index=False)
sup = sup.to_numpy()

Unnamed: 0,text,label_shufan
0,"Ces mêmes insectes qui, soit mangent la graine...",0
1,#insectID Est-ce que vous pouvez m'aider à ide...,0
2,"Le gourbi kabyle, 1890 - Jules-Charles-Clément...",0
3,Ce tweet qui résume parfaitement la situation ...,0
4,Réveil spinosad est utilisé en bio en mais sur...,0
...,...,...
239,"Mycotoxines, oidium, mildiou, doryphore, taupi...",0
240,".... et le taupin,les pucerons et leurs virus ...",0
241,happy family days Shooting hivernale tout en d...,0
242,happy family days Shooting hivernale tout en d...,0


Unnamed: 0,text,label_shufan
0,"[0.04667048901319504, -0.0020412157755345106, ...",0
1,"[0.04596919193863869, -0.0014597502304241061, ...",0
2,"[0.03022572211921215, 0.08436614274978638, 0.0...",0
3,"[0.018831443041563034, -0.03856367617845535, -...",0
4,"[0.03444313257932663, 0.19099833071231842, -0....",0
...,...,...
239,"[0.012171024456620216, 0.12850446999073029, -0...",0
240,"[0.0017464796546846628, 0.0735335424542427, -0...",0
241,"[0.05000726133584976, 0.08253184705972672, -0....",0
242,"[0.04931621998548508, 0.07705958187580109, -0....",0


In [4]:
nonsup = pd.read_csv("./nonsupervise.csv")

In [10]:
latent_dim = 100

def generator_loss(y_true, y_pred):
	return NotImplemented

def generate_noise(n_batch, latent_dim):
	return np.random.randn(latent_dim * n_batch).reshape((n_batch, latent_dim))

G = generator(latent_dim)
D = discriminator()
n_batch = 1
noise = generate_noise(n_batch, latent_dim)
label = np.repeat(0, n_batch).reshape(n_batch, 1)
print(noise.shape, label.shape)
G([noise, label])


(1, 100) (1, 1)


<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[ 2.04546899e-02, -3.41128884e-03, -1.48330955e-02,
         5.43523114e-03, -3.58823314e-02, -1.19235553e-02,
        -1.50728002e-02,  1.45085445e-02,  3.76014342e-03,
         6.36793347e-03, -1.31925233e-02,  2.40269974e-02,
        -1.02936057e-03,  2.41885916e-03,  2.47671222e-03,
        -5.55145089e-03,  2.79429369e-02,  3.34561779e-03,
         4.18128166e-03, -1.57661568e-02,  1.75950583e-03,
        -1.78138446e-02, -4.50865738e-03,  9.45626944e-03,
         1.24353366e-02,  8.67690518e-03,  9.15837195e-03,
        -1.50609175e-02,  5.98522788e-03, -1.93752944e-02,
        -4.20917291e-03, -1.42712211e-02, -1.91901438e-03,
        -6.65744301e-05,  4.13274299e-03, -8.69758427e-03,
         1.05074579e-02, -1.46139516e-02,  2.30597351e-02,
         4.80119139e-03, -9.79346782e-03,  7.92679377e-03,
        -2.04897225e-02,  5.45336679e-03, -6.90681767e-03,
        -9.11845081e-03, -8.24315380e-03, -2.08778144e-03,
      

In [101]:
D = discriminator()
D.compile(loss="mse")
D.trainable = False
# display(D.summary())

latent_dim = 100
G = generator(latent_dim)
# display(G.summary())

# GAN = tf.keras.Sequential([G, D], name="GAN")
# GAN.compile(loss='mse')
# GAN(generate_noise(1, 100))
# GAN.summary()

noise = Input(shape=(latent_dim,))
label = Input(shape=(1,))
hidden_rep = G([noise, label])
validity = D(hidden_rep)

# connect them
GAN = Model([noise, label], validity)
GAN.compile(loss='mse')



In [13]:
def generate_fake_samples(n_batch, labelN, generator, latent_dim):
	labels = np.repeat(labelN, n_batch)
	X_fake = generator([generate_noise(n_batch, latent_dim), labels])
	y_fake = np.array([labels, np.repeat(0, n_batch)]).reshape(2, n_batch).T
	return X_fake, y_fake

def select_real_samples(n_batch, labelN, dataset):
	dataset = dataset[dataset[:, 1] == labelN, :]
	ind = np.random.choice(len(dataset), size=n_batch, replace=False)
	labels = np.repeat(labelN, n_batch)
	X_real = np.array(dataset[ind, 0].tolist())
	y_real =  np.array([labels, np.repeat(1, n_batch)]).reshape(2, n_batch).T
	return X_real, y_real

def train_d(dataset, n_batch):
	# (n_batch, 768), ([label, validité])
	X_fake, y_fake = generate_fake_samples(n_batch, 0, generator=G, latent_dim=100)
	d_fake_0 = D.train_on_batch(X_fake, y_fake)

	X_fake, y_fake = generate_fake_samples(n_batch, 1, generator=G, latent_dim=100)
	d_fake_1 = D.train_on_batch(X_fake, y_fake)

	X_real, y_real = select_real_samples(n_batch, 0, dataset=dataset)
	d_real_0 = D.train_on_batch(X_real, y_real)

	X_real, y_real = select_real_samples(n_batch, 1, dataset=dataset)
	d_real_1 = D.train_on_batch(X_real, y_real)

	return (1 / 4) * (d_fake_0 + d_fake_1 + d_real_0 + d_real_1)

def train_g(n_batch):
	labels = np.repeat(1, n_batch)
	noise = generate_noise(n_batch, latent_dim)
	g_loss_1 = GAN.train_on_batch([noise, labels], np.array([labels, np.repeat(0, n_batch)]).reshape(2, n_batch).T)

	labels = np.repeat(0, n_batch)
	noise = generate_noise(n_batch, latent_dim)
	g_loss_0 = GAN.train_on_batch([noise, labels], np.array([labels, np.repeat(1, n_batch)]).reshape(2, n_batch).T)

	return (1 / 2) * (g_loss_1 + g_loss_0)

In [15]:
n_batch = 2
labelN = 1

x, y = select_real_samples(n_batch, labelN, dataset=sup)
print(x.shape, y.shape)
print(y)

x, y = generate_fake_samples(n_batch, labelN, generator=G, latent_dim=100)
print(x.shape, y.shape)
print(y)


(2, 768) (2, 2)
[[1 1]
 [1 1]]
(2, 768) (2, 2)
[[1 0]
 [1 0]]


In [59]:
def discriminator_loss(y_true, y_pred):
	print(y_true.numpy())
	print(y_pred.numpy())
	return 0

def discriminator():
	m = Sequential()
	m.add(Dense(256))
	m.add(Dense(2, activation="sigmoid"))
	return m


#D = discriminator()
#D.compile(loss="mse", run_eagerly=True)

X_real, y_real = select_real_samples(1, 0, dataset=sup)
D.evaluate(X_real, y_real)
print(D(X_real), y_real)


tf.Tensor([[0.52395815 0.5486245 ]], shape=(1, 2), dtype=float32) [[0 1]]


In [32]:
v, c = np.unique(sup[:,1], return_counts=True)
print(v)
print(c)


[0 1]
[206  38]


19.0

In [60]:
sup_train = np.concatenate([sup[sup[:,1] == 1][:19], sup[sup[:,1] == 0][:19]])
sup_test  = np.concatenate([sup[sup[:,1] == 1][19:], sup[sup[:,1] == 0][19:]])
sup_test.shape, sup_train.shape

((206, 2), (38, 2))

In [64]:
epochs = 300
n_batch = 1
for e in range(epochs):
	d_loss = train_d(sup_train, n_batch)
	g_loss = train_g(n_batch)
	
	X = np.array(sup_train[:,0].tolist())
	labels = sup_train[:,1]
	realite = np.ones((len(labels),))
	train_loss = D.evaluate(X, np.array([labels, realite], dtype=np.float32).T, verbose=0)
	
	X = np.array(sup_test[:,0].tolist())
	labels = sup_test[:,1]
	realite = np.ones((len(labels),))
	test_loss = D.evaluate(X, np.array([labels, realite], dtype=np.float32).T, verbose=0)
	print(e, [np.round(g_loss, 2), np.round(d_loss, 2), np.round(train_loss, 2), np.round(test_loss, 2)])

0 [0.26, 0.26, 0.13, 0.15]
1 [0.27, 0.24, 0.13, 0.15]
2 [0.28, 0.21, 0.13, 0.2]
3 [0.25, 0.21, 0.12, 0.19]
4 [0.26, 0.24, 0.12, 0.18]
5 [0.37, 0.23, 0.12, 0.17]
6 [0.23, 0.22, 0.12, 0.18]
7 [0.3, 0.21, 0.11, 0.17]
8 [0.19, 0.18, 0.11, 0.16]
9 [0.23, 0.19, 0.11, 0.16]
10 [0.36, 0.23, 0.1, 0.15]
11 [0.28, 0.16, 0.1, 0.16]
12 [0.23, 0.17, 0.11, 0.16]
13 [0.27, 0.12, 0.11, 0.15]
14 [0.24, 0.14, 0.11, 0.16]
15 [0.34, 0.09, 0.11, 0.17]
16 [0.32, 0.13, 0.12, 0.12]
17 [0.27, 0.29, 0.12, 0.1]
18 [0.32, 0.16, 0.11, 0.15]
19 [0.26, 0.13, 0.12, 0.2]
20 [0.28, 0.14, 0.11, 0.15]
21 [0.23, 0.15, 0.12, 0.23]
22 [0.28, 0.21, 0.12, 0.23]
23 [0.33, 0.21, 0.11, 0.22]
24 [0.27, 0.08, 0.1, 0.17]
25 [0.36, 0.17, 0.1, 0.15]
26 [0.32, 0.13, 0.1, 0.19]
27 [0.34, 0.11, 0.1, 0.2]
28 [0.3, 0.2, 0.09, 0.17]
29 [0.37, 0.18, 0.11, 0.24]
30 [0.32, 0.12, 0.09, 0.19]
31 [0.33, 0.09, 0.09, 0.14]
32 [0.32, 0.14, 0.11, 0.24]
33 [0.34, 0.18, 0.1, 0.21]
34 [0.25, 0.13, 0.09, 0.13]
35 [0.4, 0.07, 0.09, 0.11]
36 [0.27, 0.17, 0

pour evaluer on reprend la base de donné supervisé, on ballance les feature au discriminateur sans les label et on regarde s'il trouve les meme label et qu'il dit qu'il sont des donné reel (cad données non générées)

In [107]:
X = np.array(sup_train[:,0].tolist())
labels = sup_train[:,1]
realite = np.ones((len(labels),))
D.evaluate(X, np.array([labels, realite], dtype=np.float32).T)
# np.round(D(X).numpy()[:,0], 2), labels
loss_train_label = tf.keras.losses.BinaryCrossentropy()(D(X).numpy()[:,0], tf.constant(labels, dtype=tf.float32))
loss_train_validity = tf.keras.losses.BinaryCrossentropy()(D(X).numpy()[:,1], tf.constant(realite, dtype=tf.float32))
print(loss_train_label.numpy(), loss_train_validity.numpy())

7.529764 8.29346


In [106]:
X = np.array(sup_test[:,0].tolist())
labels = sup_test[:,1]
realite = np.ones((len(labels),))
D.evaluate(X, np.array([labels, realite], dtype=np.float32).T)
# np.round(D(X).numpy()[:,0], 2), labels
loss_test_label = tf.keras.losses.BinaryCrossentropy()(D(X).numpy()[:,0], tf.constant(labels, dtype=tf.float32))
loss_test_validity = tf.keras.losses.BinaryCrossentropy()(D(X).numpy()[:,1], tf.constant(realite, dtype=tf.float32))
print(loss_test_label.numpy(), loss_test_validity.numpy())

8.524452 8.151393
