In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential, Model
from IPython.display import display

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from transformers import TFCamembertModel, CamembertTokenizer, CamembertConfig
config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=False)
camembert = TFCamembertModel.from_pretrained("camembert-base", config=config)
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

from matplotlib import pyplot as plt

Some layers from the model checkpoint at camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFCamembertModel were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta/pooler/dense/bias:0', 'roberta/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# TODO :
# faire une base d'evaluation en plus de train et test
# implementer un systeme d'epoch complet avec tf.dataset
# ajouter les données non supervisées au discriminateur => extraction des feature des texte non labelisé à faire
# annulation de la loss du label parametrable (lors de l'utilisation des data non supervisé), faisable en ajoutant des boolean au discriminateur faisant office de mask
# Masking loss : https://stackoverflow.com/questions/64130293/custom-loss-function-in-keras-with-masking-array-as-input

#multi output loss https://datascience.stackexchange.com/questions/86700/custom-loss-function-with-multiple-outputs-in-tensorflow

In [3]:
class CustomCamemBERT(tf.keras.Model):
    def __init__(self, camembert, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.cam = camembert
        self.GAP = GlobalAveragePooling1D()

    def call(self, inputs):
        l = tf.reshape(tf.convert_to_tensor(()), (0, 768))
        for sentence in inputs:
            # print("\nSentence:", sentence)
            encoded_sentence = tf.constant([tokenizer.encode(tokenizer.tokenize(sentence))], dtype=tf.int32)
            # print("Sentence encoded:", encoded_sentence.numpy())
            x = self.cam(encoded_sentence).last_hidden_state
            # print(x)
            x = self.GAP(x)
            # x = tf.reduce_mean(x, axis=1)
            # print("x", x)
            l = tf.concat([l, x], 0)
        # print("l", l)
        return l

# tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
# encoded_sentence = tf.constant([tokenizer.encode(tokenizer.tokenize("J'aime le camembert !"))], dtype=tf.int32)

# print(encoded_sentence)
# camembert(encoded_sentence)
def generator(latent_dim):
    noise = Input(shape=(latent_dim,), name="noise_input", dtype=tf.float32)
    label = Input(shape=(1,), name="label_input", dtype=tf.float32) 
    
    label_embedding = Flatten()(Embedding(2, latent_dim, name="label_embeding")(label))
    model_input = Multiply(name="mult_label_noise")([noise, label_embedding])
    
    x = Dense(256, input_dim=latent_dim)(model_input)
    out = Dense(768, name="Generated_Hidden_rep")(x)
    
    return Model([noise, label], out, name="Generator")

def discriminator():
    hidden_rep = Input(shape=(768,), dtype=tf.float32, name="Hidden_rep_Input")

    x = Dense(256)(hidden_rep)
    out = Dense(2, name="Prediction", activation="sigmoid")(x)    

    return Model(hidden_rep, out, name="Discriminator")

NLP_model = CustomCamemBERT(camembert, tokenizer)
NLP_model.trainable = False

# display(model("J'aime le camembert !"))
display(NLP_model(["J'aime pas le camembert !", "J'aime le camembert !"]))

# sup1 = sup[sup[:, 1] == 1, :]
# sup0 = sup[sup[:, 1] == 0, :]


# a = select_real_samples(sup1, 2)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.00872202, -0.08576093,  0.02685546, ..., -0.08810256,
         0.01628952,  0.07871367],
       [ 0.00422955, -0.08892401,  0.02005171, ..., -0.10487113,
        -0.00174052,  0.0599581 ]], dtype=float32)>

In [4]:
if(os.path.isfile('./featured.csv')):
    sup = np.genfromtxt('featured.csv', delimiter=',')
else:
    sup = pd.read_csv("./supervise.csv")[["text", "label_shufan"]]
    features = NLP_model(sup[["text"]].to_numpy().reshape(-1,).tolist())
    features = features.numpy()
    labels = np.array([sup["label_shufan"]]).T
    sup = np.concatenate([labels, features], axis=1)
    np.savetxt('featured.csv', sup, delimiter=",")


In [5]:
latent_dim = 100
def generate_noise(n_batch, latent_dim):
    	return np.random.randn(latent_dim * n_batch).reshape((n_batch, latent_dim))
 
def generate_fake_samples(n_batch, labelN, generator, latent_dim):
	labels = np.repeat(labelN, n_batch)
	X_fake = generator([generate_noise(n_batch, latent_dim), labels])
	y_fake = np.array([labels, np.repeat(0, n_batch)]).reshape(2, n_batch).T
	return X_fake, y_fake

def select_real_samples(n_batch, labelN, dataset):
	dataset = dataset[dataset[:, 0] == labelN, :]
	ind = np.random.choice(len(dataset), size=n_batch, replace=False)
	labels = np.repeat(labelN, n_batch)
	X_real = np.array(dataset[ind, 1:].tolist())
	y_real =  np.array([labels, np.repeat(1, n_batch)]).reshape(2, n_batch).T
	return X_real, y_real

def train_d(discriminator, dataset, n_batch):
	# (n_batch, 768), ([label, validité])
	X_fake, y_fake = generate_fake_samples(n_batch, 0, generator=G, latent_dim=100)
	d_fake_0 = discriminator.train_on_batch(X_fake, y_fake)

	X_fake, y_fake = generate_fake_samples(n_batch, 1, generator=G, latent_dim=100)
	d_fake_1 = discriminator.train_on_batch(X_fake, y_fake)

	X_real, y_real = select_real_samples(n_batch, 0, dataset=dataset)
	d_real_0 = discriminator.train_on_batch(X_real, y_real)

	X_real, y_real = select_real_samples(n_batch, 1, dataset=dataset)
	d_real_1 = discriminator.train_on_batch(X_real, y_real)

	return (1 / 4) * (d_fake_0[0] + d_fake_1[0] + d_real_0[0] + d_real_1[0])

def train_g(gan, n_batch):
	labels = np.repeat(1, n_batch)
	noise = generate_noise(n_batch, latent_dim)
	g_loss_1 = gan.train_on_batch([noise, labels], np.array([labels, np.repeat(1, n_batch)]).reshape(2, n_batch).T)

	labels = np.repeat(0, n_batch)
	noise = generate_noise(n_batch, latent_dim)
	g_loss_0 = gan.train_on_batch([noise, labels], np.array([labels, np.repeat(1, n_batch)]).reshape(2, n_batch).T)

	return (1 / 2) * (g_loss_1[0] + g_loss_0[0])

In [6]:
sup_train = np.concatenate([sup[sup[:,0] == 1][0:19], sup[sup[:,0] == 0][0:19]])
sup_test  = np.concatenate([sup[sup[:,0] == 1][19:38], sup[sup[:,0] == 0][19:38]])
sup_test.shape, sup_train.shape
sup_test[:,0]
sup_train[:,0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [41]:
tf.config.run_functions_eagerly(False)

@tf.function
def label_acc(y_true, y_pred): return  tf.keras.metrics.binary_accuracy(y_true[:,0], y_pred[:,0])

@tf.function
def reality_acc(y_true, y_pred): return tf.keras.metrics.binary_accuracy(y_true[:,1], y_pred[:,1])

@tf.function
def label_acc_g(y_true, y_pred): return  tf.keras.metrics.binary_accuracy(y_true[:,0], 1 - y_pred[:,0])

@tf.function
def reality_acc_g(y_true, y_pred): return tf.keras.metrics.binary_accuracy(y_true[:,1], 1 - y_pred[:,1])

@tf.function
def label_loss(y_true, y_pred): return tf.keras.losses.binary_crossentropy(y_true[:,0], y_pred[:,0])

@tf.function
def reality_loss(y_true, y_pred): return tf.keras.losses.binary_crossentropy(y_true[:,1], y_pred[:,1])

@tf.function
def discriminator_loss(y_true, y_pred):
    loss_l = label_loss(y_true, y_pred)
    loss_r = reality_loss(y_true, y_pred)
    return ( 0.5 * loss_l ) + ( 0.5 * loss_r )

@tf.function
def generator_loss(y_true, y_pred):
    loss_l = label_loss(y_true, 1 - y_pred)
    loss_r = reality_loss(y_true, 1 - y_pred)
    return ( 0.5 * loss_l ) + ( 0.5 * loss_r )

In [25]:
D = discriminator()
D.compile(loss=discriminator_loss, metrics=[label_loss, reality_loss, label_acc, reality_acc], optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001))
D.trainable = False
# display(D.summary())

latent_dim = 100
G = generator(latent_dim)
# display(G.summary())

noise = Input(shape=(latent_dim,))
label = Input(shape=(1,))
hidden_rep = G([noise, label])
validity = D(hidden_rep)

# connect them
GAN = Model([noise, label], validity, name="GAN")
GAN.compile(loss=generator_loss, metrics=[label_loss, reality_loss, label_acc_g, reality_acc_g], optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001))


hist = []

In [42]:
epochs = 20
n_batch = 4

glabels = np.repeat(1, 20)
gnoise = generate_noise(20, latent_dim)
gY = np.array([glabels, np.repeat(1, 20)]).reshape(2, 20).T
gX = [gnoise, glabels]

X_train = sup_train[:,1:]
labels_train = np.array(sup_train[:,0], dtype=np.float32)
realite_train = np.ones((len(labels_train),), dtype=np.float32)
y_train = np.array([labels_train, realite_train]).T

X_test = sup_test[:,1:]
labels_test = np.array(sup_test[:,0], dtype=np.float32)
realite_test = np.ones((len(labels_test),), dtype=np.float32)
y_test = np.array([labels_test, realite_test]).T

for e in range(epochs):
	d_loss = train_d(D, sup_train, n_batch)
	g_loss = train_g(GAN, n_batch)
	
	train_loss = D.evaluate(X_train, y_train, verbose=0)
	test_loss = D.evaluate(X_test, y_test, verbose=0)

	geval = GAN.evaluate(gX, gY, verbose=0)
	deval = D.evaluate(G.predict(gX), gY, verbose=0)
	
	hist.append([d_loss, g_loss, *train_loss, *test_loss, *geval, *deval])
	print(e, [np.round(g_loss, 2), np.round(d_loss, 2), np.round(train_loss, 2), np.round(test_loss, 2), np.round(geval, 2), np.round(deval, 2)])

0 [0.42, 0.37, array([0.1 , 0.25, 0.  , 0.98, 1.  ]), array([0.36, 0.76, 0.  , 0.55, 1.  ]), array([0.48, 0.52, 6.95, 0.15, 1.  ]), array([3.74, 0.52, 6.95, 0.85, 0.  ])]
1 [0.4, 0.23, array([0.1 , 0.22, 0.  , 1.  , 1.  ]), array([0.35, 0.71, 0.  , 0.55, 1.  ]), array([0.47, 0.54, 6.94, 0.2 , 1.  ]), array([3.74, 0.54, 6.94, 0.8 , 0.  ])]
2 [0.51, 0.24, array([0.1 , 0.22, 0.  , 1.  , 1.  ]), array([0.36, 0.73, 0.  , 0.53, 1.  ]), array([0.43, 0.63, 7.02, 0.2 , 1.  ]), array([3.83, 0.63, 7.02, 0.8 , 0.  ])]
3 [0.43, 0.24, array([0.1 , 0.24, 0.  , 0.98, 1.  ]), array([0.37, 0.77, 0.  , 0.55, 1.  ]), array([0.44, 0.63, 7.01, 0.45, 1.  ]), array([3.82, 0.63, 7.01, 0.55, 0.  ])]
4 [0.41, 0.24, array([0.09, 0.2 , 0.  , 0.98, 1.  ]), array([0.36, 0.69, 0.  , 0.56, 1.  ]), array([0.45, 0.63, 7.16, 0.3 , 1.  ]), array([3.89, 0.63, 7.16, 0.7 , 0.  ])]
5 [0.48, 0.26, array([0.11, 0.29, 0.  , 0.9 , 1.  ]), array([0.39, 0.86, 0.  , 0.56, 1.  ]), array([0.44, 0.64, 7.16, 0.35, 1.  ]), array([3.9 , 0