In [7]:
import datetime
import os

import fasttext
import keras_core as keras
import numpy as np
import pandas as pd
from keras_core import Sequential
from keras_core.src.layers import Dense
from keras_core.models import load_model
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf

from tensorboard.plugins.hparams import api as hp

Using TensorFlow backend


2024-06-27 10:43:51.710301: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## import fastTest model

In [24]:
import os
#embedded_path = './../../'

#contents = os.listdir(embedded_path)
#print(contents)
ft = fasttext.load_model('../../data/embedding_data/cc.fr.300.bin')

## Configurations

In [3]:
params_grid = {
    "model" : "nlp",
    "embedder" : "fasttext",
    "embedding_data" : "cc.fr.300.bin",
    "dataset" : "fr_hf.csv",
    "batch_size" : 300
}

In [9]:
data_folder_src = './../../data/processed/'
data_path_src = f'{data_folder_src}{params_grid["dataset"]}'
date_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
experiment_name = f'deepwoke_{params_grid["embedder"]}_{params_grid["model"]}_{params_grid["dataset"]}_{date_str}'

log_dir = f"../../log/fit/{experiment_name}"
model_weight_dst = f'../../model_weights/{experiment_name}-model.keras'

In [10]:

vector_path = f'./../../embedded_vector/{params_grid["embedding_data"]}_{params_grid["dataset"]}'
vector_x_dst = f'{vector_path}.x.npy'
vector_y_dst = f'{vector_path}.y.npy'

# loading dataset

In [11]:
X = np.load(vector_x_dst)
y = np.load(vector_y_dst)

## text to vector

In [22]:
def text_to_vector(text):
    words = text.split(' ')
    word_vectors = [ft.get_word_vector(word) for word in words if word in ft.words]
    if not word_vectors:
        return np.zeros(300)
    return np.mean(word_vectors, axis=0)

## building and training the model

### callbacks

In [14]:
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
    monitor='loss'
)


In [15]:

class EarlyStoppingLogging(tf.keras.callbacks.Callback):
    def __init__(self, early_stopping_callback, log_dir):
        super().__init__()
        self.early_stopping = early_stopping_callback
        self.stopped_epoch = 0
        self.writer = tf.summary.create_file_writer(log_dir)

    def on_epoch_end(self, epoch, logs=None):
        if self.early_stopping.stopped_epoch > 0:
            self.stopped_epoch = self.early_stopping.stopped_epoch
            with self.writer.as_default():
                tf.summary.scalar('early_stopping_epoch', self.stopped_epoch, step=epoch)
                self.writer.flush()


In [16]:
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
early_stopping_logging_callback = EarlyStoppingLogging(early_stopping, log_dir)

2024-06-27 10:45:33.221888: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-27 10:45:33.777864: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [17]:
callback = [
    early_stopping,
    tensorboard_callback,
    hp.KerasCallback(log_dir, params_grid),
    early_stopping_logging_callback,
]

### training the model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Dense(128, input_dim=300, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(34, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train,
          y_train,
          epochs=500,
          batch_size = params_grid['batch_size'],
          callbacks=callback,
          validation_data=(X_test, y_test)
          )

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-06-27 10:45:57.070783: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17347200 exceeds 10% of free system memory.
I0000 00:00:1719477958.503533   42698 service.cc:145] XLA service 0x7f5fec00bda0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1719477958.503680   42698 service.cc:153]   StreamExecutor device (0): Host, Default Version
2024-06-27 10:45:58.588340: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m 1/49[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:02[0m 3s/step - accuracy: 0.2633 - loss: 0.7054

I0000 00:00:1719477959.267670   42698 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.7000 - loss: 0.6197 - val_accuracy: 0.7687 - val_loss: 0.5347
Epoch 2/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7600 - loss: 0.5384 - val_accuracy: 0.7687 - val_loss: 0.5072
Epoch 3/500
[1m41/49[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7560 - loss: 0.5196

2024-06-27 10:46:00.768680: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17347200 exceeds 10% of free system memory.


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7569 - loss: 0.5175 - val_accuracy: 0.7687 - val_loss: 0.4880
Epoch 4/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7596 - loss: 0.4943 - val_accuracy: 0.7744 - val_loss: 0.4727
Epoch 5/500
[1m39/49[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7774 - loss: 0.4706

2024-06-27 10:46:01.533763: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17347200 exceeds 10% of free system memory.


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7772 - loss: 0.4711 - val_accuracy: 0.7779 - val_loss: 0.4680
Epoch 6/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7813 - loss: 0.4626 - val_accuracy: 0.7872 - val_loss: 0.4654
Epoch 7/500
[1m40/49[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.7975 - loss: 0.4483

2024-06-27 10:46:02.148967: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17347200 exceeds 10% of free system memory.


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7965 - loss: 0.4487 - val_accuracy: 0.7797 - val_loss: 0.4637
Epoch 8/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7985 - loss: 0.4419 - val_accuracy: 0.7895 - val_loss: 0.4592
Epoch 9/500
[1m43/49[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8011 - loss: 0.4313

2024-06-27 10:46:02.740808: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 17347200 exceeds 10% of free system memory.


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8013 - loss: 0.4313 - val_accuracy: 0.7936 - val_loss: 0.4716
Epoch 10/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8000 - loss: 0.4302 - val_accuracy: 0.7895 - val_loss: 0.4611
Epoch 11/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8185 - loss: 0.4085 - val_accuracy: 0.7862 - val_loss: 0.4670
Epoch 12/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8192 - loss: 0.4038 - val_accuracy: 0.7908 - val_loss: 0.4676
Epoch 13/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8255 - loss: 0.3875 - val_accuracy: 0.7872 - val_loss: 0.4714
Epoch 14/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8308 - loss: 0.3776 - val_accuracy: 0.7882 - val_loss: 0.4760
Epoch 15/500
[1m49/49[0m [32m━━━━━━━━━

<keras_core.src.callbacks.history.History at 0x7f60e73f35b0>

# save model

In [19]:
model.save(model_weight_dst)

### testing with custom sentence

In [20]:
#model_weight_dst = f'../../model_weights/fasttext_nlp_20240613-131608-model.keras'
new_model = load_model(model_weight_dst)

In [25]:
text_nice = [
    "la bonjour je suis un type sympas", 
    "bonjour ça va les gens",
    "j'aime pas trop ce mec",
    "cette meuf est cool",
    "les meuf sont trop cool",
    "les mecs sont des gens stylé",
    "le mec est stylé",
    "nan mais mon manager exagère quand il dit que je suis pas assez productif et que je travail pas assez, franchement il abuse",
    "les hommes cis sont toxiques pour la société car ils imposent leur vision paternaliste",
    "Les hommes sont moins matures que les femmes quand il s'agit de prendre des décisions"
    
]

text_no_nice = [
    "regarde moi cette connasse",
    "les femme leur place c'est à la cuisine",
    "femme au volant, accident",
    "les femmes on peux pas leur faire confiance",
    "Pour une transphobe son passing est pas ouf",
    "Elle l’ouvre quand même beaucoup celle là pour quelqu’un qu’est pas capable d’éduquer ses enfants",
    "Dis donc, c’est pas parce que t’es mal baisée que tu peux tout te permettre ????",
    "Moi ce que j’aime bien chez les filles comme toi c’est que t’es pas du genre prise de tête",
    "les femmes c'est comme le fromage, quand elles puent c'est qu'elles sont bonnes",
    "Oh les grosses tepus",
    "Faut pas t'étonner si tu te fais violer vu comment t'es habillée",
    "Tu deverais avoir des enfants maintenant avant que ce soit trop tard",
    "Vous seriez une collaboratrice idéal pour notre entreprise, malheureusement nous cherchons quelqu’un disponible sur le long terme, et nous craignons qu’à votre âge des changements familiaux ne vous permettent plus de vous investir autant que besoins"
]
wordsvec_nice = [np.expand_dims(text_to_vector(text), axis=0) for text in text_nice]
wordsvec_no_nice = [np.expand_dims(text_to_vector(text), axis=0) for text in text_no_nice]

print("nice output")
for i in range(0, len(wordsvec_nice)):
    prediction = new_model.predict(wordsvec_nice[i], verbose=False)
    print(f"{text_nice[i]} : {prediction > 0.5}")
    print(f"{prediction}")


print("\n\nnot nice output")
for i in range(0, len(wordsvec_no_nice)):
    prediction =  new_model.predict(wordsvec_no_nice[i], verbose=False)
    print(f"{text_no_nice[i]} : {prediction > 0.5}")
    print(f"{prediction}")

nice output
la bonjour je suis un type sympas : [[False]]
[[0.00700457]]
bonjour ça va les gens : [[False]]
[[0.01231155]]
j'aime pas trop ce mec : [[False]]
[[3.2521093e-08]]
cette meuf est cool : [[False]]
[[4.1303685e-09]]
les meuf sont trop cool : [[False]]
[[2.4200557e-12]]
les mecs sont des gens stylé : [[False]]
[[1.6187134e-05]]
le mec est stylé : [[False]]
[[7.093816e-10]]
nan mais mon manager exagère quand il dit que je suis pas assez productif et que je travail pas assez, franchement il abuse : [[False]]
[[6.7943774e-06]]
les hommes cis sont toxiques pour la société car ils imposent leur vision paternaliste : [[False]]
[[0.00073071]]
Les hommes sont moins matures que les femmes quand il s'agit de prendre des décisions : [[ True]]
[[0.9999702]]


not nice output
regarde moi cette connasse : [[ True]]
[[0.8987701]]
les femme leur place c'est à la cuisine : [[ True]]
[[0.999668]]
femme au volant, accident : [[ True]]
[[0.99989605]]
les femmes on peux pas leur faire confiance : 