# Entraînement d'un modèle d'embeddings sur les données de ChatGPT

## Import de toutes les librairies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

2024-01-15 17:24:33.325690: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-15 17:24:33.335112: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-15 17:24:33.390998: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 17:24:33.391042: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 17:24:33.391062: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

## Import des données en dataframe

In [2]:
# Load the review/label CSV into a DataFrame, then print pandas' truncated
# text preview of it (first and last rows plus the overall shape).
df = pd.read_csv(filepath_or_buffer="GptData/everything.csv")
print(df)

                                                review  label
0    A total disappointment. Their 'ecological appr...      0
1    An ecological pretension devoid of meaning. Th...      0
2    Disappointing. Their alleged ecological consci...      0
3    A restaurant that boasts of being ecological b...      0
4    Ecological facades. Their discourse on ecology...      0
..                                                 ...    ...
899  My recent visit to this place exposed a neutra...      3
900  Based on my experience at this establishment, ...      3
901  While not achieving top-tier performance in wa...      3
902  As someone who dined at this venue, the waste ...      3
903  In my experience at this joint, the waste mana...      3

[904 rows x 2 columns]


## Séparation des données d'entraînement, de test et de validation

In [3]:
# First cut: keep 80% of the rows for training, hold out the remaining 20%.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
# Second cut: split the held-out rows again, 80% -> test_data, 20% -> val_data
# (roughly 16% and 4% of the full dataset respectively).
# NOTE(review): downstream, `test_data` is what `fit` receives as
# validation_data and `val_data` is what `evaluate` scores, i.e. the names are
# reversed relative to how the splits are used.
test_data, val_data = train_test_split(
    test_data,
    test_size=0.2,
    random_state=42,
)

## Tokénisation des textes

In [4]:
# Text vectorisation settings.
max_words = 10000  # upper bound on the vocabulary size used by the tokenizer
max_len = 100  # length every encoded review is padded/truncated to

# The vocabulary is learned from the training reviews only; words unseen at
# fit time are mapped to the "<OOV>" token when encoding.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['review'])


def _encode_reviews(reviews):
    """Turn raw review texts into integer id sequences of length ``max_len``."""
    return pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=max_len)


X_train = _encode_reviews(train_data['review'])
print(X_train.shape)
X_test = _encode_reviews(test_data['review'])
X_val = _encode_reviews(val_data['review'])

(723, 100)


## Encodage des labels (notes)

In [6]:


# Map the raw grade labels to contiguous integer class ids.
# The encoder is fitted on the training labels only, so transforming the test
# or validation labels raises if they contain a grade absent from training.
label_encoder_grade = LabelEncoder().fit(train_data['label'])
y_train, y_test, y_val = (
    label_encoder_grade.transform(split['label'])
    for split in (train_data, test_data, val_data)
)


## Construction du modèle

In [11]:
# Build and compile the grade classifier.
# Seed the Python, NumPy and TensorFlow RNGs first so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is re-run from a fresh kernel (same seed value as the data split).
set_random_seed(42)

model_grade = Sequential([
    # Learn a 64-dimensional vector for each of the `max_words` token ids.
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    # Reduce the whole token sequence to a single 128-dimensional state.
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.1),
    # One softmax probability per class known to the label encoder.
    Dense(len(label_encoder_grade.classes_), activation='softmax'),
])

# Sparse loss: the targets are integer class ids from the label encoder,
# not one-hot vectors.
model_grade.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"],
)
model_grade.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 64)           640000    
                                                                 
 lstm_1 (LSTM)               (None, 128)               98816     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 6)                 390       
                                                                 
Total params: 747462 (2.85 MB)
Trainable params: 74746

## Entraînement du modèle

In [12]:
# Train for 10 epochs, reporting loss/accuracy on a held-out split after each.
# NOTE(review): the split named "test" is the one monitored as validation data
# here; the split named "val" is only used later by `evaluate`.
model_grade.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd554d68100>

## Résultats

In [13]:
# Score the trained model on the split that was not passed to `fit`.
loss, acc = model_grade.evaluate(x=X_val, y=y_val, verbose=2)
print("Accuracy new data", acc)

# Sample reviews for a qualitative check.
# NOTE(review): the third text appears to match the start of the first dataset
# row, so it may not be genuinely unseen by the model - confirm.
new_texts = [
    "Best managed McDonald's I've ever seen!,",
    "Yo, socially, this joint is kinda in the middle. Moves and inclusivity are cool, like hosting a regular hangout. A chill experience, nothing too wild.",
    "A total disappointment. Their 'ecological approach' was a facade, the omnipresent plastic being blatant proof.",
    "Discovering the treasures within these walls reveals a masterpiece that surpasses expectations. The chef, a virtuoso in directing the kitchen, conducts an orchestra of flavors with finesse. It's not merely dining; it's an experience that transcends the realms of artistry.",
]
# Encode with the same tokenizer and padding length as the training data.
new_sequences = pad_sequences(
    tokenizer.texts_to_sequences(new_texts),
    maxlen=max_len,
)

# Pick the most probable class per text, then map the class id back to the
# original grade label.
predictions = model_grade.predict(new_sequences)
predicted_labels = label_encoder_grade.inverse_transform(predictions.argmax(axis=1))

print(predicted_labels)

2/2 - 0s - loss: 1.1226 - accuracy: 0.7297 - 43ms/epoch - 21ms/step
Accuracy new data 0.7297297120094299
[0 2 0 5]
