In [55]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2Config, TFGPT2Model

In [56]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', lang='fr')

In [57]:
current_dir = os.path.dirname(os.path.realpath("__file__"))
repo_dir = os.path.dirname(current_dir)
DATA_FILE = os.path.join(repo_dir,"data","cleaned_data.csv")
data = pd.read_csv(DATA_FILE, encoding="utf-8",sep=',')

In [58]:
data.sample(5)

Unnamed: 0,Nom,Duree,Personnes,Type,Ingredients,Ustensiles,Etapes,recettes
13018,samoussa rapide boeuf,30 min,10,4,250 gramme boeuf hacher 1 carotte 1 oignon ...,poele pinceau cuillere bois denoyauteur assiette,etape 1 poele feu vif faire revenir oignon ail...,samoussa rapide boeuf 250 gramme boeuf hach...
23801,roti boeuf four simple,30 min,2,5,1 gousse ail 2 c.a. huile olive 12 cl ea...,four couteau couteau planche decouper plat gra...,etape 1 prechauffer four 220 degre c thermosta...,roti boeuf four simple 1 gousse ail 2 c.a...
27850,gateau pomme facile,50 min,6,3,120 gramme sucre semoul 1 paquet sucre van...,grille patisserie moule manquer couteau four c...,etape 1 prechauffer four 160 degre c thermosta...,gateau pomme facile 120 gramme sucre semoul...
42364,sushi français,40 min,4,1,2 feuille nori magasin biologique 1 verre ...,couteau plat,etape 1 poser feuille nori plat couper etape 2...,sushi français 2 feuille nori magasin biolo...
27748,mug cak fondant chocolat,3 min,1,3,40 gramme chocolat noir 40 gramme beurre ...,micro-onde cuillere bois bol balance cuisine,etape 1 mug mettre carre chocolat beurre decou...,mug cak fondant chocolat 40 gramme chocolat...


In [16]:
data['tokens'] = data['recettes'].apply(tokenizer.tokenize)

In [60]:
# oov_tokens = ['s','Entrée']
# tokenizer.add_tokens(oov_tokens)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

max_length = 512
sequences = tokenizer.batch_encode_plus(data['recettes'].to_list(), padding=True, truncation=True, return_tensors='tf', max_length=max_length)

# num_classes = len(np.unique(y))
# y = tf.keras.utils.to_categorical(y, num_classes)

In [62]:
len(tokenizer)

50258

In [63]:
# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(sequences['input_ids'].numpy(), sequences['attention_mask'].numpy(), test_size=0.2, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

X = sequences['input_ids'].numpy()
y = sequences['attention_mask'].numpy()
input_dim = X.shape[1]

# Reshape the input data to a 3D tensor
batch_size = X.shape[0]
sequence_length = 1  # Set the sequence length to 1
# X = np.reshape(X, (batch_size, sequence_length, input_dim))

output_dim = y.shape[1]

# Reshape the target data to a 2D tensor
# Y = np.reshape(y, (batch_size, output_dim))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
batch_size

47340

In [64]:
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
y_train = np.expand_dims(y_train, axis=-1)
y_test = np.expand_dims(y_test, axis=-1)

# X_train = tf.reshape(X_train, (-1, 1024))
# X_test= tf.reshape(X_test, (-1, 1024))

# y_train= tf.reshape(y_train, (-1, 1024))
# y_test = tf.reshape(y_test, (-1, 1024))

In [46]:
print(X_train.shape)
print(y_train.shape)

(37872, 512, 1)
(37872, 512, 1)


In [65]:
config = GPT2Config.from_pretrained('gpt2')
config.output_hidden_states = False
model = TFGPT2Model.from_pretrained('gpt2',config=config)
# model.config = config
model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


<transformers.modeling_tf_utils.TFSharedEmbeddings at 0x2449ef70e20>

In [66]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(
  loss=loss,
  optimizer=optimizer,
  metrics=metric
)

# Train the model
history = model.fit(
  X_train,
  y_train,
  batch_size=128,
  epochs=10,
  validation_data=(X_test, y_test)
)


Epoch 1/10
