In [2]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2Config, TFGPT2Model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
current_dir = os.path.dirname(os.path.realpath("__file__"))
repo_dir = os.path.dirname(current_dir)
DATA_FILE = os.path.join(repo_dir,"data","cleaned_data.csv")
data = pd.read_csv(DATA_FILE, encoding="utf-8",sep=',')

In [4]:
data.dropna(inplace=True)

In [6]:
config = GPT2Config.from_pretrained('gpt2',n_layer=6)
config.output_hidden_states = False
model = TFGPT2Model.from_pretrained('gpt2',config=config)

Some layers from the model checkpoint at gpt2 were not used when initializing TFGPT2Model: ['transformer/h_._8/ln_1/gamma:0', 'transformer/h_._6/ln_1/gamma:0', 'transformer/h_._8/mlp/c_proj/weight:0', 'transformer/h_._9/mlp/c_fc/bias:0', 'transformer/h_._8/mlp/c_proj/bias:0', 'transformer/h_._7/mlp/c_proj/bias:0', 'transformer/h_._6/attn/c_attn/bias:0', 'transformer/h_._7/ln_1/gamma:0', 'transformer/h_._8/ln_2/gamma:0', 'transformer/h_._10/ln_1/gamma:0', 'transformer/h_._9/attn/c_attn/bias:0', 'transformer/h_._9/ln_1/beta:0', 'transformer/h_._7/attn/c_attn/bias:0', 'transformer/h_._6/attn/c_proj/bias:0', 'transformer/h_._8/ln_1/beta:0', 'transformer/h_._11/mlp/c_proj/weight:0', 'transformer/h_._10/ln_2/gamma:0', 'transformer/h_._7/attn/c_proj/bias:0', 'transformer/h_._8/attn/c_attn/bias:0', 'transformer/h_._10/attn/c_proj/bias:0', 'transformer/h_._10/attn/c_proj/weight:0', 'transformer/h_._10/attn/c_attn/weight:0', 'transformer/h_._11/ln_2/beta:0', 'transformer/h_._11/attn/c_attn/weigh

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', lang='fr')
# tokenizer.pad_token = tokenizer.eos_token
# model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

In [9]:
input_sequences = []
target_sequences = []
for i in range(len(data)):
    recipe = data.iloc[[i]].to_dict("records")[0]
    inputs = tokenizer.encode(recipe["Ingredients"], truncation = True,return_tensors="tf")
    targets = tokenizer.encode(recipe["recettes"], truncation = True,return_tensors="tf")
    input_sequences.append(inputs)
    target_sequences.append(targets)

In [10]:
max_len_input = max([input_sequences[i].shape[1] for i in range(len(input_sequences))])
max_len_target = max([target_sequences[i].shape[1] for i in range(len(target_sequences))])

In [11]:
max_length = max(max_len_input,max_len_target)

# Pad the input and target sequences to the maximum length
padded_input_sequences = []
padded_target_sequences = []
for inputs, targets in zip(input_sequences, target_sequences):
    padded_inputs = tf.pad(inputs, [[0, max_length - inputs.shape[1]], [0, 0]])
    padded_targets = tf.pad(targets, [[0, max_length - targets.shape[1]], [0, 0]])
    padded_input_sequences.append(padded_inputs)
    padded_target_sequences.append(padded_targets)

input_sequences = padded_input_sequences
target_sequences = padded_target_sequences

In [19]:
len(padded_input_sequences[1])

957

In [20]:
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_length, padding="post")
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen = max_length, padding="post")

MemoryError: Unable to allocate 13.8 GiB for an array with shape (46926, 1024, 77) and data type int32

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))

In [None]:
config = GPT2Config.from_pretrained('gpt2')
config.output_hidden_states = False
model = TFGPT2Model.from_pretrained('gpt2',config=config)
# model.config = config
model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


<transformers.modeling_tf_utils.TFSharedEmbeddings at 0x2449ef70e20>

In [None]:
@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs, labels=targets)[0]
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    for inputs, targets in dataset:
        loss = train_step(inputs, targets)

In [None]:
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
y_train = np.expand_dims(y_train, axis=-1)
y_test = np.expand_dims(y_test, axis=-1)

In [None]:
# Split the data into training and testing sets

X = sequences['input_ids'].numpy()
y = sequences['attention_mask'].numpy()
input_dim = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(
  loss=loss,
  optimizer=optimizer,
  metrics=metric
)

# Train the model
history = model.fit(
  X_train,
  y_train,
  batch_size=128,
  epochs=10,
  validation_data=(X_test, y_test)
)


Epoch 1/10


In [8]:
max_length = 512
input_data = 
sequences = tokenizer.batch_encode_plus(data['recettes'].to_list(), padding=True, truncation=True, return_tensors='tf', max_length=max_length)

# num_classes = len(np.unique(y))
# y = tf.keras.utils.to_categorical(y, num_classes)

KeyboardInterrupt: ignored