In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress INFO and WARNING messages
import argparse
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
from sentence_transformers import SentenceTransformer

## train test split
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Notebook configuration ---

# Sequence / embedding hyper-parameters: the padding cell and the Embedding
# layer both read MAX_SEQUENCE_LENGTH; the Embedding layer reads EMBEDDING_DIM.
MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = 100, 128

# Artifact file names; no cell visible in this notebook reads or writes them.
MODEL_PATH, TOKENIZER_PATH, LABEL_ENCODER_PATH = (
    "rnn_model.h5",
    "tokenizer.pkl",
    "label_encoder.pkl",
)

# Not referenced by any visible cell.
n_missing = 3

In [3]:
# Load the prompt dataset from the working directory; the next cell reads its
# `Prompt` and `Complexite` columns.
df = pd.read_csv("prompts_dataset.csv")

In [4]:
# Pull out the prompt text (model input) and its complexity score (the target
# later passed as `y` to the train/test split).
data, note = df.Prompt, df.Complexite

In [5]:
# Build the word -> integer-id vocabulary from every prompt in `data`.
# NOTE(review): this runs before the train/test split, so the vocabulary also
# covers the held-out prompts — confirm that is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=data)

In [94]:
# Encode each prompt as a list of word ids, then pad at the end so every row
# has exactly MAX_SEQUENCE_LENGTH entries.
X_train_seq = tokenizer.texts_to_sequences(texts=data)
X_train_padded = pad_sequences(
    sequences=X_train_seq,
    padding='post',
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [66]:
# Give every sample a trailing singleton axis: (samples, MAX_SEQUENCE_LENGTH, 1).
# The target shape is spelled out explicitly instead of appending to the current
# `.shape`, so re-running this cell is a no-op rather than stacking another axis
# on each execution.
# NOTE(review): `predict()` feeds the model 2-D (1, MAX_SEQUENCE_LENGTH) input,
# so this extra axis is presumably unnecessary — confirm before removing it.
X_train_padded = X_train_padded.reshape((-1, MAX_SEQUENCE_LENGTH, 1))

In [74]:
# Hold out 20% of the samples (later used as validation data during training);
# the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_padded,
    np.array(list(note)),
    test_size=0.2,
    random_state=42,
)
# Shape check of the training inputs. It has to run after the split above,
# because that is where X_train is first defined.
X_train.shape

(800, 100, 1)

In [75]:
# Size of the Embedding lookup table: one row per word in the fitted vocabulary,
# plus one extra row for id 0, which the padded sequences use as filler.
vocab_size = len(tokenizer.word_index) + 1

In [130]:
# Import from `tensorflow.keras`, like every other Keras class in this notebook;
# pulling layers from the standalone `keras` package can mix classes from two
# different Keras builds. `Embedding` is already imported from
# `tensorflow.keras.layers` in the first cell, so only `Bidirectional` is needed.
from tensorflow.keras.layers import Bidirectional

# Regression network: word ids -> embeddings -> two stacked BiLSTMs -> one score.
model = Sequential([
    # One EMBEDDING_DIM-wide vector per word id (adjust vocab_size / EMBEDDING_DIM as needed).
    Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    # First BiLSTM returns the full sequence so the second BiLSTM can consume it.
    Bidirectional(LSTM(128, return_sequences=True)),
    # Second BiLSTM reduces the sequence to a single vector.
    Bidirectional(LSTM(64)),
    Dense(16, activation='relu'),
    # Single linear unit: unbounded real-valued output.
    Dense(1, activation='linear'),
])

# Compile for regression: mean squared error on the single linear output.
model.compile(loss='mean_squared_error', optimizer='adam')

In [131]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 128)          236672    
                                                                 
 bidirectional_8 (Bidirectio  (None, 100, 256)         263168    
 nal)                                                            
                                                                 
 bidirectional_9 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dense_40 (Dense)            (None, 16)                2064      
                                                                 
 dense_41 (Dense)            (None, 1)                 17        
                                                                 
Total params: 666,273
Trainable params: 666,273
Non-trainable params: 0

In [135]:
# Train for 10 epochs in batches of 32. The held-out split is passed as
# validation data, so it is evaluated alongside training each epoch.
# NOTE(review): no random seed is set for the model in the visible cells, so
# results will presumably vary between runs — confirm whether that matters.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x77eb4328e5f0>

In [138]:
def predict(string, verbose=True):
    """Score a single prompt with the trained model.

    Args:
        string: Raw prompt text.
        verbose: When true (the default, which matches the previous
            behaviour), print the padded id sequence that was fed to the
            model. Pass ``False`` to silence this debugging output.

    Returns:
        Whatever ``model.predict`` returns for the one-row batch.
    """
    # The tokenizer works on batches, so wrap the prompt in a one-item list.
    seq = tokenizer.texts_to_sequences([string])
    # Same padding settings as the training data.
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    prediction = model.predict(padded)
    if verbose:
        print(padded)
    return prediction

In [139]:
# Try the helper on one French prompt. The printed sequence that follows holds
# only 5 non-zero ids for this 8-word prompt, so three words produced no id —
# presumably because they are absent from the fitted vocabulary; verify.
predict("écrit moi la suite de fibonacci en python")

[[269   2  44   1  17   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]


array([[2.2862215]], dtype=float32)