In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load the data
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the corpus before running the notebook.
file_path = 'C:/Users/ouali/PRAMA_Projet/europarl-v6.fr-en.fr_toy'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Preprocessing
text = text.lower()
words = text.split()

# Tokenizer
# Fit and encode the corpus as ONE document. Passing the word list instead
# would treat every word as its own text, so texts_to_sequences would return
# one list per word rather than a flat stream of integer ids, and the
# sliding-window array built later would not be 2-D.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]
# +1 because word indices start at 1; index 0 is kept free for padding.
vocab_size = len(tokenizer.word_index) + 1

# Build the training sequences
def create_sequences(words, seq_length=5):
    """Return every contiguous window of ``seq_length + 1`` items from ``words``.

    Each window holds ``seq_length`` context items followed by the item that
    comes right after them. When ``words`` has ``seq_length`` items or fewer,
    the result is an empty list.
    """
    window = seq_length + 1
    return [
        words[start:start + window]
        for start in range(len(words) - seq_length)
    ]

seq_length = 5
SEED = 42  # fixed seed so the train/validation split is the same on every run

# Sliding windows: seq_length context items followed by the target item.
sequences = create_sequences(sequences, seq_length)
sequences = np.array(sequences)

# Split inputs (all columns but the last) from targets (last column)
X, y = sequences[:,:-1], sequences[:,-1]
# One-hot encode the targets over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/ouali/PRAMA_Projet/europarl-v6.fr-en.fr_toy'

In [None]:
# 2. Building the neural network model
# We build a simple recurrent (LSTM) network that predicts the next word.

# Import from tensorflow.keras, the same namespace the preprocessing
# utilities come from, instead of mixing it with the standalone keras package.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Define the model
# An explicit Input layer fixes the sequence length; this replaces the
# Embedding `input_length` argument (deprecated in Keras 3) and keeps the
# model built so that summary() can report shapes and parameter counts.
model = Sequential([
    Input(shape=(seq_length,)),
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(100, return_sequences=True),  # full sequence out, feeds the next LSTM
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),  # one probability per vocabulary index
])
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.summary()

In [None]:
# 3. Training the model

EPOCHS = 10
BATCH_SIZE = 64

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

# Evaluate the model on the validation split
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Loss: {loss}, Accuracy: {accuracy}')

In [None]:
# Using the model to compute P(w)
def predict_next_word(model, tokenizer, text_seq, seq_length=5):
    """Return the model's prediction for the word that follows ``text_seq``.

    ``text_seq`` is encoded with ``tokenizer``, then padded or truncated to
    ``seq_length`` ids (truncation drops ids from the front), and the resulting
    single-row batch is passed to ``model.predict``. The raw prediction array
    is returned as-is; mapping it back to a word is left to the caller.
    """
    token_ids = tokenizer.texts_to_sequences([text_seq])[0]
    model_input = pad_sequences([token_ids], maxlen=seq_length, truncating='pre')
    return model.predict(model_input, verbose=0)

# Example usage
text_seq = 'il est'
pred = predict_next_word(model, tokenizer, text_seq)
# Index 0 is never assigned to a word (it is the padding index), so it has no
# entry in index_word; exclude it from the argmax to avoid a KeyError.
next_id = int(np.argmax(np.ravel(pred)[1:])) + 1
predicted_word = tokenizer.index_word[next_id]
print(f'Predicted word: {predicted_word}')