In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet("hf://datasets/data-is-better-together/10k_prompts_ranked/data/train-00000-of-00001.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [79]:
df = pd.read_csv("prompts_dataset.csv")

In [80]:
data = df.Prompt

In [81]:
note = df.Complexite

In [None]:
## train test split
from sklearn.model_selection import train_test_split

In [82]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Assurez-vous de télécharger le tokenizer
nltk.download('punkt')

# data contenant vos textes (déjà défini dans votre notebook)
# data = df.prompt

# Tokenisation de chaque texte en minuscule
sentences = data.apply(lambda x: word_tokenize(x.lower())).tolist()

# Entraînement du modèle Word2Vec
model_embedding = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

# Affichage des premiers mots du vocabulaire
print("Exemple de vocabulaire :", list(model_embedding.wv.key_to_index.keys())[:10])

[nltk_data] Downloading package punkt to /home/dimitri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Exemple de vocabulaire : ['?', 'de', '’', 'l', 'la', 'est', 'que', ',', 'pensez-vous', 'quelle']


In [83]:
import numpy as np

corpus = [
    "Le deep learning est puissant",
    "Les réseaux de neurones sont utilisés pour la vision par ordinateur",
    "Word2Vec apprend des représentations de mots"
]

def sentence_to_vec(sentence, model):
    """Convertit une phrase en un vecteur en moyennant les vecteurs Word2Vec des mots"""
    words = word_tokenize(sentence.lower())
    
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)


In [84]:
list_data = list(data)

In [85]:
X = np.array([sentence_to_vec(sent, model_embedding) for sent in list_data])

3145    3.500000
3562    5.000000
6449    5.000000
9522    3.000000
828     3.000000
          ...   
5734    2.000000
5191    2.666667
5390    3.333333
860     3.000000
7270    3.000000
Name: avg_rating, Length: 8264, dtype: float64


In [86]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, list(note), test_size=0.2, random_state=42)

In [100]:
import torch.nn as nn
import torch.optim as optim

# Définition du modèle
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return x

input_dim = X.shape[1]  # Taille des vecteurs Word2Vec
model = MLP(input_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)


In [101]:
import torch
# Conversion en tenseurs PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [102]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error, r2_score

batch_size = 16  # Taille des mini-batchs

# Création du dataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [103]:
num_epochs = 100

for epoch in range(num_epochs):
    epoch_loss = 0
    epoch_mae = 0
    num_batches = 0

    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Calcul de la MAE sur ce batch
        mae = mean_absolute_error(batch_y.detach().numpy(), outputs.detach().numpy())
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulation des pertes et MAE
        epoch_loss += loss.item()
        epoch_mae += mae
        num_batches += 1

    # Moyenne des métriques sur l'époque
    epoch_loss /= num_batches
    epoch_mae /= num_batches

    if (epoch + 1) % 5 == 0:
        print(f'Époque [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, MAE: {epoch_mae:.4f}')


Époque [5/100], Loss: 10.8562, MAE: 2.9912
Époque [10/100], Loss: 10.8562, MAE: 2.9912
Époque [15/100], Loss: 10.8562, MAE: 2.9912
Époque [20/100], Loss: 10.8562, MAE: 2.9912
Époque [25/100], Loss: 10.8562, MAE: 2.9912
Époque [30/100], Loss: 10.8562, MAE: 2.9912
Époque [35/100], Loss: 10.8562, MAE: 2.9912
Époque [40/100], Loss: 10.8562, MAE: 2.9912
Époque [45/100], Loss: 10.8562, MAE: 2.9912
Époque [50/100], Loss: 10.8562, MAE: 2.9912
Époque [55/100], Loss: 10.8562, MAE: 2.9912
Époque [60/100], Loss: 10.8562, MAE: 2.9912
Époque [65/100], Loss: 10.8562, MAE: 2.9912
Époque [70/100], Loss: 10.8562, MAE: 2.9912
Époque [75/100], Loss: 10.8562, MAE: 2.9912
Époque [80/100], Loss: 10.8562, MAE: 2.9912


KeyboardInterrupt: 

In [93]:
with torch.no_grad():  # Pas de calcul de gradient pendant l'évaluation
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor)
    print(f'Perte sur test: {test_loss.item():.4f}')


Perte sur test: 1.7038


In [71]:
def predict(liste_sentence):
    X = np.array([sentence_to_vec(sent, model_embedding) for sent in liste_sentence])
    X_tensor = torch.tensor(X, dtype=torch.float32)
    return model(X_tensor).squeeze().tolist()

In [None]:
list_data2 = [
    "Quel est la capitale de la France ?",
    "Djufara",
    "Je souhaite comprendre la différence entre un réseau de neurones et un arbre de décision"
]

print(predict(list_data2))

[3.4593892097473145, 3.9093027114868164, 3.1277976036071777]
