In [23]:
!pip3 install --upgrade transformers

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import CamembertModel, CamembertTokenizer
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer
import pandas as pd
import sentencepiece

# Load the dataset and renumber the rows 0..n-1.
df = pd.read_csv('training_data_cleaned_length.csv').reset_index(drop=True)

# Map the difficulty labels to integer class ids.
difficulty_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
df['difficulty_encoded'] = df['difficulty'].map(difficulty_mapping)

# Series.map leaves NaN for every label that is not a key of the mapping
# (including missing values). NaN would silently turn the label column into
# floats and only blow up later inside the loss, so fail fast here instead.
unmapped = df.loc[df['difficulty_encoded'].isna(), 'difficulty']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) have a difficulty label outside "
        f"{list(difficulty_mapping)}: {unmapped.unique().tolist()}"
    )
df['difficulty_encoded'] = df['difficulty_encoded'].astype('int64')

# Split data into train and validation sets (80/20, fixed seed).
train_df, validation_df = train_test_split(df, test_size=0.2, random_state=42)

# Load the SentenceTransformer model used to embed the sentences.
sentence_model = SentenceTransformer("dangvantuan/sentence-camembert-base")

# Embed the text once up front; each sentence becomes one fixed-size vector.
train_embeddings = sentence_model.encode(train_df['sentence'].tolist(), convert_to_tensor=True)
validation_embeddings = sentence_model.encode(validation_df['sentence'].tolist(), convert_to_tensor=True)

# Pair every embedding with its label. cross_entropy expects integer class
# indices, so the label tensors are created explicitly as torch.long.
train_data = TensorDataset(
    train_embeddings,
    torch.tensor(train_df['difficulty_encoded'].tolist(), dtype=torch.long),
)
validation_data = TensorDataset(
    validation_embeddings,
    torch.tensor(validation_df['difficulty_encoded'].tolist(), dtype=torch.long),
)

# Set up DataLoaders: shuffle only the training data.
batch_size = 8
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

# Load the CamemBERT model and tokenizer.
# NOTE: these are kept so the names stay available to the rest of the
# notebook, but the encoder is NOT part of the classifier below: the
# DataLoaders yield precomputed sentence embeddings (float vectors), whereas
# CamembertModel expects integer token ids and returns a model-output object
# rather than a tensor, so chaining it in front of a Linear layer cannot work.
camembert_model = CamembertModel.from_pretrained("camembert-base")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Freeze the CamemBERT parameters so they are never updated.
for param in camembert_model.parameters():
    param.requires_grad = False

# Linear classification head. Its input width is read from the embeddings
# that are actually fed to it; one output logit per difficulty level.
linear_layer = torch.nn.Linear(train_embeddings.shape[1], len(difficulty_mapping))

# The classification model operates directly on the sentence embeddings.
classification_model = torch.nn.Sequential(linear_layer)

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classification_model.to(device)

# Number of passes over the training data. Honour a value set in an earlier
# cell if there is one; otherwise fall back to a small default so this cell
# runs on its own instead of raising NameError.
epochs = globals().get("epochs", 3)

# Set up optimizer and scheduler
optimizer = torch.optim.Adam(classification_model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * epochs  # handy if a per-step scheduler is swapped in
# StepLR multiplies the learning rate by gamma every `step_size` calls to
# scheduler.step(); it is stepped once per EPOCH below.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)  # You can adjust the scheduler as needed

# Training loop
for epoch in range(epochs):
    classification_model.train()
    for batch in train_dataloader:
        embeddings, labels = batch
        embeddings, labels = embeddings.to(device), labels.to(device)

        # Clear out gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass: logits of shape (batch, n_classes) vs. integer labels
        outputs = classification_model(embeddings)
        loss = torch.nn.functional.cross_entropy(outputs, labels)

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

    # Decay the learning rate once per epoch. Stepping per batch would shrink
    # it tenfold on every batch and stall learning almost immediately.
    scheduler.step()

# Validation pass: put the model in evaluation mode, then gather the
# predicted and true class ids for every validation batch.
classification_model.eval()
all_preds, all_labels = [], []

with torch.no_grad():  # scoring only, so no gradients are tracked
    for batch in validation_dataloader:
        embeddings = batch[0].to(device)
        labels = batch[1].to(device)

        # Predicted class is the index of the largest logit in each row
        outputs = classification_model(embeddings)
        preds = outputs.argmax(dim=1)

        all_preds += list(preds.cpu().numpy())
        all_labels += list(labels.cpu().numpy())

# Share of validation sentences whose predicted level equals the true one
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"Validation Accuracy: {accuracy}")




ImportError: ignored