In [4]:
import pandas as pd
import numpy as np
#we read the training data
df = pd.read_csv('training_data.csv')
#read the test data

baserate = df['difficulty'].value_counts() / len(df)

In [7]:
# Drop rows with missing 'sentence' or 'difficulty' in training data
df = df.dropna(subset=['sentence', 'difficulty'])

# Fill or drop missing 'sentence' in test data
#test = test.dropna(subset=['sentence'])  # Assuming you want to drop

# Optionally, remove duplicates
df = df.drop_duplicates(subset=['sentence'])
#test = test.drop_duplicates(subset=['sentence'])

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Initialize lemmatizer (we use spacy because it has french lemmatization)

import spacy

# Load the French Spacy model
nlp = spacy.load('fr_core_news_md')

def preprocess_text(text):
    doc = nlp(text.lower())
    result = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            result.append(token.lemma_)
    return ' '.join(result)

# Apply preprocessing to the sentence columns
df['sentence'] = df['sentence'].apply(preprocess_text)


# Load pre-trained BERT model and tokenizer
# Initialize BERT tokenizer and model
from transformers import CamembertTokenizer, CamembertModel
import torch

# Load pre-trained CamemBERT model and tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
camembert_model = CamembertModel.from_pretrained('camembert-base')

def get_camembert_embedding(sentence):
    # Prepare the text input for CamemBERT
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Get output from CamemBERT model
    with torch.no_grad():
        outputs = camembert_model(**inputs)
    # Extract the mean of the last hidden state to use as the sentence embedding
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.detach().cpu().numpy()

# Example of processing a batch of sentences
def get_camembert_embeddings(sentences, batch_size=32):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = camembert_model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings.detach().cpu().numpy())
    return np.vstack(embeddings)

# Process embeddings in batches
embeddings = get_camembert_embeddings(df['sentence'].tolist())
df['embedding'] = list(embeddings)
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

X = np.array(df['embedding'].tolist())
y = df['difficulty'].values

from sklearn.preprocessing import LabelEncoder

# Define the order of your labels
labels_ordered = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Manually fit the encoder to the ordered labels
encoder.fit(labels_ordered)

# Encode your actual labels
y_encoded = encoder.transform(y)

# Output the encoding to verify
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Label mapping:", label_mapping)


kf = KFold(n_splits=5, shuffle=True, random_state=42)

import torch
import torch.nn as nn
import torch.optim as optim

class TextDifficultyNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextDifficultyNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.softmax(x)
        return x
# Assuming you have already determined the number of classes
num_classes = len(set(y))  # Replace this with the actual number of difficulty classes
input_dim = 768  # Size of the CamemBERT embedding
hidden_dim = 100  # You can tune this
output_dim = num_classes

model = TextDifficultyNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Label mapping: {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}


In [23]:
from torch.utils.data import TensorDataset, DataLoader

def train_and_evaluate_model(X, y_encoded, input_dim, hidden_dim, output_dim, lr, batch_size, num_epochs):
    # Convert data to tensors inside the function if not already tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y_encoded, dtype=torch.long)

    # Model setup
    model = TextDifficultyNN(input_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Track performance
    accuracies = []

    # K-fold cross-validation
    for train_idx, val_idx in kf.split(X):
        train_dataset = TensorDataset(X_tensor[train_idx], y_tensor[train_idx])
        val_dataset = TensorDataset(X_tensor[val_idx], y_tensor[val_idx])

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Training phase
        for epoch in range(num_epochs):
            model.train()
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Evaluation phase
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        accuracies.append(accuracy)

    return np.mean(accuracies)

# Hyperparameter grid
hidden_dims = [150, 200, 250]
learning_rates = [0.001]
batch_sizes = [8]
num_epochs = 30

best_accuracy = 0
best_params = {}

for hidden_dim in hidden_dims:
    for lr in learning_rates:
        for batch_size in batch_sizes:
            accuracy = train_and_evaluate_model(X, y_encoded, input_dim, hidden_dim, output_dim, lr, batch_size, num_epochs)
            print(f"Accuracy: {accuracy:.4f} with hidden_dim={hidden_dim}, lr={lr}, batch_size={batch_size}")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {'hidden_dim': hidden_dim, 'lr': lr, 'batch_size': batch_size}

print("Best Parameters:", best_params)


Accuracy: 0.7919 with hidden_dim=150, lr=0.001, batch_size=8
Accuracy: 0.8089 with hidden_dim=200, lr=0.001, batch_size=8
Accuracy: 0.8123 with hidden_dim=250, lr=0.001, batch_size=8
Best Parameters: {'hidden_dim': 250, 'lr': 0.001, 'batch_size': 8}


In [24]:
# Initialize the model with these best parameters


model = TextDifficultyNN(input_dim=768, hidden_dim=best_params['hidden_dim'], output_dim=len(encoder.classes_))
optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])
criterion = nn.CrossEntropyLoss()

# Convert the entire dataset to tensors
X_tensor_full = torch.tensor(X, dtype=torch.float32)
y_tensor_full = torch.tensor(y_encoded, dtype=torch.long)

full_dataset = TensorDataset(X_tensor_full, y_tensor_full)
full_loader = DataLoader(full_dataset, batch_size=best_params['batch_size'], shuffle=True)

# Train the model
num_epochs = 80  # Adjust based on the training convergence observed in tuning
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in full_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:  # Print loss every 10 epochs
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Assuming test_embeddings is already prepared and is a numpy array
test = pd.read_csv('unlabelled_test_data.csv')
test['sentence'] = test['sentence'].apply(preprocess_text)
test_embeddings = get_camembert_embeddings(test['sentence'].tolist())
X_test_tensor = torch.tensor(test_embeddings, dtype=torch.float32)
test_loader = DataLoader(X_test_tensor, batch_size=best_params['batch_size'], shuffle=False)

model.eval()  # Set the model to evaluation mode
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())

# Decode the predictions back to labels
predicted_labels = encoder.inverse_transform(predictions)

Epoch 1, Loss: 1.2850
Epoch 11, Loss: 0.4933
Epoch 21, Loss: 0.2340
Epoch 31, Loss: 0.2033
Epoch 41, Loss: 0.0405
Epoch 51, Loss: 0.0048
Epoch 61, Loss: 0.0174
Epoch 71, Loss: 0.0046


In [26]:
import pandas as pd

# Assume you have a DataFrame 'test_df' that corresponds to the test embeddings
test['difficulty'] = predicted_labels
#drop the sentence and embedding columns
test_df = test.drop(columns=['sentence'])
# Save predictions to a CSV file
test_df.to_csv('final_predictions_camembert_nn.csv', index=False)
print("Predictions saved to 'final_predictions.csv'.")

torch.save(model.state_dict(), 'final_trained_model.pth')
print("Model saved to 'final_trained_model.pth'.")

#the final results quality are lower than the evaluation so it is overfitting

Predictions saved to 'final_predictions.csv'.
Model saved to 'final_trained_model.pth'.


In [4]:
#option A  TF-IDF Vectorization
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Example: Using TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X = vectorizer.fit_transform(df['sentence'])
y = df['difficulty'].values

# Initialize the KFold method
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # You can choose the number of splits

# Initialize a classifier, e.g., RandomForest
model = RandomForestClassifier(n_estimators=100, random_state=42)

# To store the fold scores
scores = []

# Execute k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test fold
    predictions = model.predict(X_test)

    # Calculate accuracy or other metrics
    score = accuracy_score(y_test, predictions)
    scores.append(score)

# Print out the mean accuracy across all folds
print("Mean accuracy across all folds:", np.mean(scores))

# Retrain on the entire training set
model.fit(X, y)

# Prepare the test set (Assuming you have preprocessed it as well)
X_final_test = vectorizer.transform(test['sentence'])

# Predict on the unlabelled test set
final_predictions = model.predict(X_final_test)

# Optionally, save or return your predictions
test['predicted_difficulty'] = final_predictions
test.to_csv('final_predictions.csv', index=False)

Mean accuracy across all folds: 0.34812499999999996


In [5]:
import sentencepiece


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Mean accuracy across all folds: 0.43520833333333336


In [8]:
#hyperparameters:
# we need to try our hyperparameters in a sort of validation set (we do a kfold)

in the report we should fill the table (the report is the readme of the github)

it's more useful to play on the choice of the technique and on the choice of the hyperparameters (not so much on data cleaning)