In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The pipeline section further down reads the dataset from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## **Machine Learning for Business II - Project**
## Business Use Case: Sentiment Analysis on a Disneyland dataset


# **BASELINE**

In [None]:
# Import necessary libraries and modules

# Pandas is used for data manipulation and analysis. 
# Pandas provides data structures like DataFrame
import pandas as pd

# Import the train_test_split function from scikit-learn
# train_test_split is used to split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

# Import PyTorch for building and training the model
# Torch is a deep learning library that provides tensor operations and support for building neural networks.
import torch

# Import DataLoader and Dataset from PyTorch
# DataLoader is used to efficiently load the dataset in batches during training.
from torch.utils.data import DataLoader, Dataset

# Import BERT components from the Hugging Face transformers library
# - BertTokenizer is used to convert raw text into tokens that BERT can understand.
# - BertForSequenceClassification is a pre-trained BERT model specifically for text classification tasks.
# - AdamW is an optimizer used to adjust the model's weights based on the gradients calculated during training.
# - BertConfig is used to configure the BERT model, such as specifying the number of output labels.
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

# Import the learning rate scheduler for training
# get_linear_schedule_with_warmup is used to adjust the learning rate over time.
# It starts with a warm-up phase, where the learning rate gradually increases, then decreases linearly.
from transformers import get_linear_schedule_with_warmup

# Import NumPy for numerical computations
# NumPy is a library that provides support for arrays and numerical operations such as averaging losses.
import numpy as np

In [None]:
# Read the cleaned Disney reviews workbook into a DataFrame.
df = pd.read_excel("Dataset ML project - Disney Reviews - Cleaned.xlsx")

# Model input (review text) and target (sentiment label), each taken from its own column:
# - 'Content'      -> X
# - 'feeling cont' -> y
X, y = df['Content'], df['feeling cont']

In [None]:
# Map the sentiment labels to integers
# In order for the machine learning model to work with the labels, we need to convert the text labels into numbers.
# This is called label encoding, where each unique sentiment label is assigned a numerical value.
# The label mapping here is as follows:
# - 'négatif' (negative) is mapped to 0
# - 'neutre' (neutral) is mapped to 1
# - 'positif' (positive) is mapped to 2
# - 'mitigé' (mixed) is mapped to 3
label_map = {'négatif': 0, 'neutre': 1, 'positif': 2, 'mitigé': 3}

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint.
# It turns raw review text into the token ids the BERT model consumes.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [None]:
# Hold out 20% of the reviews for testing and keep 80% for training.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report how many reviews ended up in the training partition.
print(X_train.shape)

In [None]:
# (Re)load the tokenizer of the 'bert-base-uncased' checkpoint; it is shared by both partitions.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_reviews(texts):
    """Tokenize a pandas Series of reviews into fixed-length PyTorch tensors.

    Each review is truncated or padded to exactly 350 tokens, so that every
    row of the returned tensors has the same length.
    """
    return tokenizer(
        texts.tolist(),           # the tokenizer expects a plain list of strings
        max_length=350,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Identical tokenization settings for the training and the test partition.
train_encodings = _encode_reviews(X_train)
test_encodings = _encode_reviews(X_test)

# Translate the text labels into their integer ids through `label_map`.
# A label that is not a key of `label_map` raises KeyError.
test_labels = list(map(label_map.__getitem__, y_test))
train_labels = list(map(label_map.__getitem__, y_train))

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch

# Convert the integer labels into tensors.
# dtype is spelled out because the classification loss expects integer (long) class ids.
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.long)

# Group the tokenized inputs with their labels.
# Each dataset item is a tuple (input_ids, attention_mask, label), which is the order
# the training/evaluation loop unpacks a batch in.
X_test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels_tensor)
X_train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels_tensor)

# Batch the datasets, 16 samples at a time.
# - Training data is reshuffled every epoch so the batches differ between epochs.
# - Test data is NOT shuffled: shuffling brings nothing to evaluation and makes the
#   batch-averaged test loss vary from run to run when the last batch is smaller.
X_test_loader = DataLoader(X_test_dataset, batch_size=16, shuffle=False)
X_train_loader = DataLoader(X_train_dataset, batch_size=16, shuffle=True)

### Define hyperparameters

In [None]:
# Define hyperparameters (manually chosen for the baseline).
num_epochs = 1           # Number of passes over the training set
learning_rate = 1e-5     # Step size used by the optimizer
batch_size = 15          # NOTE(review): not read anywhere below; the DataLoaders hard-code batch_size=16

# Check if a GPU (CUDA) is available; if not, use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BERT checkpoint with a classification head for 4 labels
# (one output per key of `label_map`), and move it to the selected device
# before the optimizer is created.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model = model.to(device)

# Optimizer: PyTorch's own AdamW replaces the deprecated `transformers.AdamW`.
# eps and weight_decay are set explicitly to the defaults of the old transformers
# implementation (1e-6 and 0.0) so the update rule stays numerically the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)


In [None]:
from sklearn.metrics import recall_score
import torch

# Prefer the CUDA device when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def train_test_model(model, train_loader, test_loader, optimizer, num_epochs, device=None):
    """Fine-tune ``model`` on ``train_loader``, then evaluate it on ``test_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` tensor batches
            used for training.
        test_loader: Iterable of batches with the same layout, used for evaluation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Optional ``torch.device`` to run on. When omitted, CUDA is used if
            available and the CPU otherwise.

    Returns:
        tuple: ``(avg_test_loss, class_error_rate)`` where ``avg_test_loss`` is the mean
        of the per-batch losses on ``test_loader`` and ``class_error_rate`` is the array
        ``1 - recall`` with one entry per class, as returned by
        ``recall_score(..., average=None)``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Moving the model once is enough; it does not need to be repeated every epoch.
    model = model.to(device)

    # ---- Training ----
    for epoch in range(num_epochs):
        model.train()  # enable dropout etc.
        epoch_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()  # gradients accumulate by default, so reset them per batch

            # Batch layout is (input_ids, attention_mask, labels); move all three to the device.
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Surface the mean training loss of the epoch (it used to be computed and then discarded).
        print(f"Epoch {epoch + 1}/{num_epochs} - training loss: {epoch_loss / len(train_loader):.4f}")

    # ---- Evaluation ----
    model.eval()  # disable dropout etc.
    y_true, y_pred = [], []
    total_loss = 0.0

    with torch.no_grad():  # no gradients needed for evaluation
        for batch in test_loader:
            # The evaluation tensors must live on the same device as the model,
            # otherwise the forward pass fails when the model is on the GPU.
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Predicted class = index of the highest logit.
            predictions = torch.argmax(outputs.logits, dim=-1)

            y_true.extend(labels.cpu().tolist())
            y_pred.extend(predictions.cpu().tolist())

    # Mean of the per-batch losses over the test set.
    avg_loss = total_loss / len(test_loader)

    # Per-class recall, and the per-class error rate derived from it.
    recall = recall_score(y_true, y_pred, average=None)
    class_error_rate = 1 - recall

    return avg_loss, class_error_rate

In [None]:
# Fine-tune on the training loader, then score the model on the held-out loader.
# `train_test_model` hands back the mean test loss and the per-class error rate.
avg_loss, class_error_rate = train_test_model(
    model=model,
    train_loader=X_train_loader,
    test_loader=X_test_loader,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

# Report both metrics, one per line.
print(
    f"Average Loss: {avg_loss}",
    f"Class-wise Error Rate: {class_error_rate}",
    sep="\n",
)

In [None]:
# Persist the baseline metrics to "Baseline score.txt" (overwritten on every run):
# one line for the final average loss, one for the per-class error rate as a plain list.
with open("Baseline score.txt", "w") as f:
    f.writelines([
        f"Final Loss: {avg_loss}\n",
        f"Class-wise error rate: {class_error_rate.tolist()}\n",
    ])

# **PIPELINE**

## Step 1 - Loading of the dataset

In [None]:
#%pip install optuna

In [7]:
# Import necessary libraries and modules

# Pandas is used for data manipulation and analysis. 
# Pandas provides data structures like DataFrame
import pandas as pd

# Importing train_test_split from scikit-learn to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

# Importing PyTorch for tensor operations (multidimensional arrays which are the core data structures for neural networks) and providing support for building and training deep learning models
import torch

# Importing DataLoader and Dataset from PyTorch for creating data pipelines and managing datasets efficiently
from torch.utils.data import DataLoader, Dataset, Subset

# Importing components from Hugging Face Transformers library for BERT tokenization and sequence classification
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

# Importing scheduler to manage the learning rate schedule during training
from transformers import get_linear_schedule_with_warmup

# Importing numpy for efficient numerical operations, especially for working with arrays
import numpy as np

import optuna

In [10]:
# Read the cleaned reviews workbook from the mounted Google Drive.
df = pd.read_excel("/content/drive/MyDrive/ML business - project/Dataset ML project - Disney Reviews - Cleaned.xlsx")

# Features (review text) and target (sentiment label).
X, y = df['Content'], df['feeling cont']

# 80/20 train/test split; the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Text label -> integer class id:
# 'négatif' -> 0, 'neutre' -> 1, 'positif' -> 2, 'mitigé' -> 3
label_map = dict(zip(('négatif', 'neutre', 'positif', 'mitigé'), range(4)))

# Integer class id of every training review, in the order of `y_train`.
# A label missing from `label_map` raises KeyError.
labels = list(map(label_map.__getitem__, y_train))

In [None]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the training reviews as fixed-length PyTorch tensors:
# every review is cut or padded to exactly 350 tokens.
encodings = tokenizer(
    X_train.tolist(),             # plain list of review strings
    truncation=True,              # drop tokens beyond max_length
    padding='max_length',         # pad shorter reviews up to max_length
    max_length=350,
    return_tensors='pt',          # PyTorch tensors rather than Python lists
)


In [12]:
# Import necessary libraries
import torch
from torch.utils.data import TensorDataset, DataLoader

# Integer class ids of the training reviews as a tensor.
labels_tensor = torch.tensor(labels)

# Bundle the tokenized inputs with their labels.
# Each item of the dataset is the tuple (input_ids, attention_mask, label).
train_dataset = TensorDataset(
    encodings['input_ids'],
    encodings['attention_mask'],
    labels_tensor,
)

# Batched view over the full training set: 16 samples per batch, in dataset order (no shuffling).
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=False)


## Step 2 - Training
Training of a whole comment analysis model on a manually generated training dataset


In [13]:
from sklearn.model_selection import KFold

# 5-fold cross-validation splitter.
# The data is cut into 5 folds; each fold serves once as the validation part while
# the other 4 are used for training, so the model is trained and evaluated 5 times.
# shuffle=False (the default, spelled out here) keeps the folds as consecutive slices
# of the dataset in its current order.
kf = KFold(n_splits=5, shuffle=False)


In [None]:
# Configuration of the 'bert-base-uncased' checkpoint with a 4-way classification head
# (one output per key of `label_map`).
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=4)

# Pre-trained BERT with the classification head described by `config`.
# NOTE: the Optuna `objective` below builds its own fresh model and optimizer for every
# fold; it only reuses `config` from this cell.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Optimizer: PyTorch's own AdamW replaces the deprecated `transformers.AdamW`.
# eps and weight_decay are set explicitly to the defaults of the old transformers
# implementation (1e-6 and 0.0) so the update rule stays numerically the same.
# A small learning rate (5e-5) keeps the fine-tuning updates gradual.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [51]:
# Early draft of the Optuna objective: it only samples the hyperparameters.
# NOTE: the complete `objective` defined in the following cell reuses this name and
# therefore replaces this draft once that cell has run.
def objective(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Optuna trial (anything exposing ``suggest_categorical`` and
            ``suggest_float``).

    Returns:
        None. This draft does not train or score a model yet.
    """
    # Batch size: one of a fixed list of candidates.
    batch_size = trial.suggest_categorical("batch_size", [10, 20, 30, 50])

    # Learning rate on a log scale between 1e-5 and 5e-5.
    # `suggest_float(..., log=True)` is the replacement for the deprecated `suggest_loguniform`.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)

    # Number of epochs: exactly one of 2, 3 or 5.
    # The former `suggest_int("num_epochs", 2, 3, 5)` passed 5 in the `step` position,
    # so it did not offer 3 or 5 as candidates; a categorical choice matches the intent.
    num_epochs = trial.suggest_categorical("num_epochs", [2, 3, 5])


In [52]:
# Define the objective function for Optuna to optimize
# Optuna will adjust hyperparameters to minimize the error rate across multiple trials.
def objective(trial):
    """Score one hyperparameter configuration with k-fold cross-validation.

    For every fold produced by the module-level ``kf`` over ``train_dataset``, a fresh
    BERT classifier (built from the module-level ``config``) is fine-tuned on the
    training part and evaluated on the held-out part.

    Args:
        trial: Optuna trial used to sample ``batch_size``, ``learning_rate`` and
            ``num_epochs``.

    Returns:
        float: Mean over folds of the class-averaged error rate (``1 - recall``).
        Lower is better.

    Side effects:
        Prints progress to stdout and appends one line per fold to ``score.txt``.
    """
    # ---- Hyperparameter search space ----
    # Batch size: one of a fixed list of candidates.
    batch_size = trial.suggest_categorical("batch_size", [10, 20, 30, 50])

    # Learning rate on a logarithmic scale between 1e-5 and 5e-5.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)

    # Number of epochs from 1 to 5 in steps of 2 (i.e. 1, 3 or 5).
    num_epochs = trial.suggest_int("num_epochs", 1, 5, step=2)

    # Run on the GPU when one is available; model and batches are moved explicitly below.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    avg_loss = []        # mean training loss per batch, one entry per fold
    avg_error_rate = []  # class-averaged error rate, one entry per fold

    for fold, (train_idx, test_idx) in enumerate(kf.split(train_dataset)):
        print(f'Fold {fold + 1}')

        # Fresh model per fold so that no weights leak from one fold to the next.
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
        model = model.to(device)

        # PyTorch's AdamW replaces the deprecated `transformers.AdamW`; eps/weight_decay
        # mirror the defaults of the old implementation to keep the same update rule.
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )

        # Views on the training / held-out part of this fold.
        train_subset = Subset(train_dataset, train_idx)
        test_subset = Subset(train_dataset, test_idx)

        train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        test_dataloader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

        # ---- Training ----
        model.train()
        total_loss = 0.0
        num_batches = 0  # counted over ALL epochs so the average below is per batch
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                optimizer.zero_grad()  # gradients accumulate by default

                # Batch layout is (input_ids, attention_mask, labels).
                input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                num_batches += 1

        # Mean loss per processed batch. Dividing by the batches of a single epoch
        # would inflate the value by a factor of `num_epochs`.
        avg_loss.append(total_loss / max(num_batches, 1))

        # ---- Evaluation on the held-out part of the fold ----
        model.eval()
        y_true = []
        y_pred = []

        with torch.no_grad():
            for batch in test_dataloader:
                input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

                outputs = model(input_ids, attention_mask=attention_mask)
                # Predicted class = index of the highest logit.
                predictions = torch.argmax(outputs.logits, dim=-1)

                y_true.extend(labels.cpu().tolist())
                y_pred.extend(predictions.cpu().tolist())

        # Show a small sample of predictions once per fold (not once per batch).
        for true_label, pred_label in zip(y_true[:10], y_pred[:10]):
            print(f"True label: {true_label}, Prediction: {pred_label}")

        # Per-class recall and the per-class error rate derived from it.
        recall = recall_score(y_true, y_pred, average=None)
        error_rate = 1 - recall
        avg_error_rate.append(np.mean(error_rate))
        print(f"Trial {trial.number} - Class-wise error rate:", error_rate)

        # Append the running averages (over the folds completed so far) to score.txt.
        # NOTE: the file uses a 1-based trial number while the console line above is 0-based.
        with open("score.txt", "a") as f:
            f.write(f"Trial {trial.number + 1} - Avg Loss: {np.mean(avg_loss):.4f}, Avg Class-wise error rate: {np.mean(avg_error_rate):.4f}\n")

        # Release this fold's model before the next one is created to limit peak memory.
        del model, optimizer
        if device.type == "cuda":
            torch.cuda.empty_cache()

    # Value minimized by Optuna: mean class-averaged error rate across folds.
    return np.mean(avg_error_rate)

In [None]:
# Create the study. direction="minimize" because `objective` returns an error rate,
# so lower values are better.
study = optuna.create_study(direction="minimize")

# Launch the search. A single trial is only a smoke test; a real search would use
# many more trials (e.g. 20 or 50).
study.optimize(objective, n_trials=1)

# Report the trial with the lowest error rate.
print("Best trial:")
trial = study.best_trial
print(f"  Best Error Rate: {trial.value}")
print(f"  Best Hyperparameters: {trial.params}")

# Keep the winning hyperparameters under explicit names for later use.
best_batch_size, best_learning_rate, best_num_epochs = (
    trial.params[name] for name in ("batch_size", "learning_rate", "num_epochs")
)


In [None]:
import mlflow
from mlflow.models import infer_signature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Lightweight text classifier tracked in MLflow.
# Logistic regression cannot consume raw strings, so the reviews are first turned into
# TF-IDF vectors; both steps are bundled in one pipeline that takes text as input.
logistic_regressor = LogisticRegression(max_iter=1000)
text_classifier = make_pipeline(TfidfVectorizer(), logistic_regressor)

# The model has to be fitted before `predict` can be called (an unfitted estimator
# raises NotFittedError). Train on the training partition only.
train_texts = X_train.astype(str)
train_targets = [label_map[label] for label in y_train]
text_classifier.fit(train_texts, train_targets)

# The signature records the input and output structure of the model (names and types),
# which documents the model and is used when it is loaded or served later.
signature = infer_signature(train_texts, text_classifier.predict(train_texts))

# Log the fitted pipeline to MLflow inside an explicit run.
# `artifact_path` names the folder of the model inside the run's artifacts.
with mlflow.start_run(run_name="logistic-regression-baseline"):
    mlflow.sklearn.log_model(
        text_classifier,
        artifact_path="logistic_regression_baseline",
        signature=signature,
    )


## Step 3 - Testing

## Step 4 - Analysis of Metrics

**BONUS**
1) If there is proper Experimentation Tracking.
2) If there is a documented HTTP API that serves the trained model.