In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and model folders under MyDrive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install Poetry
!curl -sSL https://install.python-poetry.org | python3 -

# Add Poetry to PATH for the current session
import sys

# Set up a new project directory
!mkdir my_poetry_project
%cd my_poetry_project

# Initialize a new Poetry project
!poetry init --no-interaction

!poetry add torch numpy transformers datasets

!poetry install

!poetry run python -c "import torch; import transformers; print('Dependencies installed successfully!')"

[36mRetrieving Poetry metadata[0m

The latest version ([1m1.8.4[0m) is already installed.
/content/my_poetry_project/my_poetry_project/my_poetry_project
/bin/bash: line 1: poetry: command not found
/bin/bash: line 1: poetry: command not found
/bin/bash: line 1: poetry: command not found
/bin/bash: line 1: poetry: command not found


In [None]:
!pip install torch numpy transformers datasets
!pip install evaluate



In [None]:
%load_ext autoreload
%autoreload 2
# You will have already needed to install a few things; (python 3.9+) > pip install poetry ; poetry install torch numpy transformers datasets
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForMaskedLM, BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from datasets import load_dataset, Dataset
import evaluate  # Correct import for metrics

import evaluate
accuracy = evaluate.load("accuracy")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
PRETRAINED_MODEL_PATH = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=11)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Multi-Label Classification for Sentiment Analysis


In [None]:
# Good copy
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback, BertConfig
from sklearn.model_selection import KFold, train_test_split
import evaluate
import copy

from sklearn.metrics import roc_auc_score

# Halt training once the monitored metric has failed to improve for 3 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
dropout_prob = 0.3

# 1. Load the dataset
dataset = pd.read_csv('/content/drive/MyDrive/dailyinks/data/merged_df.csv')
dataset.shape  # bare expression in the middle of a cell: evaluated but not displayed

# 2. Define the mapping of emotion columns to numeric labels
#    (index = position in this list, so the order below fixes the label ids)
emotion_names = [
    'afraid',
    'angry',
    'anxious',
    'calm',
    'confused',
    'disgusted',
    'excited',
    'frustrated',
    'happy',
    'proud',
    'sad',
    'satisfied',
    'surprised',
]
emotion_mapping = {emotion: label_id for label_id, emotion in enumerate(emotion_names)}

# 3. Filter the dataset to include only the text and emotion columns
relevant_columns = ['text', *emotion_mapping]
dataset_filtered = dataset[relevant_columns]


# 5. Split the dataset into training+validation and testing sets (80/20, fixed seed)
train_val_data, test_data = train_test_split(
    dataset_filtered,
    test_size=0.2,
    random_state=42,
)

# 6. Load pre-trained BERT tokenizer and cap its maximum sequence length
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.model_max_length = 512

# 7. Preprocessing function
def preprocess(row):
    """Tokenize one example and attach its multi-label target vector.

    Args:
        row: A mapping with a "text" entry and one entry per key of
            ``emotion_mapping``.

    Returns:
        The tokenizer output (truncated and padded to 512 tokens) with an extra
        "labels" entry: a list of floats, one per emotion, in
        ``emotion_mapping`` order.
    """
    encoding = tokenizer(
        row["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    label_vector = []
    for emotion in emotion_mapping:
        # Labels are stored as floats.
        label_vector.append(float(row[emotion]))
    encoding["labels"] = label_vector
    return encoding

# 8. Define the metrics for evaluation (loaded once, reused by compute_metrics)
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative inputs. Writing the same quantity as
    ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp`` avoids the overflow while
    giving the same values.

    Args:
        x: Scalar or array-like of real numbers (e.g. model logits).

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x)
    # logaddexp(0, -x) == log(1 + exp(-x)) without forming exp(-x) directly.
    return np.exp(-np.logaddexp(0, -x))

def compute_metrics(eval_pred):
    """Turn raw model outputs into the evaluation metrics reported per epoch.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; presumably both are
            (num_examples, num_labels) arrays — TODO confirm.

    Returns:
        dict with keys "accuracy", "f1", "precision", "recall" and
        "auc and roc".

    Notes:
        Accuracy, F1, precision and recall are computed on the *flattened*
        arrays, so every (example, label) cell counts as one binary decision.
        The AUC is computed on the unflattened arrays from the sigmoid
        probabilities.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)

    # Threshold at 0.5, then flatten both sides to 1-D for the `evaluate` metrics.
    flat_inputs = {
        "predictions": (probabilities > 0.5).astype(int).flatten(),
        "references": references.flatten(),
    }

    results = {"accuracy": accuracy_metric.compute(**flat_inputs)["accuracy"]}
    weighted_metrics = (
        ("f1", f1_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
    )
    for key, metric in weighted_metrics:
        results[key] = metric.compute(average="weighted", **flat_inputs)[key]

    results["auc and roc"] = roc_auc_score(
        references, probabilities, average="weighted", multi_class="ovr"
    )
    return results

# 9. Load the pre-trained BERT model
# Build ONE config carrying both the custom dropout and the classification-head
# settings, then hand it to from_pretrained. Previously the dropout config was
# created but never passed to the model, so dropout_prob had no effect.
# When an explicit `config` object is supplied, head settings such as num_labels
# must live on that config rather than be passed as extra keyword arguments.
config = BertConfig.from_pretrained(
    "bert-base-uncased",
    hidden_dropout_prob=dropout_prob,
    attention_probs_dropout_prob=dropout_prob,
    num_labels=len(emotion_mapping),
    problem_type="multi_label_classification",
    id2label={i: label for i, label in enumerate(emotion_mapping.keys())},
    label2id=emotion_mapping,
    local_files_only=True,
)

# local_files_only=True: the checkpoint is expected to be in the local cache already.
latest_sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config,
    local_files_only=True,
)

# Freeze most of BERT's parameters: switch gradients off for the whole encoder first...
bert_backbone = latest_sentiment_model.bert
for weight in bert_backbone.parameters():
    weight.requires_grad = False

# ...then re-enable them for the pooler and the last encoder layer only.
# Parameters outside `.bert` (e.g. the classifier head) are not touched here.
for trainable_module in (bert_backbone.pooler, bert_backbone.encoder.layer[-1]):
    for weight in trainable_module.parameters():
        weight.requires_grad = True

# 10. Training arguments template (copied and given a per-fold output_dir later)
_training_arg_values = {
    # where checkpoints go
    "output_dir": "checkpoints",
    "overwrite_output_dir": True,
    # optimisation
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # logging / evaluation / checkpoint cadence
    "logging_steps": 5,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "report_to": "none",
    # required for EarlyStoppingCallback: track and restore the best checkpoint
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
}
training_args = TrainingArguments(**_training_arg_values)

# 11. Implement k-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

dataset_df = train_val_data.copy()
all_metrics = []
label_columns = list(emotion_mapping.keys())

# Snapshot the untrained starting weights. Every fold must start from this same
# state: if one model instance is simply trained fold after fold, each fold's
# validation rows have already been seen as training rows in earlier folds, and
# the cross-validation scores become optimistically biased.
initial_state_dict = copy.deepcopy(latest_sentiment_model.state_dict())

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_df)):
    print(f"Processing Fold {fold + 1}/{k}...")

    # Reset weights in place; requires_grad flags on the parameters are unaffected.
    latest_sentiment_model.load_state_dict(initial_state_dict)

    train_data = dataset_df.iloc[train_idx]
    val_data = dataset_df.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_data).map(preprocess)
    val_dataset = Dataset.from_pandas(val_data).map(preprocess)

    # Drop the raw text and per-emotion columns; preprocess already packed the
    # emotions into the "labels" field.
    train_dataset = train_dataset.remove_columns(["text"] + label_columns)
    val_dataset = val_dataset.remove_columns(["text"] + label_columns)

    fold_training_args = copy.deepcopy(training_args)
    fold_training_args.output_dir = f"checkpoints/fold_{fold + 1}"

    fold_trainer = Trainer(
        model=latest_sentiment_model,
        args=fold_training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        # Fresh callback per fold: the callback keeps its patience counter as
        # instance state, so one shared instance would carry it across folds.
        callbacks=[copy.deepcopy(early_stopping)]
    )

    fold_trainer.train()
    fold_metrics = fold_trainer.evaluate()
    print(f"Metrics for Fold {fold + 1}: {fold_metrics}")
    all_metrics.append(fold_metrics)

# After the loop, latest_sentiment_model / fold_trainer hold the model trained on
# the final fold's training split only.

# 12. Average metrics across folds (one mean per key reported by Trainer.evaluate)
average_metrics = {}
for metric_name in all_metrics[0]:
    per_fold_values = [fold_result[metric_name] for fold_result in all_metrics]
    average_metrics[metric_name] = np.mean(per_fold_values)

print("Average Metrics Across Folds:")
print(average_metrics)

# 13. Evaluate the model on the testing set
# `fold_trainer` is the trainer left over from the last cross-validation fold.
test_dataset = Dataset.from_pandas(test_data).map(preprocess)
test_dataset = test_dataset.remove_columns(["text"] + list(emotion_mapping.keys()))

final_metrics = fold_trainer.evaluate(test_dataset)
print("Final Metrics on Testing Set:")
print(final_metrics)

# Number of positive examples per emotion (class balance check)
print(dataset_filtered[list(emotion_mapping.keys())].sum())

# Preview only: printing the whole frame floods the output and puts the raw
# free-text entries into the saved notebook.
print(dataset_filtered.head())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing Fold 1/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.2066,0.164415,0.936722,0.931877,0.930569,0.936722,0.907418
2,0.1543,0.14664,0.941371,0.938453,0.937149,0.941371,0.926918
3,0.1068,0.137404,0.944294,0.940523,0.939745,0.944294,0.934482
4,0.0883,0.136652,0.944969,0.942231,0.941155,0.944969,0.935009
5,0.107,0.135136,0.945644,0.943322,0.942243,0.945644,0.935874


Metrics for Fold 1: {'eval_loss': 0.13513602316379547, 'eval_accuracy': 0.945644024591393, 'eval_f1': 0.9433220846255956, 'eval_precision': 0.9422432843962261, 'eval_recall': 0.945644024591393, 'eval_auc and roc': 0.9358740798021349, 'eval_runtime': 29.3688, 'eval_samples_per_second': 34.935, 'eval_steps_per_second': 4.392, 'epoch': 5.0}
Processing Fold 2/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.1367,0.117284,0.949543,0.949023,0.948587,0.949543,0.963742


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.1367,0.117284,0.949543,0.949023,0.948587,0.949543,0.963742
2,0.1147,0.118416,0.950142,0.949001,0.948264,0.950142,0.96246
3,0.1,0.116893,0.951417,0.950597,0.95,0.951417,0.962937
4,0.0709,0.117821,0.950142,0.949311,0.948695,0.950142,0.962339
5,0.0468,0.116553,0.950217,0.949185,0.948485,0.950217,0.962913


Metrics for Fold 2: {'eval_loss': 0.11655297875404358, 'eval_accuracy': 0.9502174239016344, 'eval_f1': 0.9491851519102411, 'eval_precision': 0.9484852932002835, 'eval_recall': 0.9502174239016344, 'eval_auc and roc': 0.9629127824951602, 'eval_runtime': 29.2296, 'eval_samples_per_second': 35.101, 'eval_steps_per_second': 4.413, 'epoch': 5.0}
Processing Fold 3/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.0881,0.072077,0.972185,0.972258,0.972338,0.972185,0.98815
2,0.1086,0.073687,0.97091,0.970754,0.970623,0.97091,0.987596
3,0.0695,0.069055,0.971435,0.971286,0.971162,0.971435,0.988392
4,0.042,0.070327,0.97181,0.971575,0.971403,0.97181,0.987581
5,0.0753,0.069445,0.972185,0.97203,0.971903,0.972185,0.987847


Metrics for Fold 3: {'eval_loss': 0.0690554529428482, 'eval_accuracy': 0.9714349977507872, 'eval_f1': 0.9712863861164986, 'eval_precision': 0.9711619738346475, 'eval_recall': 0.9714349977507872, 'eval_auc and roc': 0.9883916385922568, 'eval_runtime': 29.1759, 'eval_samples_per_second': 35.166, 'eval_steps_per_second': 4.421, 'epoch': 5.0}
Processing Fold 4/5...


Map:   0%|          | 0/4103 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.0811,0.048802,0.982739,0.982653,0.982591,0.982739,0.996045
2,0.1008,0.044632,0.982439,0.982235,0.98216,0.982439,0.996132
3,0.0413,0.04416,0.982364,0.982292,0.982236,0.982364,0.995925
4,0.0624,0.043483,0.983189,0.983178,0.983166,0.983189,0.996174
5,0.0393,0.043034,0.982889,0.982853,0.982821,0.982889,0.996194


Metrics for Fold 4: {'eval_loss': 0.04303402826189995, 'eval_accuracy': 0.9828893058161351, 'eval_f1': 0.9828529510300142, 'eval_precision': 0.9828209476204727, 'eval_recall': 0.9828893058161351, 'eval_auc and roc': 0.9961937565181884, 'eval_runtime': 29.1196, 'eval_samples_per_second': 35.2, 'eval_steps_per_second': 4.43, 'epoch': 5.0}
Processing Fold 5/5...


Map:   0%|          | 0/4103 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc and roc
1,0.0321,0.029117,0.989719,0.989739,0.989763,0.989719,0.99863
2,0.05,0.029989,0.989343,0.989426,0.989568,0.989343,0.99826
3,0.0306,0.027468,0.988668,0.988718,0.988788,0.988668,0.998357
4,0.0121,0.02657,0.988968,0.988974,0.98898,0.988968,0.998407
5,0.0334,0.026621,0.988968,0.989005,0.989054,0.988968,0.998273


Metrics for Fold 5: {'eval_loss': 0.026570387184619904, 'eval_accuracy': 0.988968105065666, 'eval_f1': 0.9889740470293772, 'eval_precision': 0.9889802710545023, 'eval_recall': 0.988968105065666, 'eval_auc and roc': 0.9984066491990614, 'eval_runtime': 29.279, 'eval_samples_per_second': 35.008, 'eval_steps_per_second': 4.406, 'epoch': 5.0}
Average Metrics Across Folds:
{'eval_loss': 0.07806977406144142, 'eval_accuracy': 0.9678307714251233, 'eval_f1': 0.9671241241423454, 'eval_precision': 0.9667383540212263, 'eval_recall': 0.9678307714251233, 'eval_auc and roc': 0.9763557813213604, 'eval_runtime': 29.23458, 'eval_samples_per_second': 35.082, 'eval_steps_per_second': 4.4124, 'epoch': 5.0}


Map:   0%|          | 0/1282 [00:00<?, ? examples/s]

Final Metrics on Testing Set:
{'eval_loss': 0.2029404193162918, 'eval_accuracy': 0.9408976359054362, 'eval_f1': 0.9392254426669173, 'eval_precision': 0.9381423930622346, 'eval_recall': 0.9408976359054362, 'eval_auc and roc': 0.9284281739101088, 'eval_runtime': 36.3679, 'eval_samples_per_second': 35.251, 'eval_steps_per_second': 4.427, 'epoch': 5.0}
afraid         494
angry          768
anxious        124
calm           373
confused       204
disgusted      136
excited        290
frustrated     248
happy         2399
proud          564
sad           1429
satisfied      590
surprised      274
dtype: int64
                                                   text  afraid  angry  \
0     My family was the most salient part of my day,...       0      0   
1     Yoga keeps me focused. I am able to take some ...       0      0   
2     Yesterday, my family and I played a bunch of b...       0      0   
3     Yesterday, I visited my parents and had dinner...       0      0   
4     Yesterday, I 

In [None]:
# Spread of each metric across folds (np.std default, i.e. population std with ddof=0)
std_metrics = {}
for metric_name in all_metrics[0]:
    per_fold_values = [fold_result[metric_name] for fold_result in all_metrics]
    std_metrics[metric_name] = np.std(per_fold_values)

print("Standard Deviation Metrics Across Folds:")
print(std_metrics)

Standard Deviation Metrics Across Folds:
{'eval_loss': 0.04170962034752266, 'eval_accuracy': 0.0172569900323941, 'eval_f1': 0.018058146970683888, 'eval_precision': 0.018472161582846713, 'eval_recall': 0.0172569900323941, 'eval_auc and roc': 0.023850074505745528, 'eval_runtime': 0.08564658545441328, 'eval_samples_per_second': 0.09841341371987768, 'eval_steps_per_second': 0.01297073629367265, 'epoch': 0.0}


In [None]:
import shutil

from google.colab import drive, files

# Make sure Drive is mounted before writing to it.
drive.mount('/content/drive')

# Destination folder in Google Drive for the model and tokenizer files
save_path = '/content/drive/MyDrive/dailyinks/model/latest_sentiment_model'

# Write the model first, then the tokenizer, into the same folder.
for artifact in (latest_sentiment_model, tokenizer):
    artifact.save_pretrained(save_path)

# Zip the saved folder to /content/latest_sentiment_model.zip ...
shutil.make_archive('/content/latest_sentiment_model', 'zip', save_path)

# ... and trigger a browser download of the archive.
files.download('/content/latest_sentiment_model.zip')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>