In [None]:
# Install Poetry
!curl -sSL https://install.python-poetry.org | python3 -

# Add Poetry to PATH for the current session
import sys
# sys.path.append("/root/.local/bin")

# !poetry env use python3

# Set up a new project directory
!mkdir my_poetry_project
%cd my_poetry_project

# Initialize a new Poetry project
!poetry init --no-interaction

!poetry add torch numpy transformers datasets

!poetry install

# Upgrade fsspec using poetry
# !poetry add --upgrade fsspec

!poetry run python -c "import torch; import transformers; print('Dependencies installed successfully!')"

[36mRetrieving Poetry metadata[0m

# Welcome to [36mPoetry[0m!

This will download and install the latest version of [36mPoetry[0m,
a dependency and package manager for Python.

It will add the `poetry` command to [36mPoetry[0m's bin directory, located at:

[33m/root/.local/bin[0m

You can uninstall at any time by executing this script with the --uninstall option,
and these changes will be reverted.

Installing [36mPoetry[0m ([36m1.8.4[0m)
[1A[2KInstalling [36mPoetry[0m ([1m1.8.4[0m): [33mCreating environment[0m
[1A[2KInstalling [36mPoetry[0m ([1m1.8.4[0m): [33mInstalling Poetry[0m
[1A[2KInstalling [36mPoetry[0m ([1m1.8.4[0m): [33mCreating script[0m
[1A[2KInstalling [36mPoetry[0m ([1m1.8.4[0m): [33mDone[0m

[36mPoetry[0m ([1m1.8.4[0m) is installed now. Great!

To get started you need [36mPoetry[0m's bin directory ([33m/root/.local/bin[0m) in your `PATH`
environment variable.

Add `export PATH="[33m/root/.local/bin[0m:$PATH"` to yo

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install torch numpy transformers datasets
# !pip install evaluate
%load_ext autoreload
%autoreload 2
# You will have already needed to install a few things; (python 3.9+) > pip install poetry ; poetry install torch numpy transformers datasets
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, pipeline, EarlyStoppingCallback
from datasets import load_dataset, Dataset
import evaluate  # Correct import for metrics
from scipy.special import softmax
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import label_binarize
import copy
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

accuracy = evaluate.load("accuracy")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# PRETRAINED_MODEL_PATH= AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
PRETRAINED_MODEL_PATH = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=11)  # 11 is the number of topics you have

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Single Multi-class for Topic Modeling

In [None]:
# Load the dataset
dataset = pd.read_csv('/content/drive/MyDrive/dailyinks/data/merged_df.csv')  # Load entire dataset

# Define the mapping of topic columns to numeric labels directly
topic_mapping = {
    'exercise': 0,
    'family': 1,
    'food': 2,
    'friends': 3,
    'god': 4,
    'health': 5,
    'love': 6,
    'recreation': 7,
    'school': 8,
    'sleep': 9,
    'work': 10
}

# Map topics in the `topic` column to their corresponding integer labels
dataset['labels'] = dataset['topic'].map(topic_mapping)

# Split the dataset into training+validation and testing sets
train_val_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Load pre-trained BERT model
topic_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=11,  # Number of topic labels
    id2label={i: label for i, label in enumerate(topic_mapping.keys())},  # Map integers to label names
    label2id=topic_mapping,  # Map label names to integers
    local_files_only=True  # Assuming you have the model locally
)

topic_model.to(device)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.model_max_length = 512

# Preprocessing function
def preprocess(row):
    tokenized = tokenizer(row["text"], truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = row["labels"]
    return tokenized

# Freeze BERT embeddings and certain encoder layers, leaving task-specific head trainable
for name, param in topic_model.bert.named_parameters():
    param.requires_grad = False

# RECOMMENDED: for more capacity to learn, make BERT pooler and at least one encoder layer trainable
for name, param in topic_model.bert.pooler.named_parameters():
    param.requires_grad = True

for name, param in topic_model.bert.encoder.layer[-1].named_parameters():
    param.requires_grad = True


# Compute metrics function
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probabilities = softmax(predictions, axis=1)  # Apply softmax to get class probabilities
    predicted_classes = np.argmax(probabilities, axis=1)  # Get predicted class indices

    # One-hot encode labels for ROC-AUC
    labels_one_hot = label_binarize(labels, classes=list(topic_mapping.values()))

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predicted_classes, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predicted_classes, references=labels, average="weighted")["precision"]
    recall = recall_metric.compute(predictions=predicted_classes, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predicted_classes, references=labels, average="weighted")["f1"]
    roc_auc = roc_auc_score(labels_one_hot, probabilities, average="weighted", multi_class="ovr")  # One-vs-Rest

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, "roc_auc": roc_auc}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)  # Stop if validation loss doesn't improve for 3 epochs

# Define training arguments
training_args = TrainingArguments(
    output_dir="checkpoints",
    overwrite_output_dir=True,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    logging_steps=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=True,  # This is required for EarlyStoppingCallback
    metric_for_best_model="eval_loss", # Specify the metric to use for early stopping
)

# Implement k-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

dataset_df = train_val_data.copy()
all_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_df)):
    print(f"Processing Fold {fold + 1}/{k}...")

    train_data = dataset_df.iloc[train_idx]
    val_data = dataset_df.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_data).map(preprocess)
    val_dataset = Dataset.from_pandas(val_data).map(preprocess)

    fold_training_args = copy.deepcopy(training_args)
    fold_training_args.output_dir = f"checkpoints/fold_{fold + 1}"

    fold_trainer = Trainer(
        model=topic_model,
        args=fold_training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[early_stopping]
    )

    fold_trainer.train()
    fold_metrics = fold_trainer.evaluate()
    print(f"Metrics for Fold {fold + 1}: {fold_metrics}")
    all_metrics.append(fold_metrics)

# Average metrics across folds
average_metrics = {key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]}
print("Average Metrics Across Folds:")
print(average_metrics)

# Standard deviation metrics across folds
std_metrics = {key: np.std([metric[key] for metric in all_metrics]) for key in all_metrics[0]}
print("Standard Deviation Metrics Across Folds:")
print(std_metrics)

# Evaluate the model on the testing set
test_dataset = Dataset.from_pandas(test_data).map(preprocess)

final_metrics = fold_trainer.evaluate(test_dataset)
print("Final Metrics on Testing Set:")
print(final_metrics)

Processing Fold 1/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,1.6038,1.312402,0.64425,0.736033,0.64425,0.598334,0.934095
2,0.6342,0.56607,0.847953,0.847247,0.847953,0.842837,0.97784
3,0.5127,0.440225,0.87037,0.870482,0.87037,0.86894,0.983559
4,0.4152,0.414725,0.88499,0.884357,0.88499,0.884013,0.985207
5,0.5439,0.41058,0.885965,0.885451,0.885965,0.884957,0.985511


Metrics for Fold 1: {'eval_loss': 0.4105803370475769, 'eval_accuracy': 0.8859649122807017, 'eval_precision': 0.8854508468186087, 'eval_recall': 0.8859649122807017, 'eval_f1': 0.884957306846644, 'eval_roc_auc': 0.9855112307187546, 'eval_runtime': 29.461, 'eval_samples_per_second': 34.826, 'eval_steps_per_second': 0.305, 'epoch': 5.0}
Processing Fold 2/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.4081,0.305418,0.916179,0.915546,0.916179,0.914585,0.990442
2,0.3175,0.284776,0.916179,0.91559,0.916179,0.915674,0.99112
3,0.2118,0.288248,0.91423,0.912897,0.91423,0.913084,0.991321
4,0.1943,0.276803,0.922027,0.92149,0.922027,0.921093,0.991497
5,0.1633,0.276837,0.915205,0.914206,0.915205,0.914367,0.991528


Metrics for Fold 2: {'eval_loss': 0.27680346369743347, 'eval_accuracy': 0.9220272904483431, 'eval_precision': 0.9214897335724737, 'eval_recall': 0.9220272904483431, 'eval_f1': 0.9210928449887366, 'eval_roc_auc': 0.9914973197708636, 'eval_runtime': 29.2004, 'eval_samples_per_second': 35.137, 'eval_steps_per_second': 0.308, 'epoch': 5.0}
Processing Fold 3/5...


Map:   0%|          | 0/4102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2141,0.137643,0.95614,0.957123,0.95614,0.956097,0.998704
2,0.2108,0.126551,0.965887,0.965996,0.965887,0.965835,0.998836
3,0.1707,0.125362,0.962963,0.963202,0.962963,0.962827,0.99878
4,0.1534,0.121283,0.963938,0.964264,0.963938,0.963941,0.998871
5,0.1418,0.121783,0.961988,0.962328,0.961988,0.961991,0.998858


Metrics for Fold 3: {'eval_loss': 0.12128257751464844, 'eval_accuracy': 0.9639376218323586, 'eval_precision': 0.964264122801796, 'eval_recall': 0.9639376218323586, 'eval_f1': 0.9639406120458018, 'eval_roc_auc': 0.9988707037285777, 'eval_runtime': 29.3094, 'eval_samples_per_second': 35.006, 'eval_steps_per_second': 0.307, 'epoch': 5.0}
Processing Fold 4/5...


Map:   0%|          | 0/4103 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.158,0.089711,0.977561,0.978234,0.977561,0.977659,0.99934
2,0.112,0.080049,0.979512,0.980119,0.979512,0.979583,0.999409
3,0.1036,0.075168,0.976585,0.976846,0.976585,0.976555,0.999495
4,0.0757,0.073137,0.977561,0.977954,0.977561,0.97757,0.999504
5,0.053,0.073869,0.978537,0.978885,0.978537,0.978561,0.999487


Metrics for Fold 4: {'eval_loss': 0.07313653826713562, 'eval_accuracy': 0.9775609756097561, 'eval_precision': 0.9779543696933225, 'eval_recall': 0.9775609756097561, 'eval_f1': 0.9775701867443484, 'eval_roc_auc': 0.99950394134755, 'eval_runtime': 29.404, 'eval_samples_per_second': 34.859, 'eval_steps_per_second': 0.306, 'epoch': 5.0}
Processing Fold 5/5...


Map:   0%|          | 0/4103 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

  fold_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0734,0.039366,0.99122,0.991343,0.99122,0.991202,0.999771
2,0.0576,0.043399,0.986341,0.986509,0.986341,0.986243,0.999803
3,0.0611,0.039579,0.987317,0.987488,0.987317,0.987215,0.999832
4,0.0502,0.037144,0.987317,0.987499,0.987317,0.987255,0.999844
5,0.0317,0.037176,0.987317,0.987499,0.987317,0.987255,0.999846


Metrics for Fold 5: {'eval_loss': 0.03714396432042122, 'eval_accuracy': 0.9873170731707317, 'eval_precision': 0.9874988720297185, 'eval_recall': 0.9873170731707317, 'eval_f1': 0.9872547076177535, 'eval_roc_auc': 0.9998437394347897, 'eval_runtime': 29.6284, 'eval_samples_per_second': 34.595, 'eval_steps_per_second': 0.304, 'epoch': 5.0}
Average Metrics Across Folds:
{'eval_loss': 0.18378937616944313, 'eval_accuracy': 0.9473615746683782, 'eval_precision': 0.9473315889831838, 'eval_recall': 0.9473615746683782, 'eval_f1': 0.9469631316486569, 'eval_roc_auc': 0.995045387000107, 'eval_runtime': 29.40064, 'eval_samples_per_second': 34.8846, 'eval_steps_per_second': 0.306, 'epoch': 5.0}


Map:   0%|          | 0/1282 [00:00<?, ? examples/s]

Final Metrics on Testing Set:
{'eval_loss': 0.4132130444049835, 'eval_accuracy': 0.9024960998439937, 'eval_precision': 0.9024369492741785, 'eval_recall': 0.9024960998439937, 'eval_f1': 0.9020777762406281, 'eval_roc_auc': 0.9891053356451315, 'eval_runtime': 36.4509, 'eval_samples_per_second': 35.171, 'eval_steps_per_second': 0.302, 'epoch': 5.0}


In [None]:
# Define the path to save the model in Google Drive
save_path = '/content/drive/MyDrive/dailyinks/model/topic_model'

# Save the model and tokenizer
topic_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

import shutil

# Create a zip file of the saved model directory
shutil.make_archive('/content/topic_model', 'zip', save_path)

# Download the zip file
from google.colab import files
files.download('/content/topic_model.zip')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
classifier = pipeline("text-classification",model=topic_model, return_all_scores=False, tokenizer=tokenizer, device=device)
classifier("I had a great day with my family and felt really happy.")



[{'label': 'family', 'score': 0.999677300453186}]