In [18]:
# Install the Hugging Face `transformers` package into the running kernel's environment.
# NOTE(review): no version is pinned here; consider pinning one so re-runs stay reproducible.
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
# Import necessary libraries
import random
import pandas as pd
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# from torch.utils.data import DataLoader



In [20]:
# Download necessary NLTK resources
# "punkt_tab" is fetched in addition to "punkt": recent NLTK releases make
# word_tokenize look up the punkt_tab tables, and a fresh environment would
# otherwise fail with a LookupError in the preprocessing step.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Load 20 Newsgroups dataset (all subsets, restricted to four categories)
newsgroups = fetch_20newsgroups(
    subset="all",
    categories=["sci.space", "rec.sport.hockey", "talk.politics.guns", "rec.autos"],
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# Put the raw documents and their integer class ids side by side in one frame
df = pd.DataFrame(
    data={
        "text": newsgroups.data,
        "label": newsgroups.target,
    },
)



In [22]:
# Preprocess the text data
# regex=True is passed explicitly: the patterns below are regular expressions,
# and pandas >= 2.0 treats the pattern as a literal string by default, which
# would silently turn both replacements into no-ops.
stop_words = set(stopwords.words("english"))
df["text"] = (
    df["text"]
    .str.lower()  # Lowercase text
    .str.replace(r"[^\w\s]", "", regex=True)  # Remove punctuation
    .str.replace(r"\d+", "", regex=True)  # Remove digits
    .apply(word_tokenize)  # Tokenize text
    # Drop stopwords and join the remaining tokens back into one string
    .apply(lambda tokens: " ".join(tok for tok in tokens if tok not in stop_words))
    .str.strip()  # Strip whitespace
)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# One output unit per distinct label present in the training split
num_labels = len(train_df["label"].unique())

# Pre-trained BERT tokenizer plus a sequence-classification model on the same checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

# Turn off gradients for every parameter of the base model so that only the
# parameters outside `base_model` remain trainable
for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad = False


  df["text"] = df["text"].str.replace(r"[^\w\s]", "")  # Remove punctuation and digits
  df["text"] = df["text"].str.replace(r"\d+", "")
loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_drop

In [23]:
# Both splits are encoded with the same settings: truncate to at most 512
# tokens and pad each split to its own longest sequence
encode_kwargs = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_df["text"].tolist(), **encode_kwargs)
test_encodings = tokenizer(test_df["text"].tolist(), **encode_kwargs)


In [24]:
# Define a custom PyTorch dataset for the 20 Newsgroups dataset
class NewsGroupDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to a per-sample indexable sequence
            (anything exposing ``.items()``).
        labels: indexable sequence of labels, one per sample.

    Each item is a dict of tensors: one entry per encoding field (the
    ``"overflowing_tokens"`` field is skipped) plus a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, rows in self.encodings.items():
            if field == "overflowing_tokens":
                continue
            sample[field] = torch.tensor(rows[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


# Convert the tokenized data into PyTorch datasets
train_dataset = NewsGroupDataset(train_encodings, train_df["label"].tolist())
test_dataset = NewsGroupDataset(test_encodings, test_df["label"].tolist())

# Use the GPU when one is available and fall back to the CPU otherwise, so that
# moving the model to `device` does not fail on a machine without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the training arguments for the Trainer object
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE: max_steps below takes precedence over this value (the Trainer logs
    # "max_steps is given, it will override any value given in num_train_epochs")
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    max_steps=1000,  # the actual training length: 1000 optimization steps
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # After training, restore the checkpoint with the highest "f1" metric
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision is only enabled when a CUDA device is present; a
    # hard-coded True is rejected on CPU-only machines
    fp16=torch.cuda.is_available(),
)

# AdamW over the model's parameters with an explicit learning rate and epsilon
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
    eps=1e-8,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [25]:
# Train the model
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: evaluation result exposing ``predictions`` (per-class scores; the
            predicted class is the argmax over axis 1) and ``label_ids``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    actual = pred.label_ids
    predicted = pred.predictions.argmax(axis=1)  # computed once, reused by every metric
    # zero_division=0 yields the same value scikit-learn falls back to when a
    # class is never predicted, without emitting a warning on every evaluation
    return {
        "accuracy": accuracy_score(actual, predicted),
        "precision": precision_score(
            actual, predicted, average="weighted", zero_division=0
        ),
        "recall": recall_score(
            actual, predicted, average="weighted", zero_division=0
        ),
        "f1": f1_score(actual, predicted, average="weighted", zero_division=0),
    }


model = model.to(device)

# NOTE(review): the test split doubles as the evaluation set, and the training
# arguments select the best checkpoint by its F1 on that set, so the final
# test metrics are optimistically biased. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Custom optimizer; passing None lets the Trainer build its own LR scheduler
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics,
)

trainer.train()

eval_results = trainer.evaluate(test_dataset)
print(eval_results)


max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 3108
  Num Epochs = 11
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
  Number of trainable parameters = 3076


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4045,1.38869,0.268638,0.29522,0.268638,0.115468
2,1.3724,1.365763,0.341902,0.346891,0.341902,0.260231
3,1.3546,1.352365,0.361183,0.380752,0.361183,0.332156
4,1.3424,1.341477,0.354756,0.43749,0.354756,0.295747
5,1.344,1.326827,0.38946,0.50614,0.38946,0.34573
6,1.3155,1.311633,0.438303,0.506034,0.438303,0.414737
7,1.3144,1.300544,0.502571,0.512151,0.502571,0.486865
8,1.3195,1.293889,0.501285,0.539949,0.501285,0.485408
9,1.2894,1.288447,0.51928,0.535797,0.51928,0.503882
10,1.2917,1.286644,0.51928,0.541876,0.51928,0.503962


***** Running Evaluation *****
  Num examples = 778
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-98
Configuration saved in ./results/checkpoint-98/config.json
Model weights saved in ./results/checkpoint-98/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-686] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 778
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-196
Configuration saved in ./results/checkpoint-196/config.json
Model weights saved in ./results/checkpoint-196/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-784] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 778
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-294
Configuration saved in ./results/checkpoint-294/config.json
Model weights saved in ./results/checkpoint-294/pytorch

{'eval_loss': 1.2866437435150146, 'eval_accuracy': 0.519280205655527, 'eval_precision': 0.5418755643622584, 'eval_recall': 0.519280205655527, 'eval_f1': 0.5039618328947076, 'eval_runtime': 8.6098, 'eval_samples_per_second': 90.362, 'eval_steps_per_second': 1.51, 'epoch': 10.2}


In [26]:
# Draw one row of the test split at random and show its text and true class
sample_index = random.randint(0, len(test_df) - 1)
sample_row = test_df.iloc[sample_index]
sample_text = sample_row["text"]
sample_label = sample_row["label"]
print("Sample text:", sample_text)
print("True label:", newsgroups.target_names[sample_label])

# Encode the single sample as PyTorch tensors
sample_encoding = tokenizer.encode_plus(
    sample_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
)


# Persist the model together with the sample encoding.
# torch.save pickles the complete model object here (not just a state_dict),
# so no gradient context is involved in writing the file.
model.eval()
model_and_encoding = {"model": model, "encoding": sample_encoding}
torch.save(model_and_encoding, "model_and_encoding.pt")

# Load the saved model and encoding
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# weights_only=False is required because the file holds full Python objects,
# which the restricted loader (the default in recent PyTorch releases) rejects.
# This is acceptable only because the file was written a few lines above;
# never unpickle files from an untrusted source.
# map_location places the stored tensors on the device used for inference.
model_and_encoding = torch.load(
    "model_and_encoding.pt", map_location=device, weights_only=False
)
saved_model = model_and_encoding["model"].to(device)
saved_model.eval()  # inference mode (disables dropout)
saved_encoding = model_and_encoding["encoding"]

# Move encoding tensor to the same device as the saved model
saved_encoding = {k: v.to(device) for k, v in saved_encoding.items()}

# Run inference
with torch.no_grad():
    output = saved_model(**saved_encoding)

# The first output element holds the class scores for the single sample
predicted_label = output[0].argmax(dim=-1).item()
print("Predicted label:", newsgroups.target_names[predicted_label])


Sample text: erniecraycom ernest smith subject aftermarket ac units originator ernieferris lines nntppostinghost ferriscraycom organization cray research inc distribution usa article qcaueinnmtaxoncsuncedu andrew brandt writes looked getting ac installed honda crx si unit plus shipping installation like hours top hunk change anyone know place aftermarket ac installation honda ac unit third party unit seem find anyone put third party ac unit honda carolina would prefer place nearby references would handy thx andy brandtcsuncedu les bartels comments sorry cant help question comment make concerning aftermarket ac units frostking frosttemp forget aftermarket unit cavalier quite unhappy fan noisy doesnt put much air never aftermarket ac installed vehicles cant trust quality performance experience les les bartel im going live forever let add ac installed ford garage work well ac installed factory pickups identical mine talked people result dont know problem ford ernie smith
True label: rec.a