In [1]:
!pip install --upgrade sympy



Binary class classification using pre-trained transformer Bert

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df=pd.read_csv("/content/drive/MyDrive/pstat197a/claims_clean.csv")

label_encoder=LabelEncoder()
df['bclass_encoded']=label_encoder.fit_transform(df['bclass'])

train_texts, val_texts, train_labels, val_labels=train_test_split(
    df['text_clean'].tolist(),
    df['bclass_encoded'].tolist(),
    test_size=0.2,
    random_state=42
)

In [3]:
from transformers import AutoTokenizer

MODEL_NAME="bert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)
train_texts=[str(text) for text in train_texts]
val_texts=[str(text) for text in val_texts]

train_encodings=tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings=tokenizer(val_texts, truncation=True, padding=True, max_length=512)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
import torch

class ClaimsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings=encodings
        self.labels=labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item={key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels']=torch.tensor(self.labels[idx])
        return item

train_dataset=ClaimsDataset(train_encodings, train_labels)
val_dataset=ClaimsDataset(val_encodings, val_labels)


In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

training_args=TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)


from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels=pred.label_ids
    preds=pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc=accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer=Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5313,0.552246,0.724299,0.682796,0.881944,0.557018
2,0.5341,0.451435,0.799065,0.823045,0.775194,0.877193
3,0.3533,0.510872,0.813084,0.814815,0.862745,0.77193
4,0.3252,0.44661,0.824766,0.838013,0.825532,0.850877
5,0.3303,0.495205,0.817757,0.828194,0.831858,0.824561


TrainOutput(global_step=535, training_loss=0.4166504871065371, metrics={'train_runtime': 870.9614, 'train_samples_per_second': 9.828, 'train_steps_per_second': 0.614, 'total_flos': 2252230633881600.0, 'train_loss': 0.4166504871065371, 'epoch': 5.0})

In [8]:
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save the model
model.save_pretrained("./binary_classification_model")
tokenizer.save_pretrained("./binary_classification_model")


Evaluation Results: {'eval_loss': 0.44660964608192444, 'eval_accuracy': 0.8247663551401869, 'eval_f1': 0.838012958963283, 'eval_precision': 0.825531914893617, 'eval_recall': 0.8508771929824561, 'eval_runtime': 11.7199, 'eval_samples_per_second': 36.519, 'eval_steps_per_second': 2.304, 'epoch': 5.0}


('./binary_classification_model/tokenizer_config.json',
 './binary_classification_model/special_tokens_map.json',
 './binary_classification_model/vocab.txt',
 './binary_classification_model/added_tokens.json',
 './binary_classification_model/tokenizer.json')

In [13]:
import torch

device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

test_encodings={key: val.to(device) for key, val in test_encodings.items()}

model.eval()
with torch.no_grad():
    outputs=model(**test_encodings)
    predictions=torch.argmax(outputs.logits, dim=1)

print("Predictions:", predictions)


Predictions: tensor([0, 0], device='cuda:0')


Multiclass classification

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df=pd.read_csv("/content/drive/MyDrive/pstat197a/claims_clean.csv")

label_encoder=LabelEncoder()
df["mclass_encoded"]=label_encoder.fit_transform(df["mclass"])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text_clean"].tolist(),
    df["mclass_encoded"].tolist(),
    test_size=0.2,
    random_state=42
)

In [16]:
from transformers import AutoTokenizer

MODEL_NAME="bert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)
train_texts=[str(text) for text in train_texts]
val_texts=[str(text) for text in val_texts]
train_encodings=tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings=tokenizer(val_texts, truncation=True, padding=True, max_length=512)

In [17]:
import torch

class ClaimsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings=encodings
        self.labels=labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item={key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"]=torch.tensor(self.labels[idx])
        return item

train_dataset=ClaimsDataset(train_encodings, train_labels)
val_dataset=ClaimsDataset(val_encodings, val_labels)

In [18]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

num_classes=len(label_encoder.classes_)
model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    labels=pred.label_ids
    preds=pred.predictions.argmax(-1)
    acc=accuracy_score(labels, preds)
    f1=f1_score(labels, preds, average="weighted")
    precision=precision_score(labels, preds, average="weighted")
    recall=recall_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer=Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.605394,0.792056,0.771789,0.76425,0.792056
2,No log,0.513613,0.827103,0.828077,0.830276,0.827103
3,No log,0.481131,0.831776,0.8306,0.831909,0.831776
4,No log,0.457874,0.841121,0.840097,0.839908,0.841121
5,0.572200,0.457002,0.845794,0.844719,0.845098,0.845794


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=535, training_loss=0.5574191120183356, metrics={'train_runtime': 1029.1752, 'train_samples_per_second': 8.317, 'train_steps_per_second': 0.52, 'total_flos': 2252291299491840.0, 'train_loss': 0.5574191120183356, 'epoch': 5.0})

In [19]:
eval_results=trainer.evaluate()
print("Evaluation Results:", eval_results)

model.save_pretrained("./multiclass_classification_model")
tokenizer.save_pretrained("./multiclass_classification_model")

Evaluation Results: {'eval_loss': 0.4570024311542511, 'eval_accuracy': 0.8457943925233645, 'eval_f1': 0.8447187680126114, 'eval_precision': 0.8450982468398252, 'eval_recall': 0.8457943925233645, 'eval_runtime': 13.4727, 'eval_samples_per_second': 31.768, 'eval_steps_per_second': 2.004, 'epoch': 5.0}


('./multiclass_classification_model/tokenizer_config.json',
 './multiclass_classification_model/special_tokens_map.json',
 './multiclass_classification_model/vocab.txt',
 './multiclass_classification_model/added_tokens.json',
 './multiclass_classification_model/tokenizer.json')