In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from datasets import Dataset
import re
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load your data
df = pd.read_csv("data.csv")  
df['text'] = df['text'].apply(lambda x: re.sub(r"[^a-zA-Z ]","",x).lower())
# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # random_state for reproducibility

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2") # Use the smallest BERT model

In [4]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,max_length=128)

dataset = Dataset.from_pandas(train_df)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=len(set(df["label"]))) # Use the smallest BERT model


Map:   0%|          | 0/3840 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=8,  # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)


In [6]:

# Define a function to compute metrics
def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets, # Using same dataset for eval in this example
    compute_metrics=compute_metrics, # Pass the metrics function to the Trainer
)

trainer.train()


  0%|          | 0/1440 [00:00<?, ?it/s]

  0%|          | 0/480 [00:00<?, ?it/s]

{'eval_loss': 0.6656414270401001, 'eval_accuracy': 0.79375, 'eval_runtime': 10.1367, 'eval_samples_per_second': 378.821, 'eval_steps_per_second': 47.353, 'epoch': 1.0}
{'loss': 0.9451, 'grad_norm': 4.368813991546631, 'learning_rate': 5e-05, 'epoch': 1.04}


  0%|          | 0/480 [00:00<?, ?it/s]

{'eval_loss': 0.4076330065727234, 'eval_accuracy': 0.8635416666666667, 'eval_runtime': 10.3033, 'eval_samples_per_second': 372.698, 'eval_steps_per_second': 46.587, 'epoch': 2.0}
{'loss': 0.5293, 'grad_norm': 34.555877685546875, 'learning_rate': 2.340425531914894e-05, 'epoch': 2.08}


  0%|          | 0/480 [00:00<?, ?it/s]

{'eval_loss': 0.3625723421573639, 'eval_accuracy': 0.8783854166666667, 'eval_runtime': 10.2204, 'eval_samples_per_second': 375.721, 'eval_steps_per_second': 46.965, 'epoch': 3.0}
{'train_runtime': 169.631, 'train_samples_per_second': 67.912, 'train_steps_per_second': 8.489, 'train_loss': 0.6354647212558322, 'epoch': 3.0}


TrainOutput(global_step=1440, training_loss=0.6354647212558322, metrics={'train_runtime': 169.631, 'train_samples_per_second': 67.912, 'train_steps_per_second': 8.489, 'total_flos': 3660143984640.0, 'train_loss': 0.6354647212558322, 'epoch': 3.0})

In [7]:

# Save the model
trainer.save_model("./fine_tuned_bert")

test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Evaluate on the test dataset
results = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(results.predictions, axis=1)
accuracy = accuracy_score(test_df["label"], predicted_labels)

print(f"Test Accuracy: {accuracy}")

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

  0%|          | 0/120 [00:00<?, ?it/s]

Test Accuracy: 0.8458333333333333


In [8]:
diff_df = test_df
diff_df['predicted'] = predicted_labels
diff_df = diff_df[diff_df['label'] != diff_df['predicted']]
diff_df

Unnamed: 0,text,label,predicted
3048,i have been a tmobile customer for many yearsc...,1,0
2908,be careful with the europe global roaming this...,1,0
8,long time customer maybe years been using vod...,0,1
2867,my experience with vodafone customer service b...,1,2
3121,unfortunately i just changed broadband and pho...,1,0
...,...,...,...
3006,up until the past couple months we were always...,1,0
111,they offered broadband and landline the broadb...,0,1
3128,i ordered a product from vodafone on saturday ...,1,0
2966,initially this wifi enabled system was ribbish...,1,0


In [15]:
diff_df.to_csv("false_predictions.csv",index=False)