In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from datasets import Dataset
import re
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled examples; the rest of the notebook reads the "text" and "label" columns.
raw_df = pd.read_csv("data.csv")

# Empty CSV cells are parsed as NaN (a float), which a per-row re.sub call cannot
# handle (TypeError). Drop rows missing either field and work on a copy so the
# raw frame stays untouched and this cell can be re-run safely.
df = raw_df.dropna(subset=["text", "label"]).copy()

# Normalise the text: keep only ASCII letters and spaces, then lowercase.
# NOTE(review): removed characters are deleted, not replaced by a space, so
# "don't" becomes "dont" and "a,b" becomes "ab" — same as the original regex.
df["text"] = (
    df["text"]
    .astype(str)
    .str.replace(r"[^a-zA-Z ]", "", regex=True)
    .str.lower()
)

# Split into train and test sets (fixed random_state for reproducibility).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
# Load the tokenizer that pairs with the pretrained checkpoint named below.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` key; called with ``batched=True``
            by ``Dataset.map`` elsewhere in this notebook.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Wrap the training split as a Hugging Face Dataset and tokenize it in batches.
dataset = Dataset.from_pandas(train_df)
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

# Fine-tuning: attach a classification head with one output per distinct label.
# NOTE(review): presumably labels are integer ids 0..K-1 — confirm against data.csv.
num_classes = len(set(df["label"]))
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
)


In [5]:

# Training configuration.
# Evaluation and checkpointing both happen once per epoch; the two strategies
# must match for `load_best_model_at_end` to work. With it enabled, the Trainer
# reloads the checkpoint with the highest evaluation accuracy when training
# finishes, so a later `trainer.save_model(...)` stores the best epoch rather
# than just the last one.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=32,  # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    # NOTE(review): 500 warmup steps may exceed the total number of optimizer
    # steps on a small dataset — confirm against the training set size.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)


In [None]:

# Define a function to compute metrics
def compute_metrics(pred: EvalPrediction):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (reduced with ``argmax`` over the
            last axis) and ``label_ids`` (the reference labels).

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Hold out part of the tokenized training data for per-epoch evaluation.
# Evaluating on the very examples being fitted reports how well the model has
# memorised them, not how well it generalises, so the metric would be misleading.
# A fixed seed keeps the split identical across runs.
train_val_splits = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_splits["train"],
    eval_dataset=train_val_splits["test"],  # validation slice, never trained on
    compute_metrics=compute_metrics,  # Pass the metrics function to the Trainer
)

trainer.train()


In [None]:

# Persist the fine-tuned model held by the trainer.
trainer.save_model("./best-model")

# Tokenize the held-out test split with the same preprocessing as the training data.
test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(function=preprocess_function, batched=True)

# Run inference on the test split and take the highest-scoring class per row.
results = trainer.predict(tokenized_test_dataset)
test_logits = results.predictions
predicted_labels = np.argmax(test_logits, axis=1)

# Compare predictions against the reference labels from the test frame.
accuracy = accuracy_score(test_df["label"], predicted_labels)

print(f"Test Accuracy: {accuracy}")

In [None]:
# Build the table of misclassified test rows.
# `.assign` returns a new frame, so `test_df` is left unmodified (plain
# assignment `diff_df = test_df` only aliases it, and adding a column through
# the alias mutates the test split and can trigger SettingWithCopyWarning).
# This also makes the cell safe to re-run.
diff_df = (
    test_df
    .assign(predicted=predicted_labels)
    .loc[lambda frame: frame["label"] != frame["predicted"]]
)
diff_df

In [15]:
# Write the misclassified rows to disk without the DataFrame index column.
diff_df.to_csv(path_or_buf="false_predictions.csv", index=False)