In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import pandas as pd

In [None]:
import pandas as pd
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    """Sentence-pair classification dataset backed by a pandas DataFrame.

    Each item tokenizes the ``sentence1`` / ``sentence2`` cells of one row as a
    pair and stores the row's ``label`` as a ``torch.long`` tensor under the
    ``"labels"`` key.

    Args:
        dataframe: Frame providing ``sentence1``, ``sentence2`` and ``label``
            columns. Rows are addressed by position (``iloc``), so the index
            does not have to be contiguous.
        tokenizer: Callable invoked as ``tokenizer(text_a, text_b, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; every value
            of the mapping it returns must support ``.squeeze(0)``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Coerce one cell to ``str``; missing values (None/NaN/NA) become ``""``.

        Without this, a missing sentence would be handed to the tokenizer as a
        float/None instead of text and abort iteration over the dataset.
        """
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Returns:
            dict: the tokenizer's outputs with dimension 0 squeezed away, plus
            ``"labels"`` holding the row's label as a ``torch.long`` tensor.
        """
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            self._as_text(row["sentence1"]),
            self._as_text(row["sentence2"]),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" is requested above; squeeze(0) drops the leading
        # dimension of each returned tensor so items can be batched later.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(row["label"], dtype=torch.long)

        return item

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``; the predicted
            class for each row is the argmax of ``logits`` along axis 1.

    Returns:
        dict: ``accuracy``, the support-weighted ``precision`` / ``recall`` /
        ``f1`` taken from sklearn's classification report, and ``conf_matrix``
        (the confusion matrix as nested lists, in this same dict).
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=1)

    # Only the "weighted avg" row of the per-class report is surfaced.
    weighted = classification_report(y_true, y_pred, output_dict=True)["weighted avg"]

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
        # .tolist() turns the ndarray into plain nested Python lists.
        "conf_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }

In [None]:
import pandas as pd

# Load the Quora CSV files from the Kaggle input directory.
# NOTE(review): quora_test is not referenced again in the cells visible here.
quora_train = pd.read_csv("/kaggle/input/quoradataset/train.csv")
quora_test = pd.read_csv("/kaggle/input/quoradataset/test_sample.csv")

In [None]:
# Relabel the columns positionally to the names CustomDataset looks up
# ("sentence1", "sentence2", "label"); this raises if the frame does not have
# exactly three columns.
quora_train.columns = pd.Index(["sentence1", "sentence2", "label"])

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of quora_train for evaluation; the fixed random_state keeps the
# split identical across kernel restarts.
quora_train_df, quora_test_df = train_test_split(
    quora_train,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Pretrained BERT with a 2-way classification head, plus its matching tokenizer.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap both splits so each row is tokenized on access (default max_length).
train_dataset = CustomDataset(dataframe=quora_train_df, tokenizer=tokenizer)
test_dataset = CustomDataset(dataframe=quora_test_df, tokenizer=tokenizer)

In [None]:
# Fine-tuning configuration, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints and
    # reload the best one when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Logging cadence; no external reporting integrations (e.g. wandb).
    logging_steps=10,
    report_to=["none"],
)

# Wire the model, data and metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # The held-out split from train_test_split is what gets evaluated each epoch.
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; left as the final bare expression so the notebook shows
# whatever train() returns.
trainer.train()