<a href="https://colab.research.google.com/github/DwighttSchrutee/Practical_Data_Science/blob/main/438_Final_project(sarcasmDetection).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset class for sarcasm detection
class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every item is a dict with ``input_ids``, ``attention_mask`` and
    ``labels`` tensors.

    Args:
        dataframe: Table holding the text and label columns. Rows are
            addressed by position (``iloc``), so the index does not have
            to be contiguous.
        tokenizer: Object exposing ``encode_plus`` and accepting the
            keyword arguments used in ``__getitem__``.
        max_len: Length every sequence is padded or truncated to.
        text_col: Name of the column that holds the raw text.
        label_col: Name of the column that holds the class label.
    """

    def __init__(self, dataframe, tokenizer, max_len,
                 text_col='tweet', label_col='sarcastic'):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Column names are configurable so frames with a different schema
        # can be used without renaming them first.
        self.text_col = text_col
        self.label_col = label_col

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # Fetch the row once rather than once per column.
        row = self.dataframe.iloc[index]
        encoding = self.tokenizer.encode_plus(
            row[self.text_col],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            # flatten() collapses each tensor to 1-D so that items can be
            # stacked into a batch by the collator.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(row[self.label_col], dtype=torch.long),
        }

# Read the training split, keep only the text and label columns, and discard
# every row in which either of them is missing.
train_df = (
    pd.read_csv('sarcasm.csv')[['tweet', 'sarcastic']]
    .dropna(subset=['tweet', 'sarcastic'])
)

# Same treatment for the test split. Its text column is called 'text', so it
# is renamed to 'tweet' to give both frames the same column names.
test_df = (
    pd.read_csv('test.csv')[['text', 'sarcastic']]
    .dropna(subset=['text', 'sarcastic'])
    .rename(columns={'text': 'tweet'})
)

# Balance the dataset by oversampling the minority class.
# The minority is chosen by row count rather than assumed to be label 1, so
# the smaller class is always the one that gets upsampled.
negatives = train_df[train_df.sarcastic == 0]
positives = train_df[train_df.sarcastic == 1]
if len(positives) <= len(negatives):
    train_majority, train_minority = negatives, positives
else:
    train_majority, train_minority = positives, negatives

# Sampling with replacement from an empty frame cannot work; fail with a
# message that points at the data instead of at the sampler internals.
if train_minority.empty:
    raise ValueError(
        "Cannot oversample: training data must contain rows labelled both 0 and 1"
    )

# Draw minority rows with replacement until both classes have equal size.
train_minority_oversampled = resample(train_minority,
                                      replace=True,
                                      n_samples=len(train_majority),
                                      random_state=42)

# Concatenate and shuffle (fixed seed) so the classes are interleaved.
train_oversampled = pd.concat([train_majority, train_minority_oversampled])
train_oversampled = train_oversampled.sample(frac=1, random_state=42)

# Tokenizer for the same pretrained checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits as datasets and build batched loaders over them.
# NOTE(review): train_loader is not passed to the Trainer below (it receives
# train_dataset directly) and is not referenced again in this section.
train_dataset = SarcasmDataset(train_oversampled, tokenizer, max_len=100)
test_dataset = SarcasmDataset(test_df, tokenizer, max_len=100)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Pretrained BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


def _stack_features(data):
    """Collate a list of per-example tensor dicts into one batch dict."""
    return {
        key: torch.stack([example[key] for example in data])
        for key in ('input_ids', 'attention_mask', 'labels')
    }


# Fine-tuning hyperparameters.
# NOTE(review): warmup_steps=500 may be a large share of the total optimizer
# steps if the training set is small — confirm against the dataset size.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Trainer that batches the training dataset with the collator defined above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=_stack_features,
)

# Run fine-tuning
trainer.train()

# Evaluate the model on the test set.
# Training leaves the module in train mode; eval() switches layers such as
# dropout to inference behaviour so predictions are deterministic.
model.eval()

# The Trainer may have moved the model to an accelerator, while the loader
# yields CPU tensors; inputs must live on the same device as the weights.
device = next(model.parameters()).device

predictions = []
labels = []
# No gradients are needed for inference, so tracking is disabled for the
# whole loop.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        labels.extend(batch['labels'].tolist())

# Calculate Accuracy and F1 Score
accuracy = accuracy_score(labels, predictions)
f1 = f1_score(labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
