In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset
import re
from transformers import AutoTokenizer

class TextDataset(Dataset):
    """Text-classification dataset read from a CSV file and tokenized on access.

    The CSV must contain a ``text`` column and, when ``train`` is true, a
    ``target`` column. Rows with missing text (and, in train mode, a missing
    target) are skipped. Each item is a dict with ``input_ids`` and
    ``attention_mask`` tensors, plus a ``label`` tensor in train mode.

    Args:
        data_dir: Path of the CSV file (passed straight to ``pandas.read_csv``).
        train: Whether labels are read from the ``target`` column and returned.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result is indexed with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Padding/truncation length handed to the tokenizer
            (defaults to the previously hard-coded 384).
    """

    def __init__(self, data_dir, train, tokenizer, max_length=384):

        self.data_dir = data_dir
        self.train = train
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.process_csv()

        print(f"✅ Dataset successfully loaded with {len(self.data)} text entries.")

    def process_csv(self):
        """Load the CSV into ``self.data`` as ``(clean_text, label_or_None)`` tuples.

        Loading is best-effort: any error is printed and the dataset is left
        unchanged (empty after construction), so callers should check
        ``len(dataset)``. The list is built completely before being assigned,
        so a failure can never leave a half-loaded dataset behind.
        """
        try:
            # In train mode a row without a label is as unusable as one without text.
            required = ['text', 'target'] if self.train else ['text']
            df = pd.read_csv(self.data_dir).dropna(subset=required)

            texts = [self.preprocess_text(str(raw)) for raw in df['text']]
            if self.train:
                # A column that contained NaN is read as float; normalise to int.
                labels = [int(value) for value in df['target']]
            else:
                labels = [None] * len(texts)

            self.data = list(zip(texts, labels))

        except Exception as e:
            print(f"Error while processing csv: {e}")

    def preprocess_text(self, text):
        """Clean text: lowercase, drop URLs and mentions, keep only ``a-z . ! ?`` and spaces.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string with single spaces between words and no
            leading/trailing whitespace.
        """
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove user mentions
        # Turn every whitespace run (newlines, tabs, ...) into one space BEFORE
        # filtering; otherwise the filter deletes the newline and glues the
        # neighbouring words together ("hello\nworld" -> "helloworld").
        text = re.sub(r'\s+', ' ', text)
        # Remove everything except a-z, '.', '!', '?' and space. This already
        # strips all non-ASCII characters, emojis included, so no separate
        # emoji pass is needed afterwards.
        text = re.sub(r"[^a-z.!? ]+", "", text)
        text = re.sub(r' +', ' ', text).strip()  # Collapse gaps left by removed tokens
        return text

    def __len__(self):
        """Return the number of loaded text entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize entry ``idx`` and return its tensors.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension squeezed away) and, in train mode, ``label`` as a
            ``torch.long`` tensor.
        """
        text, label = self.data[idx]

        # Tokenize lazily, one example at a time, padded to a fixed length.
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }
        if self.train:
            item["label"] = torch.tensor(label, dtype=torch.long)
        return item


In [None]:
import torch
import evaluate
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import random_split
import sys
import os


# Keep Weights & Biases logging off for this run.
os.environ["WANDB_DISABLED"] = "true"

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_data_dir = 'train.csv'
dataset = TextDataset(train_data_dir, True, tokenizer)

# TextDataset reports load errors by printing and staying empty, so an empty
# dataset is the signal that something went wrong.
print(f"✅ Loaded {len(dataset)} text entries from dataset.")
if len(dataset) == 0:
    raise ValueError("🚨 ERROR: Dataset is empty. Check your path and user files.")

# Train/validation split (80/20). The generator is seeded so the same rows land
# in the validation set on every run; otherwise metrics from different runs
# are not comparable.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Load model
print("🧠 Loading BERT model...")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Metric computation
_METRIC_NAMES = ("accuracy", "precision", "recall", "f1")
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average="binary"``.
    """
    logits, labels = eval_pred
    # Hand the metrics a plain NumPy array rather than a torch tensor.
    predictions = torch.argmax(torch.as_tensor(logits), dim=-1).numpy()

    results = {}
    for name in _METRIC_NAMES:
        # Accuracy takes no averaging mode; the other three are binary-averaged.
        extra = {} if name == "accuracy" else {"average": "binary"}
        # Metrics come from the cache, so they are loaded once rather than on
        # every evaluation pass.
        scores = _get_metric(name).compute(predictions=predictions, references=labels, **extra)
        results[name] = scores[name]
    return results

# Training arguments
# - output_dir is given explicitly because it is a required argument on some
#   transformers releases; "trainer_output" is intended to match the default
#   that newer releases fall back to when it is omitted.
# - report_to="none" turns off all logging integrations from the arguments
#   themselves instead of relying only on an environment variable.
# - load_best_model_at_end requires eval and save strategies to match; with no
#   metric_for_best_model set, the Trainer picks the best checkpoint by loss.
training_args = TrainingArguments(
    output_dir="trainer_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Training
print("🚀 Training started...")
trainer.train()

# Evaluation (runs on eval_dataset, i.e. the held-out validation split)
eval_results = trainer.evaluate()
print("📊 Evaluation Results:", eval_results)

# Pretty Print Results
# The Trainer prefixes every key returned by compute_metrics with "eval_".
accuracy = eval_results.get("eval_accuracy", 0.0)
precision = eval_results.get("eval_precision", 0.0)
recall = eval_results.get("eval_recall", 0.0)
f1 = eval_results.get("eval_f1", 0.0)

# These numbers come from the validation split, not from test.csv, so the
# first line is labelled "Validation" rather than "Test".
print(f"\n🎯 Validation Accuracy: {accuracy:.4f}")
print(f"🎯 Precision: {precision:.4f}")
print(f"🎯 Recall: {recall:.4f}")
print(f"🎯 F1 Score: {f1:.4f}")

In [None]:
import numpy as np
test_data_dir = 'test.csv'
test_dataset = TextDataset(test_data_dir, False, tokenizer)

# TextDataset prints load errors and stays empty instead of raising, so check
# before spending time on prediction.
if len(test_dataset) == 0:
    raise ValueError(f"No usable rows were loaded from {test_data_dir!r}.")

predictions_output = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions_output.predictions, axis=1)
df = pd.read_csv(test_data_dir)

# TextDataset skips rows whose text is missing, so there is one prediction per
# row WITH text, in file order. Pairing the predictions with the unfiltered id
# column would fail on a length mismatch as soon as one text is missing, so
# align them through a mask instead.
has_text = df['text'].notna()
if int(has_text.sum()) != len(predicted_labels):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {int(has_text.sum())} rows with text."
    )

# Every id stays in the submission; rows without text fall back to class 0.
submission_df = pd.DataFrame({'id': df['id'], 'target': 0})
submission_df.loc[has_text, 'target'] = predicted_labels
submission_df.to_csv('submission.csv', index=False)