In [None]:
!pip install sentence_transformers
!pip install nlpaug
import pandas as pd
import re
#import emoji
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
import torch
from torch.utils.data import Dataset
import nlpaug.augmenter.word as naw

# Read the training split and the two validation files from the working directory
train_data = pd.read_csv("SubTask-C-train.csv")
val_labels = pd.read_csv("SubTask-C-(index,label)val.csv")
val_tweets = pd.read_csv("SubTask-C-(index,tweet)val.csv")

# Attach each validation tweet to its label through the shared "index" column
# (default inner join, tweets on the left)
val_data = val_tweets.merge(val_labels, on="index")


# Split the training frame into one sub-frame per label value (0, 1, 2)
class0, class1, class2 = (
    train_data[train_data['label'] == label_value] for label_value in (0, 1, 2)
)

# Every class is resampled up to the size of the largest one
max_size = max(map(len, (class0, class1, class2)))

# Sample with replacement so each class ends up with exactly max_size rows;
# the same seed is used for every class, as before
class0_upsampled, class1_upsampled, class2_upsampled = (
    resample(class_frame, replace=True, n_samples=max_size, random_state=42)
    for class_frame in (class0, class1, class2)
)

# Stack the three equally sized frames in label order; row indices are kept as-is
train_data_balanced = pd.concat(
    [class0_upsampled, class1_upsampled, class2_upsampled]
)


# Tokenizer loaded from the same checkpoint name that the classification model uses
tokenizer_checkpoint = "l3cube-pune/hindi-tweets-bert-hateful"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Map-style PyTorch dataset that tokenizes one text per lookup
class HateSpeechDataset(Dataset):
    """Pair texts with integer labels and tokenize them on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The number of labels defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of tensors.

        The dict holds ``input_ids`` and ``attention_mask`` (flattened to 1-D)
        and ``labels`` (a ``torch.long`` scalar tensor).
        """
        text, label = self.texts[idx], self.labels[idx]

        # Fixed-length encoding: truncate long inputs, pad short ones to max_len
        encode_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        # return_tensors='pt' yields a leading batch axis; flatten removes it
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Wrap a data frame's 'tweet' and 'label' columns in a HateSpeechDataset
def _make_split_dataset(frame):
    return HateSpeechDataset(
        texts=frame['tweet'].tolist(),
        labels=frame['label'].tolist(),
        tokenizer=tokenizer
    )


# Training uses the class-balanced frame; validation uses the merged val frame
train_dataset = _make_split_dataset(train_data_balanced)
val_dataset = _make_split_dataset(val_data)

# Sequence-classification model with a 3-way output head
model = AutoModelForSequenceClassification.from_pretrained(
    "l3cube-pune/hindi-tweets-bert-hateful",
    num_labels=3,
)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # no WandB or other reporting integrations

    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,

    # Batching / data loading
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_pin_memory=False,

    # Evaluate and checkpoint on the same 50-step cadence
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,

    # Reload the checkpoint with the highest eval_f1 once training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)

# Metric callback handed to the Trainer. It was previously referenced without
# being defined, which raised NameError when the Trainer was constructed.
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from the Trainer's evaluation output.

    Args:
        eval_pred: Object that unpacks into ``(predictions, label_ids)``;
            predictions are expected to be per-class scores with the class
            axis last.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``. The code below and
        ``metric_for_best_model`` read them as ``eval_accuracy`` / ``eval_f1``,
        relying on the Trainer prefixing evaluation metrics with ``eval_``.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the class scores come first
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        # Three classes, so an explicit averaging mode is required; macro
        # weights every class equally.
        # NOTE(review): switch to "weighted" if the task's official metric differs.
        "f1": f1_score(labels, predictions, average="macro"),
    }


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model (on eval_dataset, i.e. the validation split)
eval_result = trainer.evaluate()

print(f"Accuracy: {eval_result['eval_accuracy']}")
print(f"F1 Score: {eval_result['eval_f1']}")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hindi-tweets-bert-hateful and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'compute_metrics' is not defined