In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [None]:
!pip install datasets

In [None]:
!pip install numpy==1.26

In [None]:
#HinGe model - Best performing model (final model)
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoModelForMaskedLM
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
# Three CSV files are read from the current working directory: the racial
# train/validation splits and the THAR dataset used for the first training phase.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column, random_state=42):
    """Return a shuffled, class-balanced version of a binary-labelled frame.

    The smaller of the two classes is upsampled with replacement until it has
    as many rows as the larger one; the combined rows are then shuffled.

    Args:
        df: DataFrame whose ``label_column`` holds values castable to 0/1.
            The column is cast to ``int`` in place on this frame.
        label_column: Name of the binary label column.
        random_state: Seed used for both the upsampling draw and the final
            shuffle, so repeated runs build the same training set.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose label is
        neither 0 nor 1 are not included. If one class is absent there is
        nothing to balance against, so the present class is returned
        shuffled.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    negatives = df[df[label_column] == 0]
    positives = df[df[label_column] == 1]

    if negatives.empty or positives.empty:
        # Sampling from an empty class would raise (or silently drop all rows).
        balanced_df = negatives if positives.empty else positives
    else:
        # Pick the majority by size instead of assuming it is label 0.
        if len(positives) <= len(negatives):
            majority_class, minority_class = negatives, positives
        else:
            majority_class, minority_class = positives, negatives
        minority_class_upsampled = minority_class.sample(
            len(majority_class), replace=True, random_state=random_state
        )
        balanced_df = pd.concat([majority_class, minority_class_upsampled])

    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Balance the dataset
# Upsample the racial training split so both label values have equal row counts.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are mapped from their class names onto the same 0/1 scheme.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load the l3cube-pune/hing-bert tokenizer and the same checkpoint with a
# 2-label sequence-classification head.
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")
model = AutoModelForSequenceClassification.from_pretrained("l3cube-pune/hing-bert", num_labels=2)

# Use CUDA when available, otherwise CPU; predict_text() moves its inputs to
# this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    texts = examples['clean_text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as columns, processing rows in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
# (every label is re-cast to a Python int after the pandas -> Dataset conversion)
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Return only the model inputs and the labels, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Enhanced metrics function
def compute_metrics(pred):
    """Compute evaluation scores for the Trainer.

    Reads the gold labels from ``pred.label_ids`` and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted class.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the
        latter three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings
# Phase 1 (THAR): evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest eval F1 when training finishes.
training_args1 = TrainingArguments(
    output_dir='./results_hinge_bert',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_hinge_bert',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
# Phase 2 (racial data): identical except for a 10x smaller learning rate
# (2e-6 instead of 2e-5).
# NOTE(review): output_dir and logging_dir are the same as in phase 1, so both
# phases write their checkpoints and logs into the same directories.
training_args2 = TrainingArguments(
    output_dir='./results_hinge_bert',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_hinge_bert',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Process THAR dataset
# 80/20 split with a fixed seed. The labels were already mapped to 0/1
# integers on the full THAR frame before this point, so the split frames are
# used as-is; re-assigning the column on the split slices was a no-op that
# only risked a pandas SettingWithCopyWarning.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Tokenize in batches with the same function used for the racial data.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)

# Return only the model inputs and the labels, as torch tensors.
thar_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
thar_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# First training on THAR dataset
print("Training on THAR dataset...")
thar_trainer = Trainer(
    model=model,
    args=training_args1,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

thar_trainer.train()

# Save THAR training results
# Evaluate on the THAR hold-out split, then snapshot the phase-1 weights and
# tokenizer together in one directory.
thar_eval_results = thar_trainer.evaluate()
print(f"THAR dataset evaluation results: {thar_eval_results}")
model.save_pretrained("./hinge_bert_thar_finetuned")
tokenizer.save_pretrained("./hinge_bert_thar_finetuned")

# Second training on racial dataset
# The same model object is passed again, so phase 2 starts from the weights
# left in memory by phase 1.
print("Training on racial dataset...")
trainer = Trainer(
    model=model,  # Continue with the same model
    args=training_args2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save final model
model.save_pretrained("./hinge_bert_final_finetuned")
tokenizer.save_pretrained("./hinge_bert_final_finetuned")

# Final evaluation
# Scores the model on the racial validation split (val_dataset).
racial_eval_results = trainer.evaluate()
print(f"Racial dataset evaluation results: {racial_eval_results}")

# Test on a few examples
def predict_text(text):
    """Classify ``text`` with the module-level model and return a verdict string.

    The text is tokenized (truncated to 128 tokens), moved to the module-level
    device, and scored without gradient tracking. Class 1 maps to
    "Racial Hoaxes detected"; any other class to "Not a Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**batch).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes detected"
    return "Not a Racial Hoaxes"

# Test examples
# Hand-written Hindi and English sentences used as a smoke test; no expected
# labels are attached, the predictions are only printed.
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # You all are very good people
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # This is a normal sentence
    "These people should not be allowed in our country"
]

print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")

In [None]:
from sklearn.metrics import classification_report

# Re-score the racial validation CSV with the most recently built `trainer`
# and write the predicted labels to predictions_val.csv.
val = pd.read_csv("/content/Racial_val.csv")
val = Dataset.from_pandas(val)
val = val.map(tokenize_function, batched=True)
val_pred = trainer.predict(val)
val_preds = np.argmax(val_pred.predictions, axis=-1)
df_preds = pd.DataFrame({'Predicted_Labels': val_preds})
df_preds.to_csv('predictions_val.csv', index=False)
# classification_report expects (y_true, y_pred). The gold labels come from the
# CSV's `labels` column; passing the predictions first would swap the
# reported precision and recall.
true_labels = [int(label) for label in val['labels']]
report = classification_report(true_labels, val_preds)
print(report)

In [None]:
#cjvt/roberta-en-hi-codemixed model
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
# Same three CSV files as the other model cells, re-read so this cell starts
# from unmodified frames.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column, random_state=42):
    """Return a shuffled, class-balanced version of a binary-labelled frame.

    The smaller of the two classes is upsampled with replacement until it has
    as many rows as the larger one; the combined rows are then shuffled.

    Args:
        df: DataFrame whose ``label_column`` holds values castable to 0/1.
            The column is cast to ``int`` in place on this frame.
        label_column: Name of the binary label column.
        random_state: Seed used for both the upsampling draw and the final
            shuffle, so repeated runs build the same training set.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose label is
        neither 0 nor 1 are not included. If one class is absent there is
        nothing to balance against, so the present class is returned
        shuffled.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    negatives = df[df[label_column] == 0]
    positives = df[df[label_column] == 1]

    if negatives.empty or positives.empty:
        # Sampling from an empty class would raise (or silently drop all rows).
        balanced_df = negatives if positives.empty else positives
    else:
        # Pick the majority by size instead of assuming it is label 0.
        if len(positives) <= len(negatives):
            majority_class, minority_class = negatives, positives
        else:
            majority_class, minority_class = positives, negatives
        minority_class_upsampled = minority_class.sample(
            len(majority_class), replace=True, random_state=random_state
        )
        balanced_df = pd.concat([majority_class, minority_class_upsampled])

    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Balance the dataset
# Upsample the racial training split so both label values have equal row counts.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are mapped from their class names onto the same 0/1 scheme.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load model and tokenizer for code-mixed English-Hindi RoBERTa
# (the checkpoint is loaded with a 2-label sequence-classification head)
tokenizer = AutoTokenizer.from_pretrained("cjvt/roberta-en-hi-codemixed")
model = AutoModelForSequenceClassification.from_pretrained("cjvt/roberta-en-hi-codemixed", num_labels=2)

# Use CUDA when available, otherwise CPU; predict_text() moves its inputs to
# this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    tokenizer_kwargs = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples['clean_text'], **tokenizer_kwargs)

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as columns, processing rows in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
# (every label is re-cast to a Python int after the pandas -> Dataset conversion)
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Return only the model inputs and the labels, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Enhanced metrics function
def compute_metrics(pred):
    """Compute evaluation scores for the Trainer.

    Reads the gold labels from ``pred.label_ids`` and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted class.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the
        latter three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings
# Phase 1 (THAR): evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest eval F1 when training finishes.
training_args1 = TrainingArguments(
    output_dir='./results_roberta_codemixed',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_roberta_codemixed',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
# Phase 2 (racial data): identical except for a 10x smaller learning rate
# (2e-6 instead of 2e-5).
# NOTE(review): output_dir and logging_dir are the same as in phase 1, so both
# phases write their checkpoints and logs into the same directories.
training_args2 = TrainingArguments(
    output_dir='./results_roberta_codemixed',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_roberta_codemixed',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Process THAR dataset
# 80/20 split with a fixed seed; labels were already mapped to integers on the
# full frame before the split.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Tokenize in batches with the same function used for the racial data.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)

# Return only the model inputs and the labels, as torch tensors.
thar_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
thar_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# First training on THAR dataset
print("Training on THAR dataset...")
thar_trainer = Trainer(
    model=model,
    args=training_args1,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

thar_trainer.train()

# Save THAR training results
# Evaluate on the THAR hold-out split, then snapshot the phase-1 weights and
# tokenizer together in one directory.
thar_eval_results = thar_trainer.evaluate()
print(f"THAR dataset evaluation results: {thar_eval_results}")
model.save_pretrained("./roberta_codemixed_thar_finetuned")
tokenizer.save_pretrained("./roberta_codemixed_thar_finetuned")

# Second training on racial dataset
# The same model object is passed again, so phase 2 starts from the weights
# left in memory by phase 1.
print("Training on racial dataset...")
trainer = Trainer(
    model=model,  # Continue with the same model
    args=training_args2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save final model
model.save_pretrained("./roberta_codemixed_final_finetuned")
tokenizer.save_pretrained("./roberta_codemixed_final_finetuned")

# Final evaluation
# Scores the model on the racial validation split (val_dataset).
racial_eval_results = trainer.evaluate()
print(f"Racial dataset evaluation results: {racial_eval_results}")

# Test on a few examples
def predict_text(text):
    """Classify ``text`` with the module-level model and return a verdict string.

    The text is tokenized (truncated to 128 tokens), moved to the module-level
    device, and scored without gradient tracking. Class 1 maps to
    "Racial Hoaxes detected"; any other class to "Not a Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**batch).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes detected"
    return "Not a Racial Hoaxes"

# Test examples
# Hand-written Hindi, English and code-mixed sentences used as a smoke test;
# no expected labels are attached, the predictions are only printed.
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # You all are very good people
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # This is a normal sentence
    "These people should not be allowed in our country",
    "This party is amazing yaar, मैं तुम्हें later call करूंगा"  # Code-mixed example
]

print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")

In [None]:
# numpy is imported here because the preceding model cell's import list does
# not include it, and this cell calls np.argmax.
import numpy as np
from sklearn.metrics import classification_report

# Re-score the racial validation CSV with the most recently built `trainer`
# and write the predicted labels to predictions_val.csv.
val = pd.read_csv("/content/Racial_val.csv")
val = Dataset.from_pandas(val)
val = val.map(tokenize_function, batched=True)
val_pred = trainer.predict(val)
val_preds = np.argmax(val_pred.predictions, axis=-1)
df_preds = pd.DataFrame({'Predicted_Labels': val_preds})
df_preds.to_csv('predictions_val.csv', index=False)
# classification_report expects (y_true, y_pred). The gold labels come from the
# CSV's `labels` column; passing the predictions first would swap the
# reported precision and recall.
true_labels = [int(label) for label in val['labels']]
report = classification_report(true_labels, val_preds)
print(report)

In [None]:
#indic bert
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
# Same three CSV files as the other model cells, re-read so this cell starts
# from unmodified frames.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column, random_state=42):
    """Return a shuffled, class-balanced version of a binary-labelled frame.

    The smaller of the two classes is upsampled with replacement until it has
    as many rows as the larger one; the combined rows are then shuffled.

    Args:
        df: DataFrame whose ``label_column`` holds values castable to 0/1.
            The column is cast to ``int`` in place on this frame.
        label_column: Name of the binary label column.
        random_state: Seed used for both the upsampling draw and the final
            shuffle, so repeated runs build the same training set.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose label is
        neither 0 nor 1 are not included. If one class is absent there is
        nothing to balance against, so the present class is returned
        shuffled.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    negatives = df[df[label_column] == 0]
    positives = df[df[label_column] == 1]

    if negatives.empty or positives.empty:
        # Sampling from an empty class would raise (or silently drop all rows).
        balanced_df = negatives if positives.empty else positives
    else:
        # Pick the majority by size instead of assuming it is label 0.
        if len(positives) <= len(negatives):
            majority_class, minority_class = negatives, positives
        else:
            majority_class, minority_class = positives, negatives
        minority_class_upsampled = minority_class.sample(
            len(majority_class), replace=True, random_state=random_state
        )
        balanced_df = pd.concat([majority_class, minority_class_upsampled])

    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Balance the dataset
# Upsample the racial training split so both label values have equal row counts.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are mapped from their class names onto the same 0/1 scheme.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load tokenizer and model for Indic-BERT instead of HinGE-BERT
# (the checkpoint is loaded with a 2-label sequence-classification head)
model_name = "ai4bharat/indic-bert"  # Using Indic-BERT model which supports Hindi-English
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Use CUDA when available, otherwise CPU; predict_text() moves its inputs to
# this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    texts = examples['clean_text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as columns, processing rows in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
# (every label is re-cast to a Python int after the pandas -> Dataset conversion)
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Return only the model inputs and the labels, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Enhanced metrics function
def compute_metrics(pred):
    """Compute evaluation scores for the Trainer.

    Reads the gold labels from ``pred.label_ids`` and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted class.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the
        latter three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings
# Phase 1 (THAR): evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest eval F1 when training finishes.
training_args1 = TrainingArguments(
    output_dir='./results_indic_bert',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_indic_bert',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
# Phase 2 (racial data): identical except for a 10x smaller learning rate
# (2e-6 instead of 2e-5).
# NOTE(review): output_dir and logging_dir are the same as in phase 1, so both
# phases write their checkpoints and logs into the same directories.
training_args2 = TrainingArguments(
    output_dir='./results_indic_bert',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_indic_bert',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Process THAR dataset
# 80/20 split with a fixed seed; labels were already mapped to integers on the
# full frame before the split.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Tokenize in batches with the same function used for the racial data.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)



# Return only the model inputs and the labels, as torch tensors.
thar_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
thar_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# First training on THAR dataset
print("Training on THAR dataset...")
thar_trainer = Trainer(
    model=model,
    args=training_args1,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

thar_trainer.train()

# Save THAR training results
# Evaluate on the THAR hold-out split, then snapshot the phase-1 weights and
# tokenizer together in one directory.
thar_eval_results = thar_trainer.evaluate()
print(f"THAR dataset evaluation results: {thar_eval_results}")
model.save_pretrained("./indic_bert_thar_finetuned")
tokenizer.save_pretrained("./indic_bert_thar_finetuned")

# Second training on racial dataset
# The same model object is passed again, so phase 2 starts from the weights
# left in memory by phase 1.
print("Training on racial dataset...")
trainer = Trainer(
    model=model,  # Continue with the same model
    args=training_args2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save final model
model.save_pretrained("./indic_bert_final_finetuned")
tokenizer.save_pretrained("./indic_bert_final_finetuned")

# Final evaluation
# Scores the model on the racial validation split (val_dataset).
racial_eval_results = trainer.evaluate()
print(f"Racial dataset evaluation results: {racial_eval_results}")

# Test on a few examples
def predict_text(text):
    """Classify ``text`` with the module-level model and return a verdict string.

    The text is tokenized (truncated to 128 tokens), moved to the module-level
    device, and scored without gradient tracking. Class 1 maps to
    "Racial Hoaxes detected"; any other class to "Not a Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**batch).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes detected"
    return "Not a Racial Hoaxes"

# Test examples
# Hand-written Hindi and English sentences used as a smoke test; no expected
# labels are attached, the predictions are only printed.
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # You all are very good people
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # This is a normal sentence
    "These people should not be allowed in our country"
]

print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")

In [None]:
#BAAI BGE-M3 Model LoRA fine tuning
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from peft import get_peft_model, LoraConfig, TaskType

# Load the dataset
# Same three CSV files as the other model cells, re-read so this cell starts
# from unmodified frames.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column, random_state=42):
    """Return a shuffled, class-balanced version of a binary-labelled frame.

    The smaller of the two classes is upsampled with replacement until it has
    as many rows as the larger one; the combined rows are then shuffled.

    Args:
        df: DataFrame whose ``label_column`` holds values castable to 0/1.
            The column is cast to ``int`` in place on this frame.
        label_column: Name of the binary label column.
        random_state: Seed used for both the upsampling draw and the final
            shuffle, so repeated runs build the same training set.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose label is
        neither 0 nor 1 are not included. If one class is absent there is
        nothing to balance against, so the present class is returned
        shuffled.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    negatives = df[df[label_column] == 0]
    positives = df[df[label_column] == 1]

    if negatives.empty or positives.empty:
        # Sampling from an empty class would raise (or silently drop all rows).
        balanced_df = negatives if positives.empty else positives
    else:
        # Pick the majority by size instead of assuming it is label 0.
        if len(positives) <= len(negatives):
            majority_class, minority_class = negatives, positives
        else:
            majority_class, minority_class = positives, negatives
        minority_class_upsampled = minority_class.sample(
            len(majority_class), replace=True, random_state=random_state
        )
        balanced_df = pd.concat([majority_class, minority_class_upsampled])

    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Balance the dataset
# Upsample the racial training split so both label values have equal row counts.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are mapped from their class names onto the same 0/1 scheme.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load tokenizer and model for BGE-M3
# (the checkpoint is loaded with a 2-label sequence-classification head)
model_name = "BAAI/bge-m3"  # Using BGE-M3 model which is a powerful multilingual model
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Configure LoRA parameters
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,          # Sequence classification task
    r=16,                                # Rank of the update matrices
    lora_alpha=32,                       # Alpha parameter for LoRA scaling
    lora_dropout=0.1,                    # Dropout probability for LoRA layers
    bias="none",                         # Don't train biases
    target_modules=["query", "key", "value"]  # Target attention modules
)

# Apply LoRA adapter to the model
model = get_peft_model(base_model, lora_config)

# Print model architecture to verify
# NOTE(review): the "Base model size" figure is computed from base_model after
# get_peft_model() has been applied to it; if PEFT injects the adapter layers
# into that same object, the count includes them — confirm before quoting it.
print(f"Model architecture: {model.__class__.__name__}")
print(f"Base model size: {sum(p.numel() for p in base_model.parameters()) / 1e6:.2f}M parameters")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M parameters")
trainable_params_pct = 100 * sum(p.numel() for p in model.parameters() if p.requires_grad) / sum(p.numel() for p in model.parameters())
print(f"Percentage of trainable parameters: {trainable_params_pct:.2f}%")

# Use CUDA when available, otherwise CPU; predict_text() moves its inputs to
# this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    # BGE-M3 can handle longer sequences, but keeping reasonable length for efficiency
    texts = examples['clean_text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    return encoded

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as columns, processing rows in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
# (every label is re-cast to a Python int after the pandas -> Dataset conversion)
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Set format for PyTorch - BGE-M3 might have additional tokens
# NOTE(review): required_columns is assigned but never read in the visible
# code; the column list actually used is all_columns below.
required_columns = ['input_ids', 'attention_mask', 'labels']
# Check if token_type_ids is in the tokenizer output
# Encode one sample to discover which keys this tokenizer emits, add 'labels',
# then keep only the names that exist as dataset columns.
sample_encoding = tokenizer("Sample text", return_tensors="pt")
all_columns = list(sample_encoding.keys()) + ['labels']
all_columns = [col for col in all_columns if col in train_dataset.features]

train_dataset.set_format(type='torch', columns=all_columns)
val_dataset.set_format(type='torch', columns=all_columns)

# Enhanced metrics function
def compute_metrics(pred):
    """Compute evaluation scores for the Trainer.

    Reads the gold labels from ``pred.label_ids`` and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted class.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the
        latter three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings for LoRA
# Both configs evaluate and checkpoint once per epoch and reload the checkpoint
# with the highest eval F1 when training finishes. Mixed precision is requested
# only when CUDA is present: this cell otherwise supports a CPU device, and
# unconditional fp16=True is rejected on a machine without a GPU.
# Phase 1 (THAR).
training_args = TrainingArguments(
    output_dir='./results_bge_m3_lora',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,                # Higher learning rate can be used with LoRA
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs_bge_m3_lora',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),    # Mixed precision only when a GPU is available
    gradient_accumulation_steps=2,     # Effective batch = 2 x per-device batch
)

# Phase 2 (racial data): same settings with a 10x smaller learning rate and a
# separate output directory so phase-1 checkpoints are left untouched.
training_argss = TrainingArguments(
    output_dir='./results_bge_m3_loraa',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,                # Lower rate for the second fine-tuning phase
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs_bge_m3_lora',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),    # Mixed precision only when a GPU is available
    gradient_accumulation_steps=2,     # Effective batch = 2 x per-device batch
)

# Process THAR dataset
# 80/20 split with a fixed seed; labels were already mapped to integers on the
# full frame before the split.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Tokenize in batches with the same function used for the racial data.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)

# Re-cast every label to a Python int, mirroring the racial datasets.
thar_train_dataset = thar_train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
thar_val_dataset = thar_val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Reuse the column list derived from the tokenizer's output keys.
thar_train_dataset.set_format(type='torch', columns=all_columns)
thar_val_dataset.set_format(type='torch', columns=all_columns)

# First training on THAR dataset
print("Training on THAR dataset using LoRA...")
thar_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

# Training with error handling
# Any Exception raised anywhere in the BGE-M3 pipeline (either training phase,
# evaluation or saving) is printed and triggers the smaller-model fallback in
# the except branch, which restarts both phases from scratch.
try:
    thar_trainer.train()

    # Save THAR training results
    thar_eval_results = thar_trainer.evaluate()
    print(f"THAR dataset evaluation results: {thar_eval_results}")
    model.save_pretrained("./bge_m3_thar_lora_finetuned")
    tokenizer.save_pretrained("./bge_m3_thar_lora_finetuned")

    # Second training on racial dataset
    print("Training on racial dataset using LoRA...")
    trainer = Trainer(
        model=model,  # Continue with the same model
        args=training_argss,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save final model
    model.save_pretrained("./bge_m3_final_lora_finetuned")
    tokenizer.save_pretrained("./bge_m3_final_lora_finetuned")

    # Final evaluation
    racial_eval_results = trainer.evaluate()
    print(f"Racial dataset evaluation results: {racial_eval_results}")

except Exception as e:
    print(f"Training error: {e}")
    print("\nFalling back to a smaller model version...")

    # Fallback to a smaller version if the main one fails
    # This rebinds the module-level model_name, tokenizer, base_model and
    # model, so everything after this cell section uses the fallback objects.
    # NOTE(review): the checkpoint id suggests an English-only model, while the
    # data includes Hindi text — confirm this fallback is acceptable.
    model_name = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Adjust LoRA config for smaller model (might have different architecture)
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,                          # Smaller rank for smaller model
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        target_modules=["query", "key", "value", "output.dense"]  # May need adjustment for different model
    )

    model = get_peft_model(base_model, lora_config)
    model.to(device)

    # Update training arguments for smaller model
    # Only training_args is replaced; it is used for BOTH fallback phases
    # below (training_argss is not used in this branch). fp16 and gradient
    # accumulation are not set here.
    training_args = TrainingArguments(
        output_dir='./results_bge_small_lora',
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-4,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir='./logs_bge_small_lora',
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    # Re-tokenize data
    # Uses the fallback tokenizer bound above and a shorter fixed length (128).
    def retokenize_function(examples):
        return tokenizer(examples['clean_text'], padding='max_length', truncation=True, max_length=128)

    # Reprocess datasets with new tokenizer
    # Rebuilt from the pandas frames so no columns from the previous tokenizer
    # are carried over.
    train_dataset = Dataset.from_pandas(balanced_df)
    val_dataset = Dataset.from_pandas(racial_val)
    thar_train_dataset = Dataset.from_pandas(thar_train)
    thar_val_dataset = Dataset.from_pandas(thar_val)

    train_dataset = train_dataset.map(retokenize_function, batched=True)
    val_dataset = val_dataset.map(retokenize_function, batched=True)
    thar_train_dataset = thar_train_dataset.map(retokenize_function, batched=True)
    thar_val_dataset = thar_val_dataset.map(retokenize_function, batched=True)

    # Re-cast every label to a Python int, as in the main path.
    train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    thar_train_dataset = thar_train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    thar_val_dataset = thar_val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

    # Set format for PyTorch with new tokenizer
    # The column list is re-derived because the fallback tokenizer may emit a
    # different set of keys.
    sample_encoding = tokenizer("Sample text", return_tensors="pt")
    all_columns = list(sample_encoding.keys()) + ['labels']
    all_columns = [col for col in all_columns if col in train_dataset.features]

    train_dataset.set_format(type='torch', columns=all_columns)
    val_dataset.set_format(type='torch', columns=all_columns)
    thar_train_dataset.set_format(type='torch', columns=all_columns)
    thar_val_dataset.set_format(type='torch', columns=all_columns)

    # Retry training with smaller model
    thar_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=thar_train_dataset,
        eval_dataset=thar_val_dataset,
        compute_metrics=compute_metrics,
    )

    thar_trainer.train()

    # Save results
    thar_eval_results = thar_trainer.evaluate()
    print(f"THAR dataset evaluation results (fallback model): {thar_eval_results}")
    model.save_pretrained("./bge_small_thar_lora_finetuned")
    tokenizer.save_pretrained("./bge_small_thar_lora_finetuned")

    # Second fallback phase on the racial data, continuing with the same model.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    racial_eval_results = trainer.evaluate()
    print(f"Racial dataset evaluation results (fallback model): {racial_eval_results}")
    model.save_pretrained("./bge_small_final_lora_finetuned")
    tokenizer.save_pretrained("./bge_small_final_lora_finetuned")

# Merge LoRA weights with the base model for inference
from peft import PeftModel

# Load base model and LoRA weights for inference
def get_merged_model(model_path="./bge_m3_final_lora_finetuned", base_model_name=None):
    """Load a base classifier, attach saved LoRA adapters and merge them.

    Args:
        model_path: Directory containing the saved LoRA adapter. Defaults to
            the BGE-M3 output directory; pass the fallback directory
            ("./bge_small_final_lora_finetuned") when the smaller model was
            the one that got trained.
        base_model_name: Identifier of the base model to load. When omitted,
            the module-level ``model_name`` is used, as before.

    Returns:
        The object returned by ``merge_and_unload()`` on the PEFT-wrapped
        model.
    """
    if base_model_name is None:
        base_model_name = model_name
    base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
    lora_model = PeftModel.from_pretrained(base_model, model_path)
    # Merge LoRA parameters with base model for faster inference
    merged_model = lora_model.merge_and_unload()
    return merged_model

# Inference here runs on the in-memory trained model as-is; the merged model
# from get_merged_model() is not used by this statement.
inference_model = model  # Use the trained model directly for testing
inference_model.eval()

# Test on a few examples
def predict_text(text):
    """Classify a single text with ``inference_model`` and return a label string.

    The text is tokenized (truncated to 256 tokens), moved to ``device`` and
    passed through the model without gradient tracking. Class index 1 maps to
    "Racial Hoaxes detected", anything else to "Not a Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    batch = {}
    for key, tensor in encoded.items():
        batch[key] = tensor.to(device)

    with torch.no_grad():
        scores = inference_model(**batch).logits

    predicted_class = scores.argmax(dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes detected"
    return "Not a Racial Hoaxes"

# Test examples
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # Hindi: "You all are very good people"
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # Hindi: "This is a normal sentence"
    "These people should not be allowed in our country"
]

# Smoke test: print each sample next to the label string predict_text returns.
print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")

# Performance evaluation function
def evaluate_model_performance(test_texts, ground_truth):
    """Print accuracy and per-example results for a handful of labelled texts.

    Args:
        test_texts: Texts to classify with ``predict_text``.
        ground_truth: Expected labels (1 = racial hoax, 0 = not), aligned
            with ``test_texts``.

    Returns:
        None. Everything is reported via ``print``.
    """
    predictions = []
    for text in test_texts:
        result = predict_text(text)
        # The positive label string starts with "Racial Hoaxes" (this cell's
        # predict_text returns "Racial Hoaxes detected"); the negative one
        # does not. An exact comparison with "Racial Hoaxes" never matched,
        # which silently turned every prediction into 0.
        predictions.append(1 if result.startswith("Racial Hoaxes") else 0)

    correct = sum(p == g for p, g in zip(predictions, ground_truth))
    total = len(ground_truth)
    # Avoid a ZeroDivisionError when no labelled examples are supplied.
    accuracy = correct / total if total else 0.0

    print(f"\nTest accuracy: {accuracy * 100:.2f}%")
    print("Detailed results:")
    for i, (text, pred, truth) in enumerate(zip(test_texts, predictions, ground_truth)):
        status = "✓" if pred == truth else "✗"
        print(f"{i+1}. {status} Text: \"{text}\"")
        print(f"   Predicted: {'Racial Hoaxes' if pred == 1 else 'Non-Racial Hoaxes'}, Actual: {'Racial Hoaxes' if truth == 1 else 'Non-Racial Hoaxes'}")

# Expected labels for test_texts, in the same order (1 = racial hoax, 0 = not).
ground_truth = [0, 1, 0, 1]
evaluate_model_performance(test_texts, ground_truth)

In [None]:
#BAAI BGE-M3 Model full fine tuning
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
# Read the racial train/validation splits and the THAR data; all three paths
# are relative to the current working directory.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column):
    """Return a shuffled frame in which class 1 has as many rows as class 0.

    Rows labelled 1 are drawn with replacement until their count equals the
    number of rows labelled 0. The combined frame is then shuffled and given
    a fresh 0..n-1 index. Both random steps are seeded so repeated runs build
    the same training set.

    Args:
        df: DataFrame with a binary label column. The column is cast to
            ``int`` in place on the passed frame.
        label_column: Name of the label column.

    Returns:
        A new DataFrame containing all class-0 rows plus the resampled
        class-1 rows.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    majority_class = df[df[label_column] == 0]
    minority_class = df[df[label_column] == 1]
    # Seed the draw too: previously only the final shuffle was seeded, so the
    # set of duplicated minority rows changed from run to run.
    minority_class_upsampled = minority_class.sample(
        len(majority_class), replace=True, random_state=42
    )
    balanced_df = pd.concat([majority_class, minority_class_upsampled])
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Balance the dataset
# Upsample class 1 of the racial training split to match class 0.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are strings: map "AntiReligion" -> 1 and "Non-AntiReligion" -> 0
# before the integer cast.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load tokenizer and model for BGE-M3
# Checkpoint used for both the tokenizer and the 2-class classifier below.
model_name = "BAAI/bge-m3"  # Using BGE-M3 model which is a powerful multilingual model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Print model architecture to verify
print(f"Model architecture: {model.__class__.__name__}")
# Parameter count in millions, summed over every parameter tensor.
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch with the global tokenizer.

    Every sequence is padded or truncated to exactly 256 tokens.
    """
    # 256 is a deliberate cap chosen for efficiency rather than a model limit.
    texts = examples['clean_text']
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 256}
    return tokenizer(texts, **encoding_options)

# Convert DataFrames to Hugging Face Dataset
# Wrap the balanced training frame and the validation frame as HF Datasets.
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as new columns, batch by batch.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Set format for PyTorch - BGE-M3 might have additional tokens
# NOTE(review): required_columns is assigned but not referenced anywhere in
# the visible code; all_columns below is what set_format actually receives.
required_columns = ['input_ids', 'attention_mask', 'labels']
# Discover which keys the tokenizer emits (e.g. whether token_type_ids is
# present) by encoding a dummy sentence.
sample_encoding = tokenizer("Sample text", return_tensors="pt")
all_columns = list(sample_encoding.keys()) + ['labels']
# Keep only the columns that really exist in the tokenized dataset.
all_columns = [col for col in all_columns if col in train_dataset.features]

train_dataset.set_format(type='torch', columns=all_columns)
val_dataset.set_format(type='torch', columns=all_columns)

# Enhanced metrics function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a Trainer evaluation.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
    class is the argmax over the last axis of ``predictions``. Precision,
    recall and F1 use ``average='binary'``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings
# Main configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best F1 when training ends.
training_args1 = TrainingArguments(
    output_dir='./results_bge_m3',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # Lower learning rate for large models
    per_device_train_batch_size=8,  # Smaller batch size due to model size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_bge_m3',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,  # Enable mixed precision training if available
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
)
# Same settings as training_args1 except for a 10x smaller learning rate.
# NOTE(review): no Trainer in the visible code of this cell receives
# training_args2, and it shares output_dir/logging_dir with training_args1.
training_args2 = TrainingArguments(
    output_dir='./results_bge_m3',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-6,  # 10x lower than training_args1
    per_device_train_batch_size=8,  # Smaller batch size due to model size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_bge_m3',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,  # Enable mixed precision training if available
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
)

# Process THAR dataset
# Hold out 20% of THAR for evaluation; the split is seeded and not stratified.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Same tokenization as the racial datasets.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)

# Cast every label to a plain int.
thar_train_dataset = thar_train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
thar_val_dataset = thar_val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Reuse the column list computed for the racial datasets.
thar_train_dataset.set_format(type='torch', columns=all_columns)
thar_val_dataset.set_format(type='torch', columns=all_columns)

# First training on THAR dataset
print("Training on THAR dataset...")
# Stage-1 trainer: fine-tune the classifier on THAR before the racial data.
thar_trainer = Trainer(
    model=model,
    # This cell defines training_args1/training_args2 only; the bare name
    # `training_args` is not bound here before this point, so it either
    # raised NameError or silently picked up settings left over from a
    # previously executed cell.
    args=training_args1,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

# Training with error handling
try:
    # Stage 1: fine-tune on THAR.
    thar_trainer.train()

    # Save THAR training results
    thar_eval_results = thar_trainer.evaluate()
    print(f"THAR dataset evaluation results: {thar_eval_results}")
    model.save_pretrained("./bge_m3_thar_finetuned")
    tokenizer.save_pretrained("./bge_m3_thar_finetuned")

    # Second training on racial dataset
    print("Training on racial dataset...")
    trainer = Trainer(
        model=model,  # Continue with the same model
        args=training_args1,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save final model
    model.save_pretrained("./bge_m3_final_finetuned")
    tokenizer.save_pretrained("./bge_m3_final_finetuned")

    # Final evaluation
    racial_eval_results = trainer.evaluate()
    print(f"Racial dataset evaluation results: {racial_eval_results}")

except Exception as e:
    # Deliberately broad: any failure of the large model (at either stage)
    # triggers a full retry with a smaller checkpoint.
    print(f"Training error: {e}")
    print("\nFalling back to a smaller model version...")

    # Fallback to a smaller version if the main one fails
    # NOTE(review): the model id suggests an English-only checkpoint; confirm
    # it is suitable for the Hindi samples used elsewhere in this notebook.
    model_name = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)

    # Update training arguments for smaller model
    training_args = TrainingArguments(
        output_dir='./results_bge_small',
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs_bge_small',
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    # Re-tokenize data
    def retokenize_function(examples):
        return tokenizer(examples['clean_text'], padding='max_length', truncation=True, max_length=128)

    # Reprocess datasets with new tokenizer
    train_dataset = Dataset.from_pandas(balanced_df)
    val_dataset = Dataset.from_pandas(racial_val)
    thar_train_dataset = Dataset.from_pandas(thar_train)
    thar_val_dataset = Dataset.from_pandas(thar_val)

    train_dataset = train_dataset.map(retokenize_function, batched=True)
    val_dataset = val_dataset.map(retokenize_function, batched=True)
    thar_train_dataset = thar_train_dataset.map(retokenize_function, batched=True)
    thar_val_dataset = thar_val_dataset.map(retokenize_function, batched=True)

    train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    thar_train_dataset = thar_train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
    thar_val_dataset = thar_val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

    # Set format for PyTorch with new tokenizer
    sample_encoding = tokenizer("Sample text", return_tensors="pt")
    all_columns = list(sample_encoding.keys()) + ['labels']
    all_columns = [col for col in all_columns if col in train_dataset.features]

    train_dataset.set_format(type='torch', columns=all_columns)
    val_dataset.set_format(type='torch', columns=all_columns)
    thar_train_dataset.set_format(type='torch', columns=all_columns)
    thar_val_dataset.set_format(type='torch', columns=all_columns)

    # Retry training with smaller model
    # Use the fallback-specific `training_args` built above. Passing
    # training_args1 here left that configuration unused and kept the large
    # model's batch size, fp16 flag and output directory.
    thar_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=thar_train_dataset,
        eval_dataset=thar_val_dataset,
        compute_metrics=compute_metrics,
    )

    thar_trainer.train()

    # Save results
    thar_eval_results = thar_trainer.evaluate()
    print(f"THAR dataset evaluation results (fallback model): {thar_eval_results}")
    model.save_pretrained("./bge_small_thar_finetuned")
    tokenizer.save_pretrained("./bge_small_thar_finetuned")

    # Second stage on the racial data, again with the fallback configuration.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    racial_eval_results = trainer.evaluate()
    print(f"Racial dataset evaluation results (fallback model): {racial_eval_results}")
    model.save_pretrained("./bge_small_final_finetuned")
    tokenizer.save_pretrained("./bge_small_final_finetuned")

# Test on a few examples
def predict_text(text):
    """Classify a single text with the global ``model`` and return its label.

    The text is tokenized (truncated to 256 tokens), moved to ``device`` and
    scored without gradient tracking. Class index 1 yields "Racial Hoaxes";
    any other index yields "Non-Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    batch = {}
    for key, tensor in encoded.items():
        batch[key] = tensor.to(device)

    with torch.no_grad():
        scores = model(**batch).logits

    predicted_class = scores.argmax(dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes"
    return "Non-Racial Hoaxes"

# Test examples
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # Hindi: "You all are very good people"
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # Hindi: "This is a normal sentence"
    "These people should not be allowed in our country"
]

# Smoke test: print each sample next to the label string predict_text returns.
print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")

# Performance evaluation function
def evaluate_model_performance(test_texts, ground_truth):
    """Print accuracy on a few labelled texts, then a validation-set report.

    Args:
        test_texts: Texts to classify with ``predict_text``.
        ground_truth: Expected labels (1 = racial hoax, 0 = not), aligned
            with ``test_texts``.

    Returns:
        None. Results are printed. The second half reads
        "/content/Racial_val.csv", runs the global ``trainer`` on it and
        prints a classification report.
    """
    # Imported locally because this cell imports neither numpy nor
    # classification_report before the function is called.
    import numpy as np
    from sklearn.metrics import classification_report

    predictions = [1 if predict_text(text) == "Racial Hoaxes" else 0 for text in test_texts]

    correct = sum(p == g for p, g in zip(predictions, ground_truth))
    total = len(ground_truth)
    # Avoid a ZeroDivisionError when no labelled examples are supplied.
    accuracy = correct / total if total else 0.0

    print(f"\nTest accuracy: {accuracy * 100:.2f}%")
    print("Detailed results:")
    for i, (text, pred, truth) in enumerate(zip(test_texts, predictions, ground_truth)):
        status = "✓" if pred == truth else "✗"
        print(f"{i+1}. {status} Text: \"{text}\"")
        print(f"   Predicted: {'Racial Hoaxes' if pred == 1 else 'Non-Racial Hoaxes'}, Actual: {'Racial Hoaxes' if truth == 1 else 'Non-Racial Hoaxes'}")

    val = pd.read_csv("/content/Racial_val.csv")
    val = Dataset.from_pandas(val)
    val = val.map(tokenize_function, batched=True)
    val_pred = trainer.predict(val)
    val_preds = np.argmax(val_pred.predictions, axis=-1)
    # classification_report expects (y_true, y_pred). The arguments used to be
    # passed the other way round, which swaps precision with recall and
    # reports support counts of the predictions instead of the labels.
    report = classification_report(val['labels'], val_preds)
    print(report)

# Expected labels for test_texts, in the same order (1 = racial hoax, 0 = not).
ground_truth = [0, 1, 0, 1]
evaluate_model_performance(test_texts, ground_truth)

In [None]:
# Re-run the evaluation with the test_texts / ground_truth already defined.
evaluate_model_performance(test_texts, ground_truth)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Classification report of the current `trainer` on the validation CSV.
val = pd.read_csv("/content/Racial_val.csv")
val = Dataset.from_pandas(val)
val = val.map(tokenize_function, batched=True)
val_pred = trainer.predict(val)
val_preds = np.argmax(val_pred.predictions, axis=-1)
df_preds = pd.DataFrame({'Predicted_Labels': val_preds})
# classification_report expects (y_true, y_pred). Previously the predictions
# were stored in `true_labels` and passed first, which swapped precision with
# recall and reported support counts of the predictions.
true_labels = val['labels']
report = classification_report(true_labels, df_preds['Predicted_Labels'])
print(report)

In [None]:
#Muril model
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the dataset
# Read the racial train/validation splits and the THAR data; all three paths
# are relative to the current working directory.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')
thar_dataset = pd.read_csv('THAR-Dataset.csv')

def balance_dataset(df, label_column):
    """Return a shuffled frame in which class 1 has as many rows as class 0.

    Rows labelled 1 are drawn with replacement until their count equals the
    number of rows labelled 0. The combined frame is then shuffled and given
    a fresh 0..n-1 index. Both random steps are seeded so repeated runs build
    the same training set.

    Args:
        df: DataFrame with a binary label column. The column is cast to
            ``int`` in place on the passed frame.
        label_column: Name of the label column.

    Returns:
        A new DataFrame containing all class-0 rows plus the resampled
        class-1 rows.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    majority_class = df[df[label_column] == 0]
    minority_class = df[df[label_column] == 1]
    # Seed the draw too: previously only the final shuffle was seeded, so the
    # set of duplicated minority rows changed from run to run.
    minority_class_upsampled = minority_class.sample(
        len(majority_class), replace=True, random_state=42
    )
    balanced_df = pd.concat([majority_class, minority_class_upsampled])
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Balance the dataset
# Upsample class 1 of the racial training split to match class 0.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)
# THAR labels are strings: map "AntiReligion" -> 1 and "Non-AntiReligion" -> 0
# before the integer cast.
thar_dataset['labels'] = thar_dataset['labels'].replace({"AntiReligion":1,"Non-AntiReligion":0}).astype(int)

# Load tokenizer and model
# Checkpoint used for both the tokenizer and the 2-class classifier below.
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch with the global tokenizer.

    Uses ``padding='max_length'`` with truncation and no explicit
    ``max_length``, so the length limit is whatever the tokenizer defaults to.
    """
    texts = examples['clean_text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Convert DataFrames to Hugging Face Dataset
# Wrap the balanced training frame and the validation frame as HF Datasets.
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as new columns, batch by batch.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Expose only these three columns, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# One configuration shared by both Trainers created later in this cell.
# No save_strategy / load_best_model_at_end is set here, unlike the other
# model cells in this notebook.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Hold out 20% of THAR for evaluation; the split is seeded and not stratified.
thar_train, thar_val = train_test_split(thar_dataset, test_size=0.2, random_state=42)

thar_train_dataset = Dataset.from_pandas(thar_train)
thar_val_dataset = Dataset.from_pandas(thar_val)

# Same tokenization as the racial datasets.
thar_train_dataset = thar_train_dataset.map(tokenize_function, batched=True)
thar_val_dataset = thar_val_dataset.map(tokenize_function, batched=True)

# Cast every label to a plain int.
thar_train_dataset = thar_train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
thar_val_dataset = thar_val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Expose only these three columns, as torch tensors.
thar_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
thar_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

def compute_metrics(pred):
    """Return ``{'accuracy': ...}`` for a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits;
    accuracy is the fraction of predictions equal to the labels.
    """
    raw_scores, gold = pred
    predicted = torch.tensor(raw_scores).argmax(dim=-1)
    matches = predicted == torch.tensor(gold)
    return {'accuracy': matches.float().mean().item()}

# Stage 1: fine-tune on the THAR split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=thar_train_dataset,
    eval_dataset=thar_val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Stage 2: continue training the same model object on the racial data.
# `trainer` is rebound, so later cells that call trainer.predict use this one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Metrics on the racial validation split.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Run the fine-tuned model on the test data and save the predictions

In [None]:
# Keep the raw pandas frame under its own name: `df` is rebound to a
# datasets.Dataset below, and pd.concat only accepts pandas objects, so the
# final concat used to raise TypeError.
test_df = pd.read_csv("/content/Racial_test_without_labels.csv")
df = Dataset.from_pandas(test_df)
df = df.map(tokenize_function, batched=True)
predictions = trainer.predict(df)
# Predicted class = argmax over the last axis of the model outputs.
preds = np.argmax(predictions.predictions, axis=-1)
print(preds)
df_preds = pd.DataFrame({'Predicted_Labels': preds})
df_preds.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")
# Original test columns side by side with the predicted label.
final = pd.concat([test_df.reset_index(drop=True), df_preds], axis=1)
final.to_csv('predictions_final.csv', index=False)

In [None]:
from sklearn.metrics import classification_report

Check the validation error of the fine-tuned model

In [None]:
from sklearn.metrics import classification_report
# Classification report of the current `trainer` on the labelled CSV.
val = pd.read_csv("/content/Racial_test.csv")
val.drop(['ID'],axis=1,inplace=True)
val = Dataset.from_pandas(val)
val = val.map(tokenize_function, batched=True)
val_pred = trainer.predict(val)
val_preds = np.argmax(val_pred.predictions, axis=-1)
df_preds = pd.DataFrame({'Predicted_Labels': val_preds})
df_preds.to_csv('predictions_val.csv', index=False)
# classification_report expects (y_true, y_pred). Previously the predictions
# were stored in `true_labels` and passed first, which swapped precision with
# recall and reported support counts of the predictions.
true_labels = val['labels']
report = classification_report(true_labels, df_preds['Predicted_Labels'])
print(report)

In [None]:
!pip install transformers

In [None]:
#HinGe model
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoModelForMaskedLM
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
# Read the racial train/validation splits from the current working directory.
# This cell does not load the THAR data.
racial_train = pd.read_csv('Racial_train.csv')
racial_val = pd.read_csv('Racial_val.csv')

def balance_dataset(df, label_column):
    """Return a shuffled frame in which class 1 has as many rows as class 0.

    Rows labelled 1 are drawn with replacement until their count equals the
    number of rows labelled 0. The combined frame is then shuffled and given
    a fresh 0..n-1 index. Both random steps are seeded so repeated runs build
    the same training set.

    Args:
        df: DataFrame with a binary label column. The column is cast to
            ``int`` in place on the passed frame.
        label_column: Name of the label column.

    Returns:
        A new DataFrame containing all class-0 rows plus the resampled
        class-1 rows.
    """
    df[label_column] = df[label_column].astype(int)  # Ensure labels are integers
    majority_class = df[df[label_column] == 0]
    minority_class = df[df[label_column] == 1]
    # Seed the draw too: previously only the final shuffle was seeded, so the
    # set of duplicated minority rows changed from run to run.
    minority_class_upsampled = minority_class.sample(
        len(majority_class), replace=True, random_state=42
    )
    balanced_df = pd.concat([majority_class, minority_class_upsampled])
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Balance the dataset
# Upsample class 1 of the racial training split to match class 0.
balanced_df = balance_dataset(racial_train, 'labels')

# Convert labels to integers
racial_val['labels'] = racial_val['labels'].astype(int)

# Tokenizer and 2-class classification model from the same hing-bert checkpoint.
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")
model = AutoModelForSequenceClassification.from_pretrained("l3cube-pune/hing-bert", num_labels=2)

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch with the global tokenizer.

    Every sequence is padded or truncated to exactly 128 tokens.
    """
    texts = examples['clean_text']
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(texts, **encoding_options)

# Convert DataFrames to Hugging Face Dataset
# Wrap the balanced training frame and the validation frame as HF Datasets.
train_dataset = Dataset.from_pandas(balanced_df)
val_dataset = Dataset.from_pandas(racial_val)

# Add the tokenizer outputs as new columns, batch by batch.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure correct label format
train_dataset = train_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': [int(label) for label in examples['labels']]}, batched=True)

# Expose only these three columns, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Enhanced metrics function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a Trainer evaluation.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
    class is the argmax over the last axis of ``predictions``. Precision,
    recall and F1 use ``average='binary'``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Training arguments with improved settings
# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# best F1 when training ends.
training_args = TrainingArguments(
    output_dir='./results_hinge_bert',
    # `eval_strategy` is the spelling every other cell in this notebook uses;
    # `evaluation_strategy` is the deprecated older name for the same option.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_hinge_bert',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)


# Training on the racial dataset (the only training stage in this cell; no
# THAR pre-training happens here).
print("Training on racial dataset...")
trainer = Trainer(
    model=model,  # the hing-bert classifier loaded earlier in this cell
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save final model
model.save_pretrained("./hinge_bert_final_finetuned")
tokenizer.save_pretrained("./hinge_bert_final_finetuned")

# Final evaluation
racial_eval_results = trainer.evaluate()
print(f"Racial dataset evaluation results: {racial_eval_results}")

# Test on a few examples
def predict_text(text):
    """Classify a single text with the global ``model`` and return its label.

    The text is tokenized (truncated to 128 tokens), moved to ``device`` and
    scored without gradient tracking. Class index 1 yields "Racial Hoaxes";
    any other index yields "Non-Racial Hoaxes".
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    batch = {}
    for key, tensor in encoded.items():
        batch[key] = tensor.to(device)

    with torch.no_grad():
        scores = model(**batch).logits

    predicted_class = scores.argmax(dim=1).item()
    if predicted_class == 1:
        return "Racial Hoaxes"
    return "Non-Racial Hoaxes"

# Test examples
test_texts = [
    "आप सभी बहुत अच्छे लोग हैं",  # Hindi: "You all are very good people"
    "I hate people from that community",
    "यह एक सामान्य वाक्य है",  # Hindi: "This is a normal sentence"
    "These people should not be allowed in our country"
]

# Smoke test: print each sample next to the label string predict_text returns.
print("\nTesting the model on example texts:")
for text in test_texts:
    print(f"Text: {text}")
    print(f"Prediction: {predict_text(text)}\n")