<a href="https://colab.research.google.com/github/123ranika/jsfinal/blob/main/Transformer_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np
import torch
!pip install datasets
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split



In [23]:
# Define the models we'll use in our ensemble
model_names = [
    "ai4bharat/indic-bert",
    "bert-base-multilingual-uncased",
    "neuralspace-reverie/indic-transformers-hi-bert",
    "xlm-roberta-base"
]


In [24]:


# Load and preprocess the datasets
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    # Ensure 'text' column exists and contains string data
    if 'text' not in df.columns:
        raise ValueError(f"'text' column not found in {file_path}")
    df['text'] = df['text'].astype(str)
    return df

try:
    train_df = pd.read_excel('/content/train.xlsx')
    val_df = pd.read_excel('/content/validation.xlsx')
    test_df = pd.read_excel('/content/Test.xlsx')
    temp_df = pd.read_excel('/content/validation.xlsx')
    val_df['labels'] = temp_df['labels']
except Exception as e:
    print(f"Error loading data: {e}")
    raise

In [25]:
# Combine train and validation for stratified split
combined_df = pd.concat([train_df, val_df], ignore_index=True)

In [26]:
# Perform stratified split after handling NaN values and ensuring consistent data type in 'labels' column
# Replace NaN values with a placeholder (e.g., -1)
combined_df['labels'] = combined_df['labels'].fillna(-1)

# Convert 'labels' column to a consistent data type (e.g., int or str)
combined_df['labels'] = combined_df['labels'].astype(str)  # Or astype(str) depending on your data


# Get the value counts of each label
label_counts = combined_df['labels'].value_counts()

# Filter out labels with only one instance
valid_labels = label_counts[label_counts > 1].index

# Filter the combined dataframe to keep only valid labels
combined_df_filtered = combined_df[combined_df['labels'].isin(valid_labels)]

# Now perform the split on the filtered dataframe
train_df, val_df = train_test_split(combined_df_filtered, test_size=0.1, stratify=combined_df_filtered['labels'], random_state=42)

In [27]:
# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

  return cls(pa.Table.from_pandas(*args, **kwargs))


In [28]:
# Helper function for computing metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [29]:
# Function to train a single model
def train_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        # Ensure 'text' is a list of strings
        texts = [str(text) for text in examples['text']]
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.split('/')[-1]}",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs_{model_name.split("/")[-1]}',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return trainer, tokenizer

In [30]:
# Train all models
trained_models = []
for model_name in model_names:
    print(f"Training model: {model_name}")
    try:
        trainer, tokenizer = train_model(model_name)
        trained_models.append((trainer, tokenizer))
    except Exception as e:
        print(f"Error training model {model_name}: {e}")
        continue

Training model: ai4bharat/indic-bert


Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

Error training model ai4bharat/indic-bert: 'text'
Training model: bert-base-multilingual-uncased


Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

Error training model bert-base-multilingual-uncased: 'text'
Training model: neuralspace-reverie/indic-transformers-hi-bert
Error training model neuralspace-reverie/indic-transformers-hi-bert: neuralspace-reverie/indic-transformers-hi-bert is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Training model: xlm-roberta-base


Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

Error training model xlm-roberta-base: 'text'


In [31]:
# Function to get predictions from a single model
def get_predictions(trainer, tokenizer, dataset):
    def tokenize_function(examples):
        # Ensure 'text' is a list of strings
        texts = [str(text) for text in examples['text']]
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    predictions = trainer.predict(tokenized_dataset)
    return predictions.predictions


In [32]:
# Get predictions from all models
all_predictions = []
for trainer, tokenizer in trained_models:
    print(f"Getting predictions from model: {trainer.model.name_or_path}")
    try:
        predictions = get_predictions(trainer, tokenizer, test_dataset)
        all_predictions.append(predictions)
    except Exception as e:
        print(f"Error getting predictions from model {trainer.model.name_or_path}: {e}")
        continue


In [35]:
# Ensemble predictions using majority voting
if all_predictions:
    ensemble_predictions = np.stack(all_predictions)

    # Apply majority voting along the model axis for each sample
    majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), 0, ensemble_predictions.argmax(axis=2))

    # Create a DataFrame with predictions
    results_df = pd.DataFrame({
        'texts': test_df['texts'],
        'labels': majority_votes
    })

    # Save the results to a new CSV file
    #results_df.to_csv('ensemble_test_results.csv', index=False)
    print("Ensemble predictions have been saved to 'ensemble_test_results.csv'")
else:
    print("No predictions were made. Check the errors above.")



No predictions were made. Check the errors above.
