In [4]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, logging

# Suppress unnecessary logs
logging.set_verbosity_error()

# Load the tokenizer and model for BERT
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Adjust num_labels based on task

# Set the model to evaluation mode
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate predictions from BERT
def get_bert_prediction(task, item):
    try:
        if task in ['sst2', 'qnli']:
            # Tasks that use a single 'sentence'
            sentence = item['sentence']
            inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task in ['rte', 'mnli', 'mnli-mm']:
            # Tasks that use 'sentence1' and 'sentence2' or 'premise' and 'hypothesis'
            sentence1 = item.get('sentence1') or item.get('premise')
            sentence2 = item.get('sentence2') or item.get('hypothesis')
            inputs = tokenizer(sentence1, sentence2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task == 'qqp':
            # Tasks that use 'question1' and 'question2'
            question1 = item['question1']
            question2 = item['question2']
            inputs = tokenizer(question1, question2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        else:
            raise ValueError(f"Unknown task: {task}")

        # Move inputs to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            prediction = logits.argmax(-1).item()

        return prediction

    except KeyError as e:
        # Print a more informative error message if a key is missing
        print(f"KeyError: {str(e)} - this key is missing in the data item: {item}")
        raise

# Load the dataset
with open('dev.json', 'r') as f:
    dataset = json.load(f)

# Generate predictions for each task and item in the dataset
predictions = {}
for task, items in dataset.items():
    task_predictions = []
    for item in items:
        prediction = get_bert_prediction(task, item)
        task_predictions.append({'idx': item['idx'], 'prediction': prediction})
    predictions[task] = task_predictions

# Save predictions to a file
with open('predictions_bert.json', 'w') as f:
    json.dump(predictions, f, indent=4)

In [6]:
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.[0m[31m
[0m

In [7]:
import json
import pandas as pd
from datasets import load_metric

# Load dev data
with open('dev.json', 'r') as f:
    dev_data = json.load(f)

# Load BERT predictions
with open('predictions_bert.json', 'r') as f:
    predictions = json.load(f)

def evaluate_task(task_name, metric, dev_data, predictions):
    labels = [item['label'] for item in dev_data[task_name]]
    preds = [item['prediction'] for item in predictions[task_name]]
    return metric.compute(predictions=preds, references=labels)

tasks = ['sst2', 'qqp', 'mnli', 'mnli-mm', 'qnli', 'rte']
task_to_metric = {
    "sst2": "accuracy",
    "qqp": "f1",
    "mnli": "accuracy",
    "mnli-mm": "accuracy",
    "qnli": "accuracy",
    "rte": "accuracy"
}

results = {}

for task in tasks:
    metric = load_metric("glue", task if task != 'mnli-mm' else 'mnli')
    result = evaluate_task(task, metric, dev_data, predictions)
    results[task] = result

if 'mnli' in results and 'mnli-mm' in results:
    combined_mnli_score = (results['mnli']['accuracy'] + results['mnli-mm']['accuracy']) / 2
    results['mnli_combined'] = {'accuracy': combined_mnli_score}

# Convert results to DataFrame
results_list = []
for task, result in results.items():
    for metric_name, value in result.items():
        results_list.append({'task': task, 'metric': metric_name, 'value': value})

df_results = pd.DataFrame(results_list)

df_results.to_csv('evaluation_results_bert.csv', index=False)

print(df_results)

  metric = load_metric("glue", task if task != 'mnli-mm' else 'mnli')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

            task    metric     value
0           sst2  accuracy  0.506757
1            qqp  accuracy  0.410256
2            qqp        f1  0.581818
3           mnli  accuracy  0.322314
4        mnli-mm  accuracy  0.277778
5           qnli  accuracy  0.500000
6            rte  accuracy  0.567901
7  mnli_combined  accuracy  0.300046


In [10]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, logging

# Load the tokenizer and model for ROBERTA
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Adjust num_labels as needed

# Set the model to evaluation mode
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_roberta_prediction(task, item):
    try:
        if task in ['sst2', 'qnli']:
            # Tasks that use a single 'sentence'
            sentence = item['sentence']
            inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task in ['rte', 'mnli', 'mnli-mm']:
            # Tasks that use 'sentence1' and 'sentence2' or 'premise' and 'hypothesis'
            sentence1 = item.get('sentence1') or item.get('premise')
            sentence2 = item.get('sentence2') or item.get('hypothesis')
            inputs = tokenizer(sentence1, sentence2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task == 'qqp':
            # Tasks that use 'question1' and 'question2'
            question1 = item['question1']
            question2 = item['question2']
            inputs = tokenizer(question1, question2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        else:
            raise ValueError(f"Unknown task: {task}")

        # Move inputs to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            prediction = logits.argmax(-1).item()

        return prediction

    except KeyError as e:
        # Print a more informative error message if a key is missing
        print(f"KeyError: {str(e)} - this key is missing in the data item: {item}")
        raise

In [11]:
predictions = {}
for task, items in dataset.items():
    task_predictions = []
    for item in items:
        prediction = get_roberta_prediction(task, item)
        task_predictions.append({'idx': item['idx'], 'prediction': prediction})
    predictions[task] = task_predictions

# Save predictions to a file
with open('predictions_roberta.json', 'w') as f:
    json.dump(predictions, f, indent=4)

In [12]:
import json
import pandas as pd
from datasets import load_metric

# Load dev data
with open('dev.json', 'r') as f:
    dev_data = json.load(f)

# Load BERT predictions
with open('predictions_roberta.json', 'r') as f:
    predictions = json.load(f)

def evaluate_task(task_name, metric, dev_data, predictions):
    labels = [item['label'] for item in dev_data[task_name]]
    preds = [item['prediction'] for item in predictions[task_name]]
    return metric.compute(predictions=preds, references=labels)

tasks = ['sst2', 'qqp', 'mnli', 'mnli-mm', 'qnli', 'rte']
task_to_metric = {
    "sst2": "accuracy",
    "qqp": "f1",
    "mnli": "accuracy",
    "mnli-mm": "accuracy",
    "qnli": "accuracy",
    "rte": "accuracy"
}

results = {}

for task in tasks:
    metric = load_metric("glue", task if task != 'mnli-mm' else 'mnli')
    result = evaluate_task(task, metric, dev_data, predictions)
    results[task] = result

if 'mnli' in results and 'mnli-mm' in results:
    combined_mnli_score = (results['mnli']['accuracy'] + results['mnli-mm']['accuracy']) / 2
    results['mnli_combined'] = {'accuracy': combined_mnli_score}

# Convert results to DataFrame
results_list = []
for task, result in results.items():
    for metric_name, value in result.items():
        results_list.append({'task': task, 'metric': metric_name, 'value': value})

df_results = pd.DataFrame(results_list)

df_results.to_csv('evaluation_results_roberta.csv', index=False)

print(df_results)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

            task    metric     value
0           sst2  accuracy  0.486486
1            qqp  accuracy  0.589744
2            qqp        f1  0.000000
3           mnli  accuracy  0.264463
4        mnli-mm  accuracy  0.370370
5           qnli  accuracy  0.500000
6            rte  accuracy  0.432099
7  mnli_combined  accuracy  0.317417


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification, logging


model_name = 'google/electra-small-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Adjust num_labels as needed

# Set the model to evaluation mode
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_electra_prediction(task, item):
    try:
        if task in ['sst2', 'qnli']:
            # Tasks that use a single 'sentence'
            sentence = item['sentence']
            inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task in ['rte', 'mnli', 'mnli-mm']:
            # Tasks that use 'sentence1' and 'sentence2' or 'premise' and 'hypothesis'
            sentence1 = item.get('sentence1') or item.get('premise')
            sentence2 = item.get('sentence2') or item.get('hypothesis')
            inputs = tokenizer(sentence1, sentence2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        elif task == 'qqp':
            # Tasks that use 'question1' and 'question2'
            question1 = item['question1']
            question2 = item['question2']
            inputs = tokenizer(question1, question2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        else:
            raise ValueError(f"Unknown task: {task}")

        # Move inputs to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            prediction = logits.argmax(-1).item()

        return prediction

    except KeyError as e:
        # Print a more informative error message if a key is missing
        print(f"KeyError: {str(e)} - this key is missing in the data item: {item}")
        raise


predictions = {}
for task, items in dataset.items():
    task_predictions = []
    for item in items:
        prediction = get_roberta_prediction(task, item)
        task_predictions.append({'idx': item['idx'], 'prediction': prediction})
    predictions[task] = task_predictions

# Save predictions to a file
with open('predictions_electra.json', 'w') as f:
    json.dump(predictions, f, indent=4)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]