In [1]:
import torch
from adapters import AutoAdapterModel
from transformers import AutoTokenizer

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device_kind = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_kind)

# Tokenizer and backbone are both loaded from the same pretrained checkpoint;
# the model is moved to the selected device once it has been built.
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoAdapterModel.from_pretrained(checkpoint)
model = model.to(device)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Directory containing the trained adapter that this notebook evaluates.
adapter_path = "./final_adapter"

# Load the adapter into the base model and activate it. The value returned by
# load_adapter is used as the adapter's name for the explicit activation below.
# NOTE(review): set_active=True presumably already activates the adapter, which
# would make the second call redundant -- confirm against the adapters docs
# before removing either one.
adapter_name = model.load_adapter(adapter_path, set_active=True)
model.set_active_adapters(adapter_name)

In [3]:
from transformers import TextClassificationPipeline

# Wrap the adapter-equipped model and its tokenizer in a text-classification
# pipeline that runs on the device chosen earlier.
classifier = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    device=device,
)

# Smoke test on a single question; the prediction is shown as the cell output.
sample_question = "What are the tax rates in Russia?"
classifier(sample_question)

The model 'RobertaAdapterModel' is not supported for . Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification', 'GPT2ForSequenceClassification',

[{'label': 'finance', 'score': 0.9999765157699585}]

In [4]:
from datasets import load_dataset

# Class index -> domain name for the labels stored in the evaluation set.
# (An alternative four-domain mapping was kept here previously:
#  0 finance, 1 medicine, 2 cot, 3 stackoverflow.)
id2label = dict(
    enumerate(
        [
            "finance",
            "medicine",
            "leetcode",
            "exam",
            "webgpt",
            "gpt4tools",
            "cot",
            "stackoverflow",
        ]
    )
)

# The evaluation examples live in one JSON-lines file; load_dataset exposes a
# single data file under the "train" split.
eval_file = "./cache/unified_eval.jsonl"
dataset_eval = load_dataset("json", data_files=eval_file, split="train")

# Distinct label ids present in the eval set, translated to domain names.
# An id missing from id2label raises KeyError here.
labels = dataset_eval.unique("labels")
domains = list(map(id2label.__getitem__, labels))
domains

['finance',
 'medicine',
 'leetcode',
 'exam',
 'webgpt',
 'gpt4tools',
 'cot',
 'stackoverflow']

In [5]:
import pandas as pd
from datasets import tqdm as hf_tqdm

import warnings

# --- Sanity check: can the classifier emit every domain we score? -----------
# A prediction only counts as correct when its label string equals the domain
# name, so a domain whose name is absent from the classifier's label map can
# never score above 0%, whatever the model quality.
# NOTE(review): presumably the pipeline takes its label strings from
# classifier.model.config.id2label -- confirm for the adapters model class.
head_id2label = getattr(getattr(classifier.model, "config", None), "id2label", None) or {}
head_labels = {str(name).lower() for name in head_id2label.values()}
if head_labels:  # stay silent when the label map could not be read at all
    unreachable_domains = [
        id2label[label] for label in labels if id2label[label] not in head_labels
    ]
    if unreachable_domains:
        warnings.warn(
            "The classifier's label map has no entry for: "
            + ", ".join(unreachable_domains)
            + ". These domains will be reported with 0% accuracy.",
            stacklevel=2,
        )

# --- Per-domain evaluation ---------------------------------------------------
# One row per domain for the report table.
results = []
# Unrounded per-domain accuracies (in percent). The average is computed from
# these numbers rather than by parsing the formatted "xx.xx%" strings back.
domain_accuracies = []

for label in hf_tqdm(labels):
    domain = id2label[label]
    domain_dataset = dataset_eval.filter(lambda x: x['labels'] == label)
    domain_size = len(domain_dataset)

    correct_classifications = 0
    for record in hf_tqdm(domain_dataset):
        # The pipeline returns a one-element list for a single input string;
        # inputs longer than 512 tokens are truncated.
        prediction = classifier(record["text"], truncation=True, max_length=512)[0]["label"]
        if prediction.lower() == domain:
            correct_classifications += 1

    # `labels` comes from the dataset itself, so every domain should have at
    # least one example; the guard only protects against an inconsistent input.
    classifier_accuracy = (
        correct_classifications / domain_size * 100 if domain_size else 0.0
    )
    domain_accuracies.append(classifier_accuracy)

    results.append({
        "Domain": domain.upper(),
        "test size": domain_size,
        "Classifier": f"{classifier_accuracy:.2f}%",
        "Router": "N/A"  # Placeholder as Router results are not provided
    })

# --- Summary row -------------------------------------------------------------
# Unweighted mean over domains (every domain counts equally, regardless of its
# size). Skipped for an empty evaluation set, where it would divide by zero.
if results:
    average_size = sum(result["test size"] for result in results) / len(results)
    average_classifier_accuracy = sum(domain_accuracies) / len(domain_accuracies)

    results.append({
        "Domain": "Average",
        "test size": average_size,
        "Classifier": f"{average_classifier_accuracy:.2f}%",
        "Router": "N/A"  # Placeholder as Router results are not provided
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Display the table
print(results_df)

# Save to CSV
results_df.to_csv("./evaluation_results.csv", index=False)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/pasha/Documents/Repository/gpt/MoDA/moa-reproduciton/venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/pasha/Documents/Repository/gpt/MoDA

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

          Domain  test size Classifier Router
0        FINANCE     2000.0     99.95%    N/A
1       MEDICINE     2000.0     99.90%    N/A
2       LEETCODE     2000.0      0.00%    N/A
3           EXAM     2000.0      0.00%    N/A
4         WEBGPT     2000.0      0.00%    N/A
5      GPT4TOOLS     2000.0      0.00%    N/A
6            COT     2000.0    100.00%    N/A
7  STACKOVERFLOW     2000.0    100.00%    N/A
8        Average     2000.0     49.98%    N/A
