In [1]:
import numpy as np
import json
import random
import os

import matplotlib.pyplot as plt

from collections import OrderedDict, Counter
from datasets import load_dataset, load_metric, ClassLabel
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

## Preparing dataset

In [2]:
with open('data/arxivData.json', 'r') as f:
    ds = json.load(f)

with open('data/arxiv_data_processed.json', 'w') as f:
    for data_dict in ds:
        json.dump(data_dict, f)
        f.write('\n')

In [3]:
dataset_dict = load_dataset('json', data_files='data/arxiv_data_processed.json')

dataset = dataset_dict['train'].train_test_split(test_size=0.2)

dataset_cols = dataset.column_names['test']

Using custom data configuration default-92283b1bada8c0a9


Downloading and preparing dataset json/default to /home/covariance/.cache/huggingface/datasets/json/default-92283b1bada8c0a9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/covariance/.cache/huggingface/datasets/json/default-92283b1bada8c0a9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
category_to_term = {
  'computer_science': ['cs'],
  'economics': ['econ'],
  'electrical_engineering': ['eess'],
  'mathematics': ['math'],
  'physics': ['astro-ph', 'cond-mat', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nlin', 'nucl-ex', 'nucl-th', 'physics', 'quant-ph'],
  'biology': ['q-bio'],
  'finance': ['q-fin'],
  'statistics': ['stat']
}

term_to_category = {term: cat for (cat, terms) in category_to_term.items() for term in terms}
categories = sorted(category_to_term.keys()) + ['unknown']

In [5]:
categories

['biology',
 'computer_science',
 'economics',
 'electrical_engineering',
 'finance',
 'mathematics',
 'physics',
 'statistics',
 'unknown']

In [6]:
def get_label(example):
    term = eval(example['tag'])[0]['term']
    term = term.split('.')[0]
    category = term_to_category.get(term, 'unknown')
    example['label'] = categories.index(category)
    return example

dataset = dataset.map(get_label, batched=False)

  0%|          | 0/32800 [00:00<?, ?ex/s]

  0%|          | 0/8200 [00:00<?, ?ex/s]

In [7]:
def get_dataset_by_column(dataset, tokenizer, column):
    def tokenize(examples):
        return tokenizer(examples[column], padding="max_length", truncation=True)

    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.remove_columns(dataset_cols)
    dataset = dataset.cast_column('label', ClassLabel(names=categories))
    return dataset


In [8]:
tokenizer_summary = AutoTokenizer.from_pretrained("distilbert-base-cased")
dataset_summary = get_dataset_by_column(dataset, tokenizer_summary, "summary")

tokenizer_title = AutoTokenizer.from_pretrained("distilbert-base-cased")
dataset_title = get_dataset_by_column(dataset, tokenizer_title, "title")

  0%|          | 0/33 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
dataset_summary

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 32800
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 8200
    })
})

## Fine-tuning`distilbert-base-cased`

In [10]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
def train(train_dataset, eval_dataset, compute_metrics, tokenizer):

    training_args = TrainingArguments(
        output_dir='data/test_trainer',
        evaluation_strategy="epoch",
        disable_tqdm=False,
        num_train_epochs=1,
    )
  
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=len(categories))
  
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )
    trainer.train()

    return model, trainer

In [12]:
small_train_dataset = dataset_title["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = dataset_title["test"].shuffle(seed=42).select(range(100))

In [13]:
model_title, trainer_title = train(small_train_dataset, small_eval_dataset, compute_metrics, tokenizer_title)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weigh

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.444774,0.89


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




In [14]:
trainer_title.save_model('model')

Saving model checkpoint to model
Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin
tokenizer config file saved in model/tokenizer_config.json
Special tokens file saved in model/special_tokens_map.json


### Testing on samples

In [19]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('./model', local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

loading configuration file ./model/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_":

In [22]:
def run_model(input_text: str):
    tokens = tokenizer.encode(input_text, truncation=True, padding=True, return_tensors='pt')
    preds = model(tokens)
    [probs] = preds.logits.softmax(dim=-1).tolist()

    idx = np.argsort(probs)

    for cat, prob in zip(np.array(categories)[idx], sorted(probs)):
        print(f"{cat.rjust(25)}: {prob:.3f}")

In [23]:
biology_text = """
De novo peptide sequencing aims to recover amino acid sequences of a peptide from tandem mass spectrometry (MS) data. Existing approaches for de novo analysis enumerate MS evidence for all amino acid classes during inference. It leads to over-trimming on receptive fields of MS data and restricts MS evidence associated with following undecoded amino acids. Our approach, DPST, circumvents these limitations with two key components: (1) A confidence value aggregation encoder to sketch spectrum representations according to amino-acid-based connectivity among MS; (2) A global-local fusion decoder to progressively assimilate contextualized spectrum representations with a predefined preconception of localized MS evidence and amino acid priors. Our components originate from a closed-form solution and selectively attend to informative amino-acid-aware MS representations. Through extensive empirical studies, we demonstrate the superiority of DPST, showing that it outperforms state-of-the-art approaches by a margin of 12% - 19% peptide accuracy.
"""

run_model(biology_text)

                  finance: 0.002
   electrical_engineering: 0.002
                economics: 0.002
                  unknown: 0.002
                  biology: 0.003
              mathematics: 0.004
                  physics: 0.007
               statistics: 0.068
         computer_science: 0.910


### Ooopsie.

It appears that our dataset was not balanced at all, but I did not do anything to rebalance it. That is why it would always be computer science.