In [1]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Get the dataset

In [3]:
tokenizer = AutoTokenizer.from_pretrained('malteos/scincl')

Downloading: 100%|██████████| 327/327 [00:00<00:00, 240kB/s]
Downloading: 100%|██████████| 596/596 [00:00<00:00, 322kB/s]
Downloading: 100%|██████████| 222k/222k [00:00<00:00, 14.9MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 106kB/s]


In [4]:
merged_df = pd.read_csv('/netscratch/abu/forc_I_dataset.csv')
baseline_data = merged_df[['title', 'abstract', 'label']]

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
sample_data = baseline_data.sample(1000)

In [6]:
# remove nan from abstracts
baseline_data['abstract'] = ["" if pd.isna(abstract) else abstract for abstract in baseline_data['abstract']]
# Get text
baseline_data['text'] = [row['title'] + tokenizer.sep_token + (row['abstract'] or '')
                    for index, row in baseline_data.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [7]:
# Encode labels with LabelEncoder
label_encoder = LabelEncoder()
labels = baseline_data['label'].to_list()
baseline_data['label'] = label_encoder.fit_transform(labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [54]:
set(labels)

{'Algebra',
 'Algebraic Geometry',
 'Analysis',
 'Animal Sciences',
 'Applied Mathematics',
 'Applied Statistics',
 'Artificial Intelligence',
 'Arts and Humanities',
 'Astrophysics and Astronomy',
 'Atmospheric Sciences',
 'Atomic, Molecular and Optical Physics',
 'Audio and Speech Processing',
 'Bioinformatics',
 'Biological and Chemical Physics',
 'Biomedical Engineering and Bioengineering',
 'Category Theory',
 'Cell Behavior',
 'Chemistry',
 'Civil and Environmental Engineering',
 'Communication Technology and New Media',
 'Complex Variables',
 'Computational Engineering',
 'Computational Geometry',
 'Computational Linguistics',
 'Computational Physics',
 'Computer Engineering',
 'Computer Science and Game Theory',
 'Computer Sciences',
 'Computer Vision and Pattern Recognition',
 'Computer and Systems Architecture',
 'Computers and Society',
 'Condensed Matter Physics',
 'Controls and Control Theory',
 'Cosmology',
 'Cosmology, Relativity, and Gravity',
 'Cryptography and Securit

In [8]:
baseline_data = baseline_data[['label', 'text']]

#### Split to train and test datasets

In [9]:
train_df, test_df = train_test_split(baseline_data, test_size=0.2)

#### Convert to Huggingface dataset

In [10]:
train_dataset = ds.dataset(pa.Table.from_pandas(train_df).to_batches())

In [11]:
train_dataset = Dataset(pa.Table.from_pandas(train_df))

In [12]:
test_dataset = ds.dataset(pa.Table.from_pandas(test_df).to_batches())

In [13]:
test_dataset = Dataset(pa.Table.from_pandas(test_df))

#### Make DatasetDict

In [14]:
dd = DatasetDict({"train":train_dataset,"test":test_dataset})

In [15]:
dd

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 47582
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 11896
    })
})

In [16]:
dd['train'][100]

{'label': 61,
 'text': 'attracting new users or business as usual? a case study of converting academic subscription-based journals to open access[SEP]this paper studies a selection of 11 norwegian journals in the humanities and social sciences and their conversion from subscription to open access, a move heavily incentivized by governmental mandates and open access policies. by investigating the journals’ visiting logs in the period 2014–2019, the study finds that a conversion to open access induces higher visiting numbers; all journals in the study had a significant increase, which can be attributed to the conversion. converting a journal had no spillover in terms of increased visits to previously published articles still behind the paywall in the same journals. visits from previously subscribing norwegian higher education institutions did not account for the increase in visits, indicating that the increase must be accounted for by visitors from other sectors. the results could be rel

#### tokenize the text in the dataset

In [17]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

In [18]:
tokenized_datasets = dd.map(tokenize_function, batched=True)

100%|██████████| 48/48 [00:24<00:00,  1.96ba/s]
100%|██████████| 12/12 [00:06<00:00,  1.98ba/s]


In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 47582
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11896
    })
})

#### Postprocessing

In [20]:
# remove unnecessary columns from dataset
tokenized_datasets = tokenized_datasets.remove_columns(["text", "__index_level_0__"])

In [21]:
# rename the label column to labels because the model expects the argument to be named as the latter
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [22]:
# set the format of the dataset to return PyTorch instrad og lists
tokenized_datasets.set_format("torch")

In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 47582
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11896
    })
})

### Training with PyTorch

In [24]:
# create DataLoader objects to iterate over batches of data when training
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)

In [25]:
# get the model
model = AutoModelForSequenceClassification.from_pretrained('malteos/scincl', num_labels=123)

Downloading: 100%|██████████| 419M/419M [00:07<00:00, 60.4MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at malteos/scincl and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# define optimizer with the learning rate and the scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [27]:
# specify the GPU to access
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

#### Training loop

In [28]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/17844 [00:00<?, ?it/s]

In [29]:
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [30]:
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 17844/17844 [1:39:16<00:00,  3.21it/s]

### Evaluation

In [42]:
metric_f1 = load_metric('f1')
metric_precision = load_metric('precision')
metric_recall = load_metric('recall')
metric_acc = load_metric('accuracy')

In [43]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [44]:
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric_f1.add_batch(predictions=predictions, references=batch["labels"])
    metric_precision.add_batch(predictions=predictions, references=batch["labels"])
    metric_recall.add_batch(predictions=predictions, references=batch["labels"])
    metric_acc.add_batch(predictions=predictions, references=batch["labels"])

In [45]:
metric_f1.compute(average='weighted')

{'f1': 0.729156362003899}

In [46]:
metric_precision.compute(average='weighted')

{'precision': 0.731032051498242}

In [47]:
metric_recall.compute(average='weighted')

{'recall': 0.7329354404841963}

In [48]:
metric_acc.compute()

{'accuracy': 0.7329354404841963}