In [1]:
!pip3 install transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 48.1 MB/s 
[?25hCollecting accelerate
  Downloading accelerate-0.5.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 5.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 513 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
 

# Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tqdm.notebook import tqdm
tqdm.pandas()
from collections import Counter
import random
import time

from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from transformers import AdamW
from transformers import get_scheduler


import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import f1_score, accuracy_score, classification_report

Setting up random number generators and seed for deterministic effects

In [3]:
SEED = 2137

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

g = torch.Generator()
g.manual_seed(SEED)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

#Importing dataset

In [4]:
train_df = pd.read_csv('drive/MyDrive/ML_projects/Articles_NLP/train_data_1.csv',index_col=0)

In [5]:
label_dict = dict(train_df.groupby('label').mean().reset_index()[['label_en', 'label']].values)

In [6]:
train_df.drop(['title', 'label'], axis=1, inplace=True)

In [7]:
train_df

Unnamed: 0,abstract,label_en
0,Predictive models allow subject-specific inf...,0
1,Rotation invariance and translation invarian...,0
2,We introduce and develop the notion of spher...,1
3,The stochastic Landau--Lifshitz--Gilbert (LL...,1
4,Let $\Omega \subset \mathbb{R}^n$ be a bound...,1
...,...,...
15923,Features and applications of quasi-spherical...,2
15924,An aggregate data meta-analysis is a statist...,4
15925,Large inter-datacenter transfers are crucial...,0
15926,Polycrystalline diamond coatings have been g...,2


# Splitting the dataset to Train/Eval/Test datasets

In [8]:
train_x, test_x, train_y, test_y = train_test_split(train_df.loc[:, 'abstract'], train_df.loc[:, 'label_en'], test_size = 0.2, stratify=train_df['label_en'], random_state=SEED)

In [9]:
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, test_size=0.5, stratify=test_y, random_state=SEED)

In [10]:
print(f"""Training Samples: {len(train_x)}
Validation Samples: {len(val_x)}
Test Samples: {len(test_x)}""")

Training Samples: 12740
Validation Samples: 1593
Test Samples: 1593


Creation of HF Dataset

In [11]:
df_train = pd.DataFrame(data={'content':train_x, 'labels':train_y}).reset_index(drop=False)
df_val = pd.DataFrame(data={'content':val_x, 'labels':val_y}).reset_index(drop=False)
df_test = pd.DataFrame(data={'content':test_x, 'labels':test_y}).reset_index(drop=False)

raw_dataset = DatasetDict({
    'train':Dataset.from_pandas(df_train[['content', 'labels']]),
    'val':Dataset.from_pandas(df_train[['content', 'labels']]),
    'test':Dataset.from_pandas(df_test[['content', 'labels']])
})

# Importing Model and Tokenizer

In [12]:
checkpoint = 'distilbert-base-uncased'

model1 = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_special_tokens=True)

special_tkns = []

tokenizer.add_tokens(special_tkns)
model1.resize_token_embeddings(len(tokenizer)) 

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [13]:
def tokenize_function(example):
  return tokenizer(example['content'], truncation=True) #, padding=padding_value, max_length=input_length)

Tokenizeing the datasets

In [14]:
datasets_tokenized = raw_dataset.map(tokenize_function)
datasets_tokenized = datasets_tokenized.remove_columns(['content'])
datasets_tokenized = datasets_tokenized.with_format('torch')

  0%|          | 0/12740 [00:00<?, ?ex/s]

  0%|          | 0/12740 [00:00<?, ?ex/s]

  0%|          | 0/1593 [00:00<?, ?ex/s]

# Initializing the dataloaders

In [16]:
batch_size = 8

train_loader = DataLoader(datasets_tokenized['train'], batch_size=batch_size, collate_fn=data_collator, generator=g, worker_init_fn=seed_worker, shuffle=False)
val_loader = DataLoader(datasets_tokenized['val'], batch_size=batch_size, collate_fn=data_collator, generator=g, worker_init_fn=seed_worker, shuffle=False)
val_loader = DataLoader(datasets_tokenized['test'], batch_size=batch_size, collate_fn=data_collator, generator=g, worker_init_fn=seed_worker, shuffle=False)


# Training

## Preparing evaluation metrics

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_metric


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



## Initializing accelerator

In [18]:
accelerator = Accelerator()
train_loader, val_loader, model1 = accelerator.prepare(train_loader, val_loader, model1)

In [19]:
accelerator.device

device(type='cuda')

In [21]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model1 = model1.to(device)

## Initializing HF Trainer and Training Arguments

In [18]:
train_args = TrainingArguments("trainer_checkpoints", do_eval = True, eval_steps = 1000, evaluation_strategy='steps', per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=3, seed=SEED, save_steps=3000, save_strategy='steps')

trainer = Trainer(model1,
                  train_args,
                  train_dataset = datasets_tokenized['train'],
                  eval_dataset = datasets_tokenized['val'],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics
                  )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Training (Fine-Tuning)

In [23]:
trainer.train()

***** Running training *****
  Num examples = 12740
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4779


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1000,0.5245,0.475635,0.84113,0.836349,0.845357,0.84113
2000,0.3721,0.272456,0.91562,0.916278,0.917316,0.91562
3000,0.3204,0.216422,0.936421,0.936598,0.936968,0.936421
4000,0.2135,0.166753,0.956593,0.956441,0.957011,0.956593


***** Running Evaluation *****
  Num examples = 12740
  Batch size = 8
***** Running Evaluation *****
  Num examples = 12740
  Batch size = 8
***** Running Evaluation *****
  Num examples = 12740
  Batch size = 8
Saving model checkpoint to trainer_checkpoints/checkpoint-3000
Configuration saved in trainer_checkpoints/checkpoint-3000/config.json
Model weights saved in trainer_checkpoints/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in trainer_checkpoints/checkpoint-3000/tokenizer_config.json
Special tokens file saved in trainer_checkpoints/checkpoint-3000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 12740
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4779, training_loss=0.36513010421341047, metrics={'train_runtime': 3856.9671, 'train_samples_per_second': 9.909, 'train_steps_per_second': 1.239, 'total_flos': 3678031322755488.0, 'train_loss': 0.36513010421341047, 'epoch': 3.0})

## Saving model and tokenizer

In [24]:
model1.save_pretrained('first_model', push_to_hub=False)
tokenizer.save_pretrained('first_tokenizer', push_to_hub=False)

Configuration saved in first_model/config.json
Model weights saved in first_model/pytorch_model.bin
tokenizer config file saved in first_tokenizer/tokenizer_config.json
Special tokens file saved in first_tokenizer/special_tokens_map.json


('first_tokenizer/tokenizer_config.json',
 'first_tokenizer/special_tokens_map.json',
 'first_tokenizer/vocab.txt',
 'first_tokenizer/added_tokens.json',
 'first_tokenizer/tokenizer.json')

In [25]:
model1.save_pretrained('drive/MyDrive/ML_projects/Articles_NLP/first_model', push_to_hub=False)
tokenizer.save_pretrained('drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer', push_to_hub=False)

Configuration saved in drive/MyDrive/ML_projects/Articles_NLP/first_model/config.json
Model weights saved in drive/MyDrive/ML_projects/Articles_NLP/first_model/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/tokenizer_config.json
Special tokens file saved in drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/special_tokens_map.json


('drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/tokenizer_config.json',
 'drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/special_tokens_map.json',
 'drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/vocab.txt',
 'drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/added_tokens.json',
 'drive/MyDrive/ML_projects/Articles_NLP/first_tokenizer/tokenizer.json')

# Model evaluation

In [16]:
model1 = AutoModelForSequenceClassification.from_pretrained("drive/MyDrive/ML_projects/Articles_NLP/first_model", num_labels=6)

In [20]:
trainer.evaluate(eval_dataset = datasets_tokenized['test'])

***** Running Evaluation *****
  Num examples = 1593
  Batch size = 8


{'eval_accuracy': 0.8700564971751412,
 'eval_f1': 0.8703670105672353,
 'eval_loss': 0.5294434428215027,
 'eval_precision': 0.8730369176761515,
 'eval_recall': 0.8700564971751412,
 'eval_runtime': 44.8094,
 'eval_samples_per_second': 35.551,
 'eval_steps_per_second': 4.463}