In [1]:
import pandas as pd

# Load the prepared papers table and expose its 'description' column as 'abstract'.
papers = pd.read_csv('../Data_Preparation/papers.csv').rename(
    columns={'description': 'abstract'}
)

In [2]:
import torch

In [3]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [4]:
import re
import nltk

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, minus 'not' and 'can', which must survive preprocessing.
stop_words = set(stopwords.words('english')) - {'not', 'can'}

def preprocess_text(s: str) -> str:
    """Normalize raw text into a lowercase, punctuation-free, stop-word-filtered string.

    Steps, in order: lowercase; rewrite "'t" as " not"; drop "@name" mentions
    that are followed by whitespace; blank out every character that is not a
    word character, whitespace or '?'; remove the words found in the
    module-level ``stop_words`` set; collapse whitespace.

    Args:
        s: Text to clean. ``None`` and float NaN (the value pandas uses for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned text with tokens separated by single spaces, or ``''``
        when nothing is left.
    """
    # A missing value would otherwise fail on .lower() with AttributeError.
    if s is None or (isinstance(s, float) and s != s):  # NaN is the only float unequal to itself
        return ''
    s = str(s).lower()
    # Rewrite the contraction suffix 't as ' not' (e.g. "can't" -> "can not")
    s = re.sub(r"\'t", " not", s)
    # Remove @name mentions that are followed by whitespace
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate punctuation, then blank out everything except word characters, whitespace and '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Drop stop words ('not' and 'can' are excluded from stop_words, so they are kept)
    s = " ".join([word for word in s.split() if word not in stop_words])
    # Collapse runs of whitespace and trim both ends
    s = re.sub(r'\s+', ' ', s).strip()
    return s

[nltk_data] Downloading package stopwords to /home/kami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Clean the abstract and the title element by element with preprocess_text.
papers['preprocessed_abstract'] = papers['abstract'].map(preprocess_text)
papers['preprocessed_title'] = papers['title'].map(preprocess_text)

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier used to load the tokenizer below.
# Alternative checkpoint (commented out): MODEL_PATH = 'google-bert/bert-base-uncased'
MODEL_PATH = 'allenai/longformer-base-4096'
# Load the tokenizer that belongs to the MODEL_PATH checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Model input text: cleaned title and cleaned abstract joined by a newline.
papers['text'] = papers['preprocessed_title'] + '\n' + papers['preprocessed_abstract']

In [8]:
# Every distinct label found in the comma-separated 'subject_areas' column.
subject_areas = papers['subject_areas'].str.split(',').explode().unique()

# Lookup tables in both directions between label names and integer ids.
class2idx = {name: position for position, name in enumerate(subject_areas)}
idx2class = {position: name for name, position in class2idx.items()}

In [None]:
# Create the dataset.
# For each row in papers, tokenize the text and convert subject_areas to a multi-hot encoding.
from torch.utils.data import Dataset

class PapersDataset(Dataset):
    """Dataset that yields tokenized paper text with a multi-hot subject-area vector.

    Args:
        papers: DataFrame providing a 'text' column and a comma-separated
            'subject_areas' column.
        tokenizer: Object exposing ``encode_plus``; called once per item.
        class2idx: Mapping from subject-area name to its position in the
            label vector.
        max_length: Length each sequence is padded or truncated to.
    """

    def __init__(self, papers, tokenizer, class2idx, max_length=512):
        self.papers = papers
        self.tokenizer = tokenizer
        self.class2idx = class2idx
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.papers)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors.

        Keys: 'input_ids' and 'attention_mask' (tokenizer output flattened to
        1-D) and 'labels' (float vector holding 1 at every known subject area).
        """
        record = self.papers.iloc[idx]
        raw_text = record['text']
        areas = record['subject_areas'].split(',')

        # Tokenize to a fixed length so every item has the same shape.
        tokenized = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Multi-hot target; subject areas missing from class2idx are skipped.
        target = torch.zeros(len(self.class2idx))
        known_positions = [
            self.class2idx[area] for area in areas if area in self.class2idx
        ]
        for position in known_positions:
            target[position] = 1

        return {
            'input_ids': tokenized['input_ids'].reshape(-1),
            'attention_mask': tokenized['attention_mask'].reshape(-1),
            'labels': target,
        }
    
dataset = PapersDataset(papers, tokenizer, class2idx)

# Split the dataset into train (80%) and test (the remaining rows).
from torch.utils.data import random_split

# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [10]:
from transformers import AutoModelForSequenceClassification

# Load the MODEL_PATH checkpoint for sequence classification with one output
# per subject area; id2label / label2id store the label names in the model
# config, and problem_type marks the task as multi-label.
# NOTE(review): the saved output of this cell mentions
# BertForSequenceClassification / google-bert/bert-base-uncased, which does not
# match the current MODEL_PATH. Re-run the notebook from a fresh kernel.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(class2idx),
    id2label=idx2class,
    label2id=class2idx,
    problem_type='multi_label_classification'
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import os

# Training configuration.
# report_to="none" switches off every logging integration (Weights & Biases
# included). It replaces the deprecated WANDB_DISABLED environment variable
# and report_to=None, which transformers treats as "use the default" rather
# than "no reporting".
training_args = TrainingArguments(
    output_dir="my_multi_label_classify_model",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

# Collator that batches examples, padding them with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
import evaluate
import numpy as np

# Bundle accuracy, F1, precision and recall into one metric object so that a
# single compute() call reports all four.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# Probability cut-off: a label counts as predicted when its probability exceeds this value.
THRESHOLD = 0.5

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated through the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))``.
    ``tanh`` saturates at +/-1 instead of overflowing, so large-magnitude
    logits give exactly 0 or 1 without a NumPy overflow RuntimeWarning.

    Args:
        x: Scalar or array-like of raw logits.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x)))

def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined classification metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are raw
            logits.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened 0/1
        predictions and references.
    """
    logits, references = eval_pred

    # Logits -> probabilities -> 0/1 decisions at THRESHOLD.
    decisions = sigmoid(logits) > THRESHOLD

    # Flatten both sides to 1-D integer vectors before scoring.
    flat_predictions = decisions.astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)

    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)

Downloading builder script: 4.20kB [00:00, 3.73MB/s]
Downloading builder script: 6.79kB [00:00, 216kB/s]
Downloading builder script: 7.56kB [00:00, 11.9MB/s]
Downloading builder script: 7.38kB [00:00, 9.98MB/s]


In [15]:
# Assemble the Trainer from the model, the training arguments, the train/test
# datasets, the padding collator, the tokenizer and the metric function.
# Constructing it does not start training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
# Median character count of the combined title + abstract text.
papers['text'].str.len().median()

np.float64(1195.0)