In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import json
import pandas as pd
from tqdm import tqdm

In [None]:
import os

# Turn off the Hugging Face tokenizers' own parallelism; otherwise the library
# prints a warning when used from a Jupyter notebook.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# 8 categories - unweighted loss

In [None]:
# Checkpoint fine-tuned on the 8-category task with the unweighted loss.
model_path = "models/finetuned_scibert_scivocab_uncased_8cats"

# The classifier weights and the tokenizer are both read from the same directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

## 1. Single example

In [None]:
# A hand-written paper used to smoke-test the classifier.
title = "New insights into quantum computing hardware"
abstract = "This paper explores the scalability of qubit interactions in next-gen quantum processors..."

# Concatenate title and abstract (do this only if the model was trained on both).
text = ". ".join([title, abstract])

In [None]:
# Show the string that will be handed to the tokenizer.
text

In [None]:
# Encode the example as PyTorch tensors, truncating anything beyond 256 tokens.
inputs = tokenizer(
    text,
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
inputs

In [None]:
# Classify the single example (inference only, no gradients needed).
model.eval()  # switch the model to evaluation mode

with torch.no_grad():
    outputs = model(**inputs)

# The logits were produced without a graph, so the post-processing can live
# outside the no_grad block.
logits = outputs.logits
probabilities = torch.softmax(logits, dim=-1)
predicted_class = probabilities.argmax(dim=1).item()

In [None]:
# Inspect the softmax class probabilities computed for the single example.
probabilities

In [None]:
# Load the class-index -> label-name map stored next to the checkpoint.
# The path is derived from `model_path` so the map always belongs to the
# checkpoint that is currently loaded.
id2label_path = os.path.join(model_path, "id2label.json")

# JSON is UTF-8 by specification; be explicit instead of relying on the
# platform's default encoding.
with open(id2label_path, "r", encoding="utf-8") as f:
    # JSON object keys are always strings; the model emits integer class ids.
    id2label = {int(k): v for k, v in json.load(f).items()}

# Every class the model can output must have a label name.
assert len(id2label) == model.config.num_labels, "id2label.json does not match the model's number of classes"

id2label

In [None]:
# Translate the predicted class index into its label name and print it.
print(f"Predicted label: {id2label[predicted_class]}")

## 2. Full test data

In [None]:
# Test split for the 8-category task.
test_df = pd.read_parquet(path="data/df_test.parquet.gzip")

In [None]:
# Plain Python list holding the "text" column of the test frame.
texts = test_df.loc[:, "text"].tolist()

In [None]:
model.eval()

max_len = 256


def predict_finetuned(texts, batch_size=32):
    """Return the predicted class id for every text in `texts`.

    Uses the notebook-level `tokenizer`, `model` and `max_len`, all looked up
    when the function is called.

    Args:
        texts: A single string or a sequence of strings to classify.
        batch_size: Number of texts tokenized and forwarded together. Keeping
            this bounded avoids padding the whole input into one huge tensor.

    Returns:
        A list of integer class ids, one per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is treated as a one-element batch.
    texts = [texts] if isinstance(texts, str) else list(texts)

    predictions = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad only up to the longest text of this mini-batch.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
    return predictions

In [None]:
# Label the whole test set, one mini-batch per forward pass (with a tqdm bar).
# Going through predict_finetuned keeps the single-example variables
# (`text`, `inputs`, `predicted_class`, ...) untouched.

max_len = 256    # truncation length; predict_finetuned reads this global when called
batch_size = 32  # number of texts per forward pass

texts = test_df["text"].tolist()
finetuned_preds = []

for start in tqdm(range(0, len(texts), batch_size), desc="batches"):
    # predict_finetuned returns integer class ids; map them to label names.
    batch_ids = predict_finetuned(texts[start:start + batch_size])
    finetuned_preds.extend(id2label[class_id] for class_id in batch_ids)

assert len(finetuned_preds) == len(test_df), "expected exactly one prediction per test row"

In [None]:
# Peek at the first few predicted labels instead of printing the entire list.
finetuned_preds[:10]

In [None]:
# Build a NEW frame carrying the predictions. `assign` returns a copy, so
# `test_df` itself stays unmodified and this cell can be re-run safely.
test_df_w_preds = test_df.assign(pred=finetuned_preds)

In [None]:
# Rich-display the test frame together with its 'pred' column.
display(test_df_w_preds)

In [None]:
# Persist the predictions. The file name advertises gzip, so request that codec
# explicitly; `to_parquet` would otherwise use its default (snappy).
test_df_w_preds.to_parquet("data/df_test_pred_finetuned_8cats.parquet.gzip", compression="gzip")

# 8 categories - weighted loss

In [None]:
# Checkpoint fine-tuned on the 8-category task with the weighted loss.
model_path = "models/finetuned_scibert_scivocab_uncased_weighted_8cats"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Refresh the class-index -> label-name map for THIS checkpoint, so that the
# predictions below are not decoded with a map loaded for another model.
id2label_path = os.path.join(model_path, "id2label.json")
if os.path.exists(id2label_path):
    with open(id2label_path, "r", encoding="utf-8") as f:
        id2label = {int(k): v for k, v in json.load(f).items()}
else:
    # NOTE(review): falling back assumes both 8-category checkpoints share the
    # same label order -- confirm against the training code.
    print(f"No id2label.json found in {model_path}; keeping the previously loaded label map")

# Every class the model can output must have a label name.
assert len(id2label) == model.config.num_labels, "label map does not match the model's number of classes"

In [None]:
model.eval()

max_len = 256


def predict_finetuned(texts, batch_size=32):
    """Return the predicted class id for every text in `texts`.

    Uses the notebook-level `tokenizer`, `model` and `max_len`, all looked up
    when the function is called.

    Args:
        texts: A single string or a sequence of strings to classify.
        batch_size: Number of texts tokenized and forwarded together. Keeping
            this bounded avoids padding the whole input into one huge tensor.

    Returns:
        A list of integer class ids, one per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is treated as a one-element batch.
    texts = [texts] if isinstance(texts, str) else list(texts)

    predictions = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad only up to the longest text of this mini-batch.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
    return predictions

In [None]:
# Label the whole test set, one mini-batch per forward pass (with a tqdm bar).
# Going through predict_finetuned keeps the single-example variables
# (`text`, `inputs`, `predicted_class`, ...) untouched.

max_len = 256    # truncation length; predict_finetuned reads this global when called
batch_size = 32  # number of texts per forward pass

texts = test_df["text"].tolist()
finetuned_preds = []

for start in tqdm(range(0, len(texts), batch_size), desc="batches"):
    # predict_finetuned returns integer class ids; map them to label names.
    batch_ids = predict_finetuned(texts[start:start + batch_size])
    finetuned_preds.extend(id2label[class_id] for class_id in batch_ids)

assert len(finetuned_preds) == len(test_df), "expected exactly one prediction per test row"

In [None]:
# Build a NEW frame carrying the predictions. `assign` returns a copy (and
# replaces any existing 'pred' column in that copy), so `test_df` itself stays
# unmodified and this cell can be re-run safely.
test_df_w_preds = test_df.assign(pred=finetuned_preds)

In [None]:
# Rich-display the test frame together with its 'pred' column.
display(test_df_w_preds)

In [None]:
# Persist the predictions. The file name advertises gzip, so request that codec
# explicitly; `to_parquet` would otherwise use its default (snappy).
test_df_w_preds.to_parquet("data/df_test_pred_finetuned_weighted_8cats.parquet.gzip", compression="gzip")

# 17 categories - weighted loss

In [None]:
# Checkpoint fine-tuned on the 17-category task with the weighted loss.
model_path = "models/finetuned_scibert_scivocab_uncased_weighted_17cats"

# The classifier weights and the tokenizer are both read from the same directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Test split for the 17-category task; replaces the 8-category frame in `test_df`.
test_df = pd.read_parquet(path="data/df_test_17cats.parquet.gzip")

In [None]:
# Plain Python list holding the "text" column of the test frame.
texts = test_df.loc[:, "text"].tolist()

In [None]:
# Load the class-index -> label-name map stored next to the checkpoint.
# The path is derived from `model_path` so the map always belongs to the
# checkpoint that is currently loaded.
id2label_path = os.path.join(model_path, "id2label.json")

# JSON is UTF-8 by specification; be explicit instead of relying on the
# platform's default encoding.
with open(id2label_path, "r", encoding="utf-8") as f:
    # JSON object keys are always strings; the model emits integer class ids.
    id2label = {int(k): v for k, v in json.load(f).items()}

# Every class the model can output must have a label name.
assert len(id2label) == model.config.num_labels, "id2label.json does not match the model's number of classes"

id2label

In [None]:
model.eval()

max_len = 256


def predict_finetuned(texts, batch_size=32):
    """Return the predicted class id for every text in `texts`.

    Uses the notebook-level `tokenizer`, `model` and `max_len`, all looked up
    when the function is called.

    Args:
        texts: A single string or a sequence of strings to classify.
        batch_size: Number of texts tokenized and forwarded together. Keeping
            this bounded avoids padding the whole input into one huge tensor.

    Returns:
        A list of integer class ids, one per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is treated as a one-element batch.
    texts = [texts] if isinstance(texts, str) else list(texts)

    predictions = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad only up to the longest text of this mini-batch.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
    return predictions

In [None]:
# Label the whole test set, one mini-batch per forward pass (with a tqdm bar).
# Going through predict_finetuned keeps the single-example variables
# (`text`, `inputs`, `predicted_class`, ...) untouched.

max_len = 256    # truncation length; predict_finetuned reads this global when called
batch_size = 32  # number of texts per forward pass

texts = test_df["text"].tolist()
finetuned_preds = []

for start in tqdm(range(0, len(texts), batch_size), desc="batches"):
    # predict_finetuned returns integer class ids; map them to label names.
    batch_ids = predict_finetuned(texts[start:start + batch_size])
    finetuned_preds.extend(id2label[class_id] for class_id in batch_ids)

assert len(finetuned_preds) == len(test_df), "expected exactly one prediction per test row"

In [None]:
# Build a NEW frame carrying the predictions. `assign` returns a copy, so
# `test_df` itself stays unmodified and this cell can be re-run safely.
test_df_w_preds = test_df.assign(pred=finetuned_preds)

In [None]:
# Rich-display the test frame together with its 'pred' column.
display(test_df_w_preds)

In [None]:
# Persist the predictions. The file name advertises gzip, so request that codec
# explicitly; `to_parquet` would otherwise use its default (snappy).
test_df_w_preds.to_parquet("data/df_test_pred_finetuned_weighted_17cats.parquet.gzip", compression="gzip")