In [1]:
import pandas as pd
import os
import torch
from pathlib import Path
from datasets import Dataset
from transformers import pipeline

## Fill in the primary category (`prim_cat`)

In [2]:
# Raw metadata table, stored as a pipe-delimited CSV
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/meta_raw_table.csv')
df = pd.read_csv(p, sep='|')

# Titles to classify: every row whose `title` is present (non-null).
# NOTE(review): no filter on a primary-category column is applied here.
title_list = list(df.loc[df['title'].notna(), 'title'])

In [3]:
# Sanity check: number of non-null titles that will be classified
len(title_list)

16930

In [4]:
# Cache dir for Hugging Face model downloads
HF_CACHE_DIR = '/eagle/projects/argonne_tpc/siebenschuh/HF_cache'

# `transformers` is already imported in the first cell, and the library resolves its
# default cache location at import time, so exporting the variable here does not
# redirect the already-imported library. It is kept for anything that reads the
# environment later; the cache directory is also passed explicitly to the pipeline below.
os.environ['TRANSFORMERS_CACHE'] = HF_CACHE_DIR

# Device: use the GPU when one is visible, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the zero-shot classification pipeline.
# `model_kwargs` is forwarded to `from_pretrained`, so `cache_dir` makes the
# weights land in (and load from) HF_CACHE_DIR regardless of import order.
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=device,
    model_kwargs={'cache_dir': HF_CACHE_DIR},
)

# Scientific categories (candidate labels for the coarse classification)
labels = ["Biology", "Chemistry", "Physics", "Mathematics", "Computer Science", "Medicine", "Engineering", "Economics"]

# Candidate labels for the fine-grained (sub-category) classification.
# The order is kept exactly as originally listed; rows are only a layout choice.
sub_labels = [
    # life sciences, medicine and a few earth/space fields
    "Astronomy", "Biotechnology", "Cancer Biology", "Cell Biology",
    "Climatology", "Evolutionary Biology", "Genetics", "Immunology",
    "Neuroscience", "Psychiatry", "Public Health", "Bioinformatics",
    "Biochemistry", "Bioengineering", "Biophysics", "Epidemiology",
    "Cardiovascular Medicine", "Oncology", "Neurology",
    # engineering
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering",
    "Chemical Engineering", "Environmental Engineering", "Industrial Engineering",
    "Aerospace Engineering", "Materials Science and Engineering",
    "Computer Engineering", "Architecture",
    # chemistry
    "Analytical Chemistry", "Organic Chemistry", "Inorganic Chemistry",
    "Physical Chemistry", "Polymer Chemistry",
    # economics
    "Microeconomics", "Macroeconomics", "Econometrics", "Development Economics",
    "Behavioral Economics", "Financial Economics", "Labor Economics",
    "Health Economics", "International Economics", "Environmental Economics",
    # mathematics
    "Algebra", "Calculus", "Statistics", "Probability", "Geometry",
    "Topology", "Number Theory", "Mathematical Logic",
    "Discrete Mathematics", "Applied Mathematics",
    # physics
    "Classical Mechanics", "Quantum Mechanics", "Thermodynamics",
    "Electromagnetism", "Optics", "Acoustics", "Nuclear Physics",
    "Condensed Matter Physics", "Particle Physics", "Astrophysics",
]

def _top_prediction(titles, candidate_labels):
    """Return the first (top-ranked) label and score for each title.

    Args:
        titles: List of title strings to classify.
        candidate_labels: Labels the zero-shot classifier chooses between.

    Returns:
        A ``(top_labels, top_scores)`` pair of lists, one entry per title.
    """
    # Nothing to classify: skip the model call and return empty columns
    if not titles:
        return [], []

    results = classifier(titles, candidate_labels=candidate_labels, multi_label=False)

    # Some transformers releases return a bare dict instead of a one-element
    # list when only a single sequence is passed; normalise to a list.
    if isinstance(results, dict):
        results = [results]

    # Index 0 of `labels` / `scores` is taken as the best match for each title
    top_labels = [result['labels'][0] for result in results]
    top_scores = [result['scores'][0] for result in results]
    return top_labels, top_scores


def classify_batch(batch):
    """Classify a batch of titles into a main category and a sub-category.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``'title'`` entry holding the titles of the batch.

    Returns:
        Dict of four equally long lists: ``predicted_category``,
        ``category_confidence``, ``predicted_subcategory`` and
        ``subcategory_confidence``.
    """
    titles = batch['title']

    # Coarse pass over `labels`, fine-grained pass over `sub_labels`
    top_categories, category_scores = _top_prediction(titles, labels)
    top_subcategories, subcategory_scores = _top_prediction(titles, sub_labels)

    return {
        'predicted_category': top_categories,
        'category_confidence': category_scores,
        'predicted_subcategory': top_subcategories,
        'subcategory_confidence': subcategory_scores
    }

# Run the prediction: wrap the titles in a single-column dataset
dataset = Dataset.from_dict(dict(title=title_list))

# Map `classify_batch` over the dataset batch-wise; its outputs become new columns
classified_dataset = dataset.map(function=classify_batch, batched=True)

# Column name -> list of values
classified_titles = classified_dataset.to_dict()

# Prediction table (one row per title)
df_pred = pd.DataFrame(data=classified_titles)

# TODO: store `df_pred` in `/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/predicted_categories`
# (nothing is written to disk by this cell)


In [8]:
# Re-check the number of titles (same expression as the earlier cell)
len(title_list)



16930

In [12]:
# Display the prediction table (title plus top category / sub-category and their scores)
df_pred

Unnamed: 0,title,predicted_category,category_confidence,predicted_subcategory,subcategory_confidence
0,Communities in C.elegans connectome through th...,Engineering,0.274968,Civil Engineering,0.102572
1,Self-organized vortex phases and hydrodynamic ...,Biology,0.368099,Cell Biology,0.087583
2,Collective Arbitrage and the Value of Cooperation,Economics,0.472769,Macroeconomics,0.044785
3,Predicting Strategic Energy Storage Behaviors,Engineering,0.253934,Probability,0.170267
4,Optimally Coordinated Energy Management Framew...,Economics,0.356369,Probability,0.074423
...,...,...,...,...,...
95,Thought Graph: Generating Thought Process for ...,Biology,0.782852,Bioinformatics,0.061667
96,Discrete Laplacian thermostat for flocks and s...,Physics,0.254050,Applied Mathematics,0.083358
97,Improved Dynamics for the Maximum Common Subgr...,Mathematics,0.523249,Probability,0.055966
98,Evolutionary Algorithms Simulating Molecular E...,Biology,0.285143,Evolutionary Biology,0.057767


In [None]:
# Peek at the first rows of the raw metadata table
df.head()

In [6]:
# Inspect the column -> values dict produced by `classified_dataset.to_dict()`
# NOTE(review): this echoes every title and score; consider slicing before display
classified_titles

{'title': ['Communities in C.elegans connectome through the prism of non-backtracking walks',
  'Self-organized vortex phases and hydrodynamic interactions in Bos taurus sperm cells',
  'Collective Arbitrage and the Value of Cooperation',
  'Predicting Strategic Energy Storage Behaviors',
  'Optimally Coordinated Energy Management Framework for Profit Maximization Considering Dispatchable and Non-Dispatchable Energy Resources',
  'Decentralised Finance and Automated Market Making: Execution and Speculation',
  'Information Arbitrage in Bipartite Heat Engines',
  'Anisotropic body compliance facilitates robotic sidewinding in complex environments',
  'Optimal Entry and Exit with Signature in Statistical Arbitrage',
  'Shared Sequencing and Latency Competition as a Noisy Contest',
  'Jaynes Machine: The universal microstructure of deep neural networks',
  'Statistical arbitrage portfolio construction based on preference relations',
  'ZeroSwap: Data-driven Optimal Market Making in DeFi',

Unnamed: 0,title,predicted_category,category_confidence,predicted_subcategory,subcategory_confidence
0,Communities in C.elegans connectome through th...,Engineering,0.274968,Civil Engineering,0.102572
1,Self-organized vortex phases and hydrodynamic ...,Biology,0.368099,Cell Biology,0.087583
2,Collective Arbitrage and the Value of Cooperation,Economics,0.472769,Macroeconomics,0.044785
3,Predicting Strategic Energy Storage Behaviors,Engineering,0.253934,Probability,0.170267
4,Optimally Coordinated Energy Management Framew...,Economics,0.356369,Probability,0.074423
...,...,...,...,...,...
95,Thought Graph: Generating Thought Process for ...,Biology,0.782852,Bioinformatics,0.061667
96,Discrete Laplacian thermostat for flocks and s...,Physics,0.254050,Applied Mathematics,0.083358
97,Improved Dynamics for the Maximum Common Subgr...,Mathematics,0.523249,Probability,0.055966
98,Evolutionary Algorithms Simulating Molecular E...,Biology,0.285143,Evolutionary Biology,0.057767
