In [None]:
import json
import pandas as pd

# Load JSON data
file_path = '/kaggle/input/dataset/papers_cat.json'  # Replace with the actual file path
# JSON is UTF-8 by specification; be explicit so the platform default encoding is never used.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON data into a list of records (one per paper that has a title,
# an abstract and at least one category).
records = []
for value in data.values():
    # `or ""` also covers fields that are present but null, which `.get(key, "")` does not.
    title = (value.get("title") or "").strip()
    abstract = (value.get("abstract") or "").strip()
    # A missing, null or empty category list must not raise; it yields None and the paper is skipped.
    paper_categories = value.get("categories") or [None]
    category = paper_categories[0]  # Use the first category
    if title and abstract and category:  # Ensure no empty fields
        records.append({
            "input_text": f"{title} {abstract}",
            "category": category
        })

# Convert to a DataFrame with columns `input_text` and `category`
df = pd.DataFrame(records)
print(df.head())  # Preview the data


In [None]:


# Category table: one dict per cs.* category code, holding the code, its
# display name and the list of ACM subject names associated with it.
# The order of this table is significant to any code that enumerates it.
detailed_categories = [
    {'category_code': code, 'category_name': name, 'acm_subject_names': subjects}
    for code, name, subjects in (
        ('cs.AI', 'Artificial Intelligence', [
            'Artificial Intelligence',
            'Applications and Expert Systems',
            'Deduction and Theorem Proving',
            'Knowledge Representation Formalisms and Methods',
            'Problem Solving, Control Methods, and Search',
            'Distributed Artificial Intelligence',
        ]),
        ('cs.AR', 'Hardware Architecture', [
            'Computer Systems Organization',
            'Processor Architectures',
            'Computer System Implementation',
        ]),
        ('cs.CC', 'Computational Complexity', [
            'Computation by Abstract Devices',
            'Tradeoffs between Complexity Measures',
            'Formal Languages',
            'Numerical Algorithms and Problems',
            'Nonnumerical Algorithms and Problems',
        ]),
        ('cs.CE', 'Computational Engineering, Finance, and Science', [
            'Physical Sciences and Engineering',
            'Life and Medical Sciences',
            'Social and Behavioral Sciences',
        ]),
        ('cs.CG', 'Computational Geometry', [
            'Computational Geometry and Object Modeling',
            'Nonnumerical Algorithms and Problems',
        ]),
        ('cs.CL', 'Computation and Language', [
            'Natural Language Processing',
        ]),
        ('cs.CR', 'Cryptography and Security', [
            'Security and Protection',
            'Data Encryption',
        ]),
        ('cs.CV', 'Computer Vision and Pattern Recognition', [
            'Vision and Scene Understanding',
            'Image Processing and Computer Vision',
            'Pattern Recognition',
        ]),
        ('cs.CY', 'Computers and Society', [
            'Computing Milieux',
            'History of Computing',
            'Computers and Education',
            'Computers and Society',
            'Legal Aspects of Computing',
            'Computing Profession',
        ]),
        ('cs.DB', 'Databases', [
            'Data Storage Representations',
            'Files',
            'Information Systems',
            'Database Management',
            'Administrative Data Processing',
        ]),
        ('cs.DC', 'Distributed, Parallel, and Cluster Computing', [
            'Multiple Data Stream Architectures',
            'Parallel Architectures',
            'Distributed Systems',
            'Concurrent Programming',
            'Reliability',
            'Organization and Design',
            'Data Structures',
        ]),
        ('cs.DL', 'Digital Libraries', [
            'Online Information Services',
            'Library Automation',
            'Digital Libraries',
            'Document and Text Processing',
        ]),
        ('cs.DM', 'Discrete Mathematics', [
            'Discrete Mathematics',
            'Probability and Statistics',
        ]),
        ('cs.DS', 'Data Structures and Algorithms', [
            'Data Structures',
            'Data Storage Representations',
            'Numerical Algorithms and Problems',
            'Nonnumerical Algorithms and Problems',
        ]),
        ('cs.ET', 'Emerging Technologies', [
            'CMOS-based technologies',
            'Nanoscale Electronics',
            'Photonics',
            'Spintronics',
            'Superconductors',
            'Mechanical and Biochemical Technologies',
            'Quantum Technologies',
        ]),
        ('cs.FL', 'Formal Languages and Automata Theory', [
            'Models of Computation',
            'Formal Languages',
        ]),
        ('cs.GL', 'General Literature', [
            'Introductory and Survey',
            'References, Dictionaries, Encyclopedias, Glossaries',
        ]),
        ('cs.GR', 'Graphics', [
            'Graphics Systems',
            'Picture or Image Generation',
            'Graphics Utilities',
            'Three-Dimensional Graphics and Realism',
        ]),
        ('cs.GT', 'Computer Science and Game Theory', [
            'Mechanism Design',
            'Learning in Games',
            'Foundations of Agent Modeling in Games',
            'Coordination in Non-Cooperative Environments',
        ]),
        ('cs.HC', 'Human-Computer Interaction', [
            'User Interfaces',
            'Group and Organization Interfaces',
            'Hypertext or Hypermedia',
            'Sound and Music Computing',
        ]),
        ('cs.IR', 'Information Retrieval', [
            'Content Analysis and Indexing',
            'Information Storage',
            'Information Search and Retrieval',
            'Systems and Software',
        ]),
        ('cs.IT', 'Information Theory', [
            'Systems and Information Theory',
            'Coding and Information Theory',
        ]),
        ('cs.LG', 'Machine Learning', [
            'Supervised Learning',
            'Unsupervised Learning',
            'Reinforcement Learning',
            'Bandit Problems',
            'Robustness, Explanation, Fairness',
        ]),
        ('cs.LO', 'Logic in Computer Science', [
            'Software or Program Verification',
            'Specifying and Verifying and Reasoning About Programs',
            'Mathematical Logic and Formal Languages',
            'Grammars and Other Rewriting Systems',
            'Formal Languages',
        ]),
        ('cs.MM', 'Multimedia', [
            'Multimedia Information Systems',
        ]),
        ('cs.MS', 'Mathematical Software', [
            'Mathematical Software',
        ]),
        ('cs.NA', 'Numerical Analysis', [
            'Numerical Analysis',
        ]),
        ('cs.NE', 'Neural and Evolutionary Computing', [
            'Other Architecture Styles',
            'Learning',
            'Pattern Recognition',
        ]),
        ('cs.NI', 'Networking and Internet Architecture', [
            'Network Architecture and Design',
            'Network Protocols',
            'Network Operations',
            'Distributed Systems',
            'Local and Wide-Area Networks',
            'Internetworking',
        ]),
        ('cs.OH', 'Other Computer Science', [
            'Miscellaneous',
        ]),
        ('cs.OS', 'Operating Systems', [
            'Process Management',
            'Storage Management',
            'File Systems Management',
            'Communications Management',
            'Reliability',
            'Organization and Design',
            'Systems Programs and Utilities',
        ]),
        ('cs.PF', 'Performance', [
            'Operating Systems Performance',
            'Installation Management',
        ]),
        ('cs.PL', 'Programming Languages', [
            'Programming Techniques',
            'Programming Languages',
        ]),
        ('cs.RO', 'Robotics', [
            'Robotics',
        ]),
        ('cs.SC', 'Symbolic Computation', [
            'Symbolic and Algebraic Manipulation',
        ]),
        ('cs.SD', 'Sound', [
            'Sound and Music Computing',
            'User/Machine Systems',
            'Multimedia Information Systems',
            'User Interfaces',
            'Natural Language Processing',
            'Applications',
            'Arts and Humanities',
            'Social Issues',
        ]),
        ('cs.SE', 'Software Engineering', [
            'Requirements or Specifications',
            'Design Tools and Techniques',
            'Coding Tools and Techniques',
            'Testing and Debugging',
            'Programming Environments',
            'Distribution, Maintenance, and Enhancement',
            'Metrics',
            'Management',
            'Design',
            'Software Architectures',
            'Interoperability',
            'Reusable Software',
        ]),
        ('cs.SI', 'Social and Information Networks', [
            'Analysis of Algorithms and Program Complexity',
            'Discrete Mathematics',
            'Probability and Statistics',
            'Database Management',
            'Artificial Intelligence',
            'Information Storage and Retrieval',
            'Information Systems Applications',
            'Information Interfaces and Presentation',
            'Administrative Data Processing',
            'Physical Sciences and Engineering',
            'Life and Medical Sciences',
            'Social and Behavioral Sciences',
            'Arts and Humanities',
            'Computer Aided Engineering',
            'Computer in Other Systems',
        ]),
        ('cs.SY', 'Systems and Control', [
            'Automotive and Aerospace Control Systems',
            'Network Control',
            'Biological Systems',
            'Robotics',
            'Reinforcement Learning',
            'Sensor Networks',
            'Control of Cyber-Physical and Energy-Related Systems',
            'Control of Computing Systems',
        ]),
        ('cs.MA', 'Multiagent Systems', [
            'Distributed Artificial Intelligence',
        ]),
    )
]

In [None]:
# Build the label vocabulary from the category table; ids follow the table's order.
categories = [cat['category_code'] for cat in detailed_categories]
label2id = {label: i for i, label in enumerate(categories)}
id2label = {i: label for label, i in label2id.items()}

# Add label column to DataFrame.
# Category codes that are not in `categories` map to NaN, which would turn the
# column into floats and hand invalid targets to the classifier, so those rows
# are dropped and the remaining labels are stored as integers.
label_ids = df['category'].map(label2id)
is_known = label_ids.notna()
if not is_known.all():
    print(f"Dropping {int((~is_known).sum())} rows whose category is not in the category table")
df = df.loc[is_known].copy()
df['label'] = label_ids[is_known].astype(int)


In [None]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# to obtain the validation and test frames (same fixed seed for both cuts).
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

split_summary = f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}"
print(split_summary)


In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset (train, validation, test — in that order).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)


In [None]:
from collections import defaultdict
import random

def sample_dataset(dataset, num_samples_per_class=50, seed=None):
    """Return a class-capped subset of ``dataset``.

    Args:
        dataset: Object exposing ``dataset["label"]`` (an iterable of hashable
            labels, one per row) and ``dataset.select(indices)``.
        num_samples_per_class: Maximum number of rows kept for each label.
            Classes with at most this many rows are kept in full.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (the default) the module-level
            ``random`` generator is used, exactly as before this parameter
            existed.

    Returns:
        Whatever ``dataset.select`` returns for the chosen row indices. The
        indices are grouped by label, in order of each label's first
        appearance.
    """
    # A private generator keeps a seeded call from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    # Group row indices by label
    indices_by_label = defaultdict(list)
    for i, lab in enumerate(dataset["label"]):
        indices_by_label[lab].append(i)

    # For each class, sample up to 'num_samples_per_class' examples
    selected_indices = []
    for idx_list in indices_by_label.values():
        if len(idx_list) > num_samples_per_class:
            selected_indices.extend(rng.sample(idx_list, num_samples_per_class))
        else:
            # If fewer than num_samples_per_class are available, take all
            selected_indices.extend(idx_list)

    # Create a new dataset with the selected indices
    return dataset.select(selected_indices)

# Sample your train, val, and test datasets before tokenization.
# sample_dataset draws from the module-level `random` generator, so seed it
# here: otherwise every run trains and evaluates on a different subsample
# even though the split itself is fixed with random_state=42.
random.seed(42)
train_dataset = sample_dataset(train_dataset, 100)
val_dataset = sample_dataset(val_dataset, 100)
test_dataset = sample_dataset(test_dataset, 100)

# Now proceed with tokenization and formatting
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

def tokenize_function(example):
    """Tokenize the ``input_text`` field, truncating and padding to 256 tokens.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    texts = example["input_text"]
    return tokenizer(texts, **tokenizer_options)

# Tokenize every split in batches (train, validation, test — in that order).
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Expose only the model inputs and the target, as torch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type="torch", columns=model_columns)


In [None]:
!pip install transformers datasets


In [None]:
!pip install torch_xla
!pip install accelerate
!pip install --upgrade ipywidgets

In [None]:
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments
import os
import torch
from datasets import Dataset

import inspect

# Environment variable to avoid fragmentation in the CUDA caching allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"

# Load model and tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

num_labels = len(categories)  # One output per entry in `categories`
model = BartForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # Permit loading even though the checkpoint's classification head does not
    # have `num_labels` outputs; the mismatched weights are newly initialised.
    ignore_mismatched_sizes=True
)

# Enable gradient checkpointing (trades extra compute for lower activation memory)
model.gradient_checkpointing_enable()


def _accepts_kwarg(func, name):
    """Return True if ``func`` declares a parameter called ``name``."""
    return name in inspect.signature(func).parameters


# Mixed precision needs a CUDA device; requesting it unconditionally fails on CPU-only runtimes.
use_cuda = torch.cuda.is_available()

# Training arguments
training_kwargs = dict(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,  # Weight regularization
    warmup_steps=500,  # Warmup steps for stable training
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=use_cuda,  # Mixed precision for faster training when a GPU is present
)
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy`; pick whichever name the installed transformers accepts.
eval_kwarg = (
    "eval_strategy"
    if _accepts_kwarg(TrainingArguments.__init__, "eval_strategy")
    else "evaluation_strategy"
)
training_kwargs[eval_kwarg] = "epoch"
training_args = TrainingArguments(**training_kwargs)


# Initialize Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# Likewise, newer releases take the tokenizer as `processing_class`.
tokenizer_kwarg = (
    "processing_class"
    if _accepts_kwarg(Trainer.__init__, "processing_class")
    else "tokenizer"
)
trainer_kwargs[tokenizer_kwarg] = tokenizer
trainer = Trainer(**trainer_kwargs)

# Free up GPU memory
torch.cuda.empty_cache()

# Train the model
trainer.train()

In [None]:
# Run the trained model over the test split prepared earlier.
predictions = trainer.predict(test_dataset)

# When the model returns more than one output (BART also returns the encoder's
# last hidden state), `predictions.predictions` is a tuple whose first element
# holds the logits, so unwrap it instead of treating the tuple as logits.
raw_outputs = predictions.predictions
pred_logits = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs  # Raw logits
# NOTE: despite the name, this holds the reference labels, not the predicted ones.
pred_labels = predictions.label_ids    # True labels if available

In [None]:
# Sample abstract (parallel clustering/regression library).
# NOTE(review): the inference loop below iterates over `test_inputs` only, so this text is never classified.
input_text = ["As the data size in Machine Learning fields grows exponentially, it is\ninevitable to accelerate the computation by utilizing the ever-growing large\nnumber of available cores provided by high-performance computing hardware.\nHowever, existing parallel methods for clustering or regression often suffer\nfrom problems of low accuracy, slow convergence, and complex\nhyperparameter-tuning. Furthermore, the parallel efficiency is usually\ndifficult to improve while striking a balance between preserving model\nproperties and partitioning computing workloads on distributed systems. In this\npaper, we propose a novel and simple data structure capturing the most\nimportant information among data samples. It has several advantageous\nproperties supporting a hierarchical clustering strategy that is irrelevant to\nthe hardware parallelism, well-defined metrics for determining optimal\nclustering, balanced partition for maintaining the compactness property, and\nefficient parallelization for accelerating computation phases. Then we combine\nthe clustering with regression techniques as a parallel library and utilize a\nhybrid structure of data and model parallelism to make predictions. Experiments\nillustrate that our library obtains remarkable performance on convergence,\naccuracy, and scalability.\n"]
# Abstracts to classify with the fine-tuned model (one string per paper).
test_inputs = [" Modern Systems-on-Chip (SoC) designs are increasingly heterogeneous and\ncontain specialized semi-programmable accelerators in addition to programmable\nprocessors. In contrast to the pre-accelerator era, when the ISA played an\nimportant role in verification by enabling a clean separation of concerns\nbetween software and hardware, verification of these \"accelerator-rich\" SoCs\npresents new challenges. From the perspective of hardware designers, there is a\nlack of a common framework for the formal functional specification of\naccelerator behavior. From the perspective of software developers, there exists\nno unified framework for reasoning about software/hardware interactions of\nprograms that interact with accelerators. This paper addresses these challenges\nby providing a formal specification and high-level abstraction for accelerator\nfunctional behavior. It formalizes the concept of an Instruction Level\nAbstraction (ILA), developed informally in our previous work, and shows its\napplication in modeling and verification of accelerators. This formal ILA\nextends the familiar notion of instructions to accelerators and provides a\nuniform, modular, and hierarchical abstraction for modeling software-visible\nbehavior of both accelerators and programmable processors. We demonstrate the\napplicability of the ILA through several case studies of accelerators (for\nimage processing, machine learning, and cryptography), and a general-purpose\nprocessor (RISC-V). We show how the ILA model facilitates equivalence checking\nbetween two ILAs, and between an ILA and its hardware finite-state machine\n(FSM) implementation. Further, this equivalence checking supports accelerator\nupgrades using the notion of ILA compatibility, similar to processor upgrades\nusing ISA compatibility.\n"]
import torch.nn.functional as F

# Inference mode: disables dropout and other train-only behaviour.
model.eval()

# Run inference on whatever device the model's weights live on. `device` was
# never defined in this notebook, so the `.to(device)` call below raised NameError.
device = next(model.parameters()).device

# The loop variable is deliberately not called `input_text`, so it does not
# overwrite the `input_text` list defined above.
for text in test_inputs:
    # Tokenize the input text exactly as the training data was tokenized
    encoded_input = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt"
    ).to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(**encoded_input)
        logits = outputs.logits

    # Apply softmax over the class dimension to get probabilities
    probs = F.softmax(logits, dim=1)

    # Get the predicted class and its probability
    predicted_class_id = torch.argmax(probs, dim=1).item()
    predicted_label = id2label[predicted_class_id]
    predicted_probability = probs[0, predicted_class_id].item()

    # Print results
    print(f"Input: {text}")
    print(f"Logits: {logits}")
    print(f"Predicted Class ID: {predicted_class_id}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Predicted Probability: {predicted_probability:.4f}")

    # Print all class probabilities (optional)
    print("\nClass Probabilities:")
    for idx, prob in enumerate(probs[0]):
        print(f"  {id2label[idx]}: {prob:.4f}")


In [None]:
# Directory that receives both the model and the tokenizer files.
model_dir = "/kaggle/working/fine_tuned_model"

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)

saved_message = f"Model and tokenizer saved to {model_dir}"
print(saved_message)


In [None]:
# Recursively archive the saved model directory into fine_tuned_model.zip
# (a relative name, so the archive lands in the current working directory).
!zip -r fine_tuned_model.zip /kaggle/working/fine_tuned_model