In [None]:
!pip install scikit-multilearn datasets

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split

from datasets import Dataset, DatasetDict, load_dataset

## Functions to load the CSV and create the Hugging Face dataset

In [None]:
def load_csv(filename):
    """
    Reads a CSV file holding the multi-label classification dataset into memory.

    Parameters
    ==========
    filename (str) : Path of the CSV file to read; handed straight to `pd.read_csv`

    Returns
    =======
    pandas DataFrame : The parsed contents of the file
    """
    return pd.read_csv(filename)


def _clean_text(value):
    """Return `value` as a whitespace-stripped string; missing values (NaN/None) become ''."""
    return "" if pd.isna(value) else str(value).strip()


def create_huggingface_dataset(df, push_to_hub=False, dataset_name='multi_class_classification_dataset'):
    """
    Creates a Hugging Face DatasetDict from a DataFrame, using iterative train-test splits so that the label distribution stays balanced across splits.
    Optionally, this dataset can be pushed to the Hugging Face Hub.

    Columns are read by position: column 0 is the row ID (not used), column 1 is the title, column 2 is the abstract,
    and every remaining column is a label column (converted to int). Missing titles/abstracts are treated as empty strings.

    The split is done in two stages: 20% of the rows are held out first, then that held-out part is split 25% / 75%
    into 'val' and 'test' (roughly 80% / 5% / 15% of all rows).

    Parameters
    ==========
    df (pandas DataFrame) : The DataFrame containing the multi-label classification dataset
    push_to_hub (bool) : Whether to push the created dataset to the Hugging Face Hub
    dataset_name (str) : The name of the dataset repository on the Hugging Face Hub (only relevant if `push_to_hub` is True).
                         A bare name is placed under the 'bhujith10/' namespace; a name that already contains '/' is used as the full repo id.

    Returns
    =======
    DatasetDict : Hugging Face DatasetDict containing 'train', 'val', and 'test' splits, each with 'text' and 'labels' columns

    Raises
    ======
    ValueError : If `df` has no rows, or has fewer than four columns (ID, title, abstract, at least one label)
    """
    if df.shape[1] < 4:
        raise ValueError("df must have at least four columns: ID, title, abstract and one or more label columns")
    if df.shape[0] == 0:
        raise ValueError("df must contain at least one row")

    # Build one text string per row by combining title and abstract
    text = [
        f"Title: {_clean_text(title)},\nAbstract: {_clean_text(abstract)}"
        for title, abstract in zip(df.iloc[:, 1], df.iloc[:, 2])
    ]

    # Everything after the abstract column is a label column
    labels = df.iloc[:, 3:].to_numpy(dtype=int)

    # The splitter works on 2-D arrays, so split positional row indices (as a column vector)
    # and look the text up afterwards instead of splitting the strings themselves
    row_ids = np.arange(len(labels)).reshape(-1, 1)

    # NOTE(review): no random seed is set here; if the stratifier draws random numbers the
    # splits may differ between runs — confirm and seed if reproducibility matters.

    # Stage 1: hold out 20% of the rows, keeping the label distribution balanced
    train_idx, y_train, holdout_idx, y_holdout = iterative_train_test_split(row_ids, labels, test_size=0.2)

    # Stage 2: split the held-out rows 25% / 75% into validation and test
    val_idx, y_val, test_idx, y_test = iterative_train_test_split(holdout_idx, y_holdout, test_size=0.75)

    # Map the selected row indices back to their text
    x_train = [text[i] for i in train_idx.flatten()]
    x_val = [text[i] for i in val_idx.flatten()]
    x_test = [text[i] for i in test_idx.flatten()]

    # Construct a DatasetDict with train, val, and test splits
    ds = DatasetDict({
        'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
        'val': Dataset.from_dict({'text': x_val, 'labels': y_val}),
        'test': Dataset.from_dict({'text': x_test, 'labels': y_test})
    })

    # Push the dataset to the Hugging Face Hub under the resolved repository id
    if push_to_hub:
        repo_id = dataset_name if "/" in dataset_name else f"bhujith10/{dataset_name}"
        ds.push_to_hub(repo_id)

    return ds


Load the file

In [None]:
df = load_csv('train.csv')

In [None]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [None]:
# Count how many papers carry each label and present the result as a (field, count) table
(
    df[[
        'Computer Science',
        'Physics',
        'Mathematics',
        'Statistics',
        'Quantitative Biology',
        'Quantitative Finance',
    ]]
    .sum()
    .reset_index()
    .rename(columns={'index':'field',0:'count'})
)

Unnamed: 0,field,count
0,Computer Science,8594
1,Physics,6013
2,Mathematics,5618
3,Statistics,5206
4,Quantitative Biology,587
5,Quantitative Finance,249


Convert dataframe into HuggingFace dataset

In [None]:
ds = create_huggingface_dataset(df)

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16771
    })
    val: Dataset({
        features: ['text', 'labels'],
        num_rows: 1055
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3146
    })
})

In [None]:
ds['train'][0]

{'text': 'Title: Rotation Invariance Neural Network,\nAbstract: Rotation invariance and translation invariance have great values in image\nrecognition tasks. In this paper, we bring a new architecture in convolutional\nneural network (CNN) named cyclic convolutional layer to achieve rotation\ninvariance in 2-D symbol recognition. We can also get the position and\norientation of the 2-D symbol by the network to achieve detection purpose for\nmultiple non-overlap target. Last but not least, this architecture can achieve\none-shot learning in some cases using those invariance.',
 'labels': [1, 0, 0, 0, 0, 0]}

Load the uploaded dataset

In [None]:
# Load the dataset from the Hugging Face Hub rather than rebuilding it from the CSV.
# NOTE: this rebinds `ds`, replacing the DatasetDict built by create_huggingface_dataset(df) earlier in the notebook.
# NOTE(review): the earlier call used push_to_hub=False, so this presumably relies on a previous upload — confirm.
ds = load_dataset('bhujith10/multi_class_classification_dataset')
ds