In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

# Social Media Emotion Classification

This section uses the `emotion` subset of TweetEval, in which each tweet is labelled with one of four emotions: anger, joy, optimism, or sadness.

In [3]:
cache_dir = "./data_cache"

# Fetch every split of the TweetEval "emotion" subset with a single call.
# Leaving out `split` makes `load_dataset` return a DatasetDict keyed by
# split name, so the dataset only has to be resolved once instead of three
# times.
emotion_dataset = load_dataset(
    "tweet_eval",
    name="emotion",
    cache_dir=cache_dir,
)

train_dataset = emotion_dataset["train"]
print(f"Training dataset with {len(train_dataset)} instances loaded")

val_dataset = emotion_dataset["validation"]
print(f"Development/validation dataset with {len(val_dataset)} instances loaded")

test_dataset = emotion_dataset["test"]
print(f"Test dataset with {len(test_dataset)} instances loaded")

# Access the input text and target labels like this...
# Indexing a split by column name yields all values of that column.

train_texts = train_dataset['text']
train_labels = train_dataset['label']

val_texts = val_dataset['text']
val_labels = val_dataset['label']

test_texts = test_dataset['text']
test_labels = test_dataset['label']

Training dataset with 3257 instances loaded
Development/validation dataset with 374 instances loaded
Test dataset with 1421 instances loaded


In [4]:
def textsets_to_csv(textsets, labels):
    """Assemble texts and their emotion labels into a DataFrame.

    Despite the name, nothing is written to disk here; the caller decides
    what to do with the returned frame.

    Parameters
    ----------
    textsets : sequence
        Values for the ``text`` column.
    labels : sequence
        Integer class ids (0-3) for the ``label`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``label`` and ``emotion``, where ``emotion`` holds
        the name looked up for each label id.

    Raises
    ------
    KeyError
        If a label id is not one of 0, 1, 2 or 3.
    """
    id_to_name = {0: "Anger", 1: "Joy", 2: "Optimism", 3: "Sadness"}

    def name_of(label_id):
        # Plain dict indexing so an unexpected id fails loudly.
        return id_to_name[label_id]

    return (
        pd.DataFrame()
        .assign(text=textsets, label=labels)
        .assign(emotion=lambda frame: frame["label"].apply(name_of))
    )

In [5]:
# Destination folder for the emotion CSVs. It is created up front because
# `DataFrame.to_csv` does not create missing parent directories.
emotion_dir = Path("../data/Tweets Emotion Classification")
emotion_dir.mkdir(parents=True, exist_ok=True)

train = textsets_to_csv(train_texts, train_labels)
val = textsets_to_csv(val_texts, val_labels)
test = textsets_to_csv(test_texts, test_labels)

# One CSV per split, without the positional index column.
for split_name, split_frame in (("train", train), ("val", val), ("test", test)):
    split_frame.to_csv(emotion_dir / f"{split_name}.csv", index=False)

# BioCreative V CDR (BC5CDR)

The BC5CDR corpus marks chemical and disease mentions in PubMed articles as named entities.

In [7]:
# No `split` argument here, so all splits come back together in one DatasetDict.
ner_dataset = load_dataset("tner/bc5cdr")

print(f'The dataset is a dictionary with {len(ner_dataset)} splits: \n\n{ner_dataset}')

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [8]:
def _tokens_and_tags(split):
    """Return ``(sentences, tag_sequences)`` for one split of the NER dataset.

    ``sentences`` holds the entries of the ``tokens`` column unchanged;
    ``tag_sequences`` holds the entries of the ``tags`` column with every tag
    converted to ``str``.
    """
    # Read each column once instead of walking the split row by row twice.
    sentences = list(split['tokens'])
    tag_sequences = [[str(tag) for tag in tags] for tags in split['tags']]
    return sentences, tag_sequences


train_sentences_ner, train_labels_ner = _tokens_and_tags(ner_dataset['train'])
val_sentences_ner, val_labels_ner = _tokens_and_tags(ner_dataset['validation'])
test_sentences_ner, test_labels_ner = _tokens_and_tags(ner_dataset['test'])

In [9]:
# Flatten the per-sentence tag lists into one array, then show the distinct tags.
flat_train_tags = np.concatenate(train_labels_ner)
np.unique(flat_train_tags)

array(['0', '1', '2', '3', '4'], dtype='<U1')

In [10]:
# Make sure the output folder exists before opening the file for writing.
ner_dir = Path('../data/Named Entity Recognition')
ner_dir.mkdir(parents=True, exist_ok=True)

# One training sentence per line, tokens separated by ", ".
# The encoding is pinned to UTF-8 so the written file does not depend on the
# platform's default text encoding.
# NOTE(review): a token that is itself "," cannot be told apart from the
# separator when this file is read back - confirm no reader relies on that.
with open(ner_dir / 'train_sentences_ner', 'w', encoding='utf-8') as file:
    for inner_list in train_sentences_ner:
        line = ', '.join(inner_list)
        file.write(f"{line}\n")

In [11]:
def textsets_to_csv(textsets, labels, use_tag_names=False):
    """Flatten tokenised sentences and their tag sequences into a DataFrame.

    Each sentence's tokens are joined with single spaces, and so are its tags.
    Nothing is written to disk; the caller decides what to do with the frame.

    NOTE(review): this definition reuses the name of the emotion helper
    defined earlier in the notebook and replaces it once this cell has run.
    Re-running the emotion export afterwards would call this version instead.

    Parameters
    ----------
    textsets : sequence of sequence of str
        One token list per sentence.
    labels : sequence of sequence of str
        One tag list per sentence, with tag ids given as strings.
    use_tag_names : bool, default False
        When False, the tag ids are joined as given. When True, each id is
        first replaced by its name from the id-to-name table below.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label``, one row per sentence.

    Raises
    ------
    KeyError
        If ``use_tag_names`` is True and a tag id is not in the table.
    """
    tag_names = {
        '0': "O",
        '1': "B-Chemical",
        '2': "B-Disease",
        '3': "I-Disease",
        '4': "I-Chemical",
    }

    def join_tags(tags):
        # The id-to-name translation is opt-in so default output is unchanged.
        if use_tag_names:
            tags = [tag_names[tag] for tag in tags]
        return ' '.join(tags)

    # Join before building the frame so no list-valued cells are ever stored.
    return pd.DataFrame({
        "text": [' '.join(tokens) for tokens in textsets],
        "label": [join_tags(tags) for tags in labels],
    })