In [1]:
import pandas as pd
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm
import numpy as np
from time import time
import torch

In [2]:
# Prefer Apple's Metal (MPS) backend, but fall back to the CPU when it is not
# available so this cell does not raise on machines without MPS support.
# The name `mps_device` is kept for compatibility with any later cell that
# refers to it, even though it may point at the CPU after the fallback.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(str(mps_device))

# Create a tensor directly on the selected device
x = torch.ones(5, device=mps_device)

# Any operation on `x` runs on the device that holds it
y = x * 2

mps


In [3]:
# Load the dataset and keep only the text column and its genre label.
df = pd.read_csv("data.csv").loc[:, ["summary", "genre"]]

# Fold the "romance" label into "other". Note that DataFrame.replace matches
# whole cell values in every column, not only in 'genre'.
grouped_df = df.replace(to_replace=["romance"], value="other")

In [4]:
def augment_sentence(sentence, aug, num_threads):
    """
    Constructs a new sentence via text augmentation.

    Input:
        - sentence:     A string of text
        - aug:          An augmentation object defined by the nlpaug library
                        (anything exposing `augment(data, num_thread=...)`)
        - num_threads:  Integer controlling the number of threads to use if
                        augmenting text via CPU
    Output:
        - A string of text that has been augmented. If the augmenter hands back
          a list/tuple for a single input string, its first element is
          returned; if that sequence is empty, the original sentence is
          returned unchanged. When `sentence` is not a string (e.g. a batch
          passed as a list), the augmenter's result is returned as-is.
    """
    augmented = aug.augment(sentence, num_thread=num_threads)

    # Some nlpaug versions wrap the result in a list even for a single input
    # string; unwrap it so callers always get the documented plain string.
    if isinstance(sentence, str) and isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else sentence
    return augmented

In [5]:
def augment_text(df, aug, num_threads, num_times, genres_to_augment: list, max_rows=2):
    """
    Takes a pandas DataFrame and augments its text data.

    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'summary' containing strings of text to augment.
                                - 'genre' target variable containing genres.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer representing the number of times to augment text.
        - genres_to_augment: names of the genres that should be augmented
        - max_rows:      Maximum number of rows per genre (taken from the top of
                         `df`) to augment. Defaults to 2, the limit that was
                         previously hard-coded; pass None to augment every row
                         of each requested genre.
    Output:
        - A new pandas DataFrame holding the original rows followed by the
          augmented rows, re-indexed 0..n-1. Rows are NOT shuffled and the
          input DataFrame is not modified.
    """
    augmented_frames = []

    for genre in genres_to_augment:
        # Source texts for this genre, optionally capped at max_rows
        to_augment_x = df.loc[df['genre'] == genre, 'summary']
        if max_rows is not None:
            to_augment_x = to_augment_x.head(max_rows)
        print(to_augment_x.shape)

        # Collect the augmented texts as a flat list of strings
        augmented = []
        for _ in tqdm(range(num_times)):
            for text in to_augment_x:
                result = augment_sentence(text, aug, num_threads)
                # The augmenter may return a list per input; flatten it so the
                # 'summary' column never ends up holding list objects.
                if isinstance(result, (list, tuple)):
                    augmented.extend(result)
                else:
                    augmented.append(result)

        # Skip genres that produced nothing so no empty frame is concatenated
        if augmented:
            augmented_frames.append(
                pd.DataFrame({'summary': augmented, 'genre': [genre] * len(augmented)})
            )

    # Concatenate once, outside the loop; ignore_index avoids duplicate index
    # labels between the original rows and the appended augmented rows.
    return pd.concat([df, *augmented_frames], ignore_index=True)

In [6]:
# Build the nlpaug contextual word-embedding augmenter: a distilbert model
# used in "substitute" mode with aug_p=0.1 and aug_min=1.
# NOTE(review): the recorded run of this cell ended in
# "TypeError: can't convert mps:0 device type tensor to numpy" - TODO confirm
# which component placed the model on the MPS device.
aug10p = nlpaw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action="substitute",
    aug_p=0.1,
    aug_min=1,
)

# Time a single augmentation pass over the "other" genre and report the row
# count before and after.
t = time()
print(len(grouped_df))
balanced_df = augment_text(
    grouped_df,
    aug10p,
    num_threads=8,
    num_times=1,
    genres_to_augment=["other"],
)
print(len(balanced_df))
print(time() - t)

4657
(2,)


  0%|          | 0/1 [00:00<?, ?it/s]


TypeError: can't convert mps:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [11]:
# Scratch check: the `in` operator on two strings tests for a substring,
# so this expression evaluates to True.
"a" in "ab"

True