In [None]:
import pandas as pd
import random

# Function to randomly delete words in a sentence
def random_deletion(sentence, p=0.1):
    if len(sentence.split()) == 1:  # Return the sentence if it has only one word
        return sentence
    words = sentence.split()
    new_words = [word for word in words if random.random() > p]
    # If all words are deleted, return a random word
    if len(new_words) == 0:
        return random.choice(words)
    return ' '.join(new_words)

# Set the random seed for reproducibility
random.seed(0)

# Load your dataset
df = pd.read_csv("/content/AandBClasses.csv")

# Count the instances of each class
class_counts = df['Class'].value_counts()
augmentations_needed = class_counts['A'] - class_counts['B']

# Filter for class 'B' and duplicate if necessary
class_B = df[df['Class'] == 'B']
augmented_class_B = pd.concat([class_B]*((augmentations_needed // len(class_B)) + 1), ignore_index=True)
augmented_class_B = augmented_class_B.sample(n=augmentations_needed)

# Apply random deletion augmentation
augmented_class_B['Tweet'] = augmented_class_B['Tweet'].apply(random_deletion)

# Combine with class 'A', shuffle and save
balanced_df = pd.concat([df[df['Class'] == 'A'], augmented_class_B]).sample(frac=1).reset_index(drop=True)
balanced_df.to_csv('/content/Augmented_random_deletion.csv', index=False)


In the code above, the resulted file did not balanced the dataset, insted I got:
A    17509
B    10501

Now, I will apply another way to do Rrandom Deletion to generate a balanced dataset that has [17,509 ]  in each class.

In [1]:
import pandas as pd
import random

# Function to randomly delete words in a sentence
def random_deletion(sentence, p=0.1):
    if len(sentence.split()) == 1:  # Return the sentence if it has only one word
        return sentence
    words = sentence.split()
    new_words = [word for word in words if random.random() > p]
    # If all words are deleted, return a random word
    if len(new_words) == 0:
        return random.choice(words)
    return ' '.join(new_words)

# Set the random seed for reproducibility
random.seed(0)

# Load your dataset
df = pd.read_csv("/content/AandBClasses.csv")

# Filter for class 'B'
class_B = df[df['Class'] == 'B']

# Calculate the total number of class B instances needed
total_class_B_needed = df['Class'].value_counts()['A']

# If the original class B count is less than the required count, perform augmentation
if len(class_B) < total_class_B_needed:
    # Calculate the additional instances needed
    additional_instances_needed = total_class_B_needed - len(class_B)

    # Sample the additional instances (with replacement)
    additional_class_B = class_B.sample(n=additional_instances_needed, replace=True)

    # Apply random deletion augmentation
    additional_class_B['Tweet'] = additional_class_B['Tweet'].apply(random_deletion)

    # Combine with the original class B instances
    class_B_augmented = pd.concat([class_B, additional_class_B])

# Combine augmented class B with class A
balanced_df = pd.concat([df[df['Class'] == 'A'], class_B_augmented]).sample(frac=1).reset_index(drop=True)

# Save the balanced dataset
balanced_df.to_csv('Augmented_Balanced_Random_Deletion.csv', index=False)
