In [None]:
import pandas as pd
import random

# Set the random seed
random.seed(0)

# Constants
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
PUNC_RATIO = 0.3


# Function to insert punctuation marks into a sentence
def insert_punctuation_marks(sentence, punc_ratio=PUNC_RATIO):
    """Return *sentence* with random punctuation tokens inserted before words.

    The sentence is split on single spaces. A random number of word
    positions is chosen, and a punctuation mark drawn from ``PUNCTUATIONS``
    is inserted as its own space-separated token in front of each chosen
    word. The words themselves are never altered or reordered.

    Args:
        sentence: Text to augment. It is split with ``str.split(' ')``, so
            runs of consecutive spaces yield empty "words" that are kept
            (and may also receive a punctuation token).
        punc_ratio: Controls the upper bound on the number of inserted
            marks, ``int(punc_ratio * word_count + 1)``. The bound is
            clamped to the range ``[1, word_count]``, so at least one mark
            is always inserted and never more than one per word.

    Returns:
        The augmented sentence as a single space-joined string.
    """
    words = sentence.split(' ')
    word_count = len(words)

    # Clamp the upper bound: without this, punc_ratio >= 1 can request more
    # positions than there are words (random.sample raises ValueError) and a
    # strongly negative ratio makes randint's range empty.
    max_marks = min(word_count, max(1, int(punc_ratio * word_count + 1)))
    q = random.randint(1, max_marks)

    # A set gives O(1) membership tests in the loop below; the sampled
    # positions (and therefore the seeded output) are unchanged.
    positions = set(random.sample(range(word_count), q))

    new_line = []
    for j, word in enumerate(words):
        if j in positions:
            # randint is kept (rather than random.choice) so the sequence of
            # random calls matches earlier seeded runs.
            new_line.append(PUNCTUATIONS[random.randint(0, len(PUNCTUATIONS) - 1)])
        new_line.append(word)
    return ' '.join(new_line)

# Load the dataset
df = pd.read_csv("/content/AandBClasses.csv")

# Seed for the pandas-side sampling. `random.seed` only seeds Python's
# `random` module; pandas draws from NumPy's generator, so without an explicit
# `random_state` the row selection and the final shuffle differ on every run.
SAMPLING_SEED = 0

# Count the instances of each class
class_counts = df['Class'].value_counts()

# Calculate the number of augmentations needed to bring class 'B' up to the
# size of class 'A'. `.get(..., 0)` tolerates a label that is absent from the
# file, and the result is clamped at 0 when 'B' is already the larger class.
augmentations_needed = max(
    0, int(class_counts.get('A', 0)) - int(class_counts.get('B', 0))
)

# Filter the dataset for class 'B'
class_B = df[df['Class'] == 'B']

# Create augmented entries
augmented_entries = []
if augmentations_needed > 0:
    # Rows with a missing tweet cannot be augmented (NaN has no `.split`),
    # so they are excluded from the sampling pool only.
    source_rows = class_B.dropna(subset=['Tweet'])
    if source_rows.empty:
        raise ValueError(
            "No class 'B' rows with a non-empty 'Tweet' are available to augment"
        )

    # Draw every source row in one call (with replacement, matching the
    # repeated single-row draws it replaces) instead of one DataFrame.sample
    # call per augmentation.
    sampled_rows = source_rows.sample(
        n=augmentations_needed, replace=True, random_state=SAMPLING_SEED
    )
    augmented_entries = [
        {'ID': row_id, 'Tweet': insert_punctuation_marks(tweet), 'Class': 'B'}
        for row_id, tweet in zip(sampled_rows['ID'], sampled_rows['Tweet'])
    ]

# Convert augmented entries to a DataFrame and append to the original dataset
augmented_df = pd.DataFrame(augmented_entries, columns=['ID', 'Tweet', 'Class'])
if augmented_entries:
    df_augmented = pd.concat([df, augmented_df])
else:
    # Nothing to add; avoid concatenating an empty frame onto the data.
    df_augmented = df.copy()

# Shuffle the dataset (seeded so the saved file is reproducible)
df_augmented = df_augmented.sample(
    frac=1, random_state=SAMPLING_SEED
).reset_index(drop=True)

# Save the augmented dataset
df_augmented.to_csv('Punctuation_Insertion_augmentation.csv', index=False)
