In [None]:
# Install into the running kernel's environment (%pip, not !pip).
%pip install googletrans==4.0.0-rc1
%pip install dask
# nlpaug and openai are imported by later cells but were never installed here.
# openai is held below 1.0 because the notebook calls openai.ChatCompletion,
# which was removed in openai 1.0.
%pip install nlpaug "openai<1"

In [None]:
import matplotlib.pyplot as plt
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf
import numpy as np
import pandas as pd
import seaborn as sns
from dask import bag, diagnostics
from googletrans import Translator # use version 4.0.0-rc1

In [None]:
def random_char_insertion(text):
    """Return `text` after character-level noise from nlpaug's ``KeyboardAug``.

    NOTE(review): the function is named "insertion" but applies
    ``nac.KeyboardAug()`` with default settings; confirm that is the intended
    augmenter.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented sentence, or `text` unchanged when the augmenter
        produces no output (e.g. for an empty string).
    """
    aug = nac.KeyboardAug()
    augmented_text = aug.augment(text)
    # Depending on the nlpaug version, augment() returns either a bare string
    # or a list of strings; indexing a string with [0] would keep only its
    # first character, and indexing an empty list would raise IndexError.
    if isinstance(augmented_text, str):
        return augmented_text
    return augmented_text[0] if augmented_text else text

def random_swap(text):
    """Return `text` with words swapped by ``naw.RandomWordAug(action="swap")``.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented sentence, or `text` unchanged when the augmenter
        produces no output (e.g. for an empty string).
    """
    aug = naw.RandomWordAug(action="swap")
    augmented_text = aug.augment(text)
    # Depending on the nlpaug version, augment() returns either a bare string
    # or a list of strings; indexing a string with [0] would keep only its
    # first character, and indexing an empty list would raise IndexError.
    if isinstance(augmented_text, str):
        return augmented_text
    return augmented_text[0] if augmented_text else text

def synonym_replacement(text):
    """Return `text` augmented by ``naw.SynonymAug(aug_src='wordnet')``.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented sentence, or `text` unchanged when the augmenter
        produces no output (e.g. for an empty string).
    """
    aug = naw.SynonymAug(aug_src='wordnet')
    augmented_text = aug.augment(text)
    # Depending on the nlpaug version, augment() returns either a bare string
    # or a list of strings; indexing a string with [0] would keep only its
    # first character, and indexing an empty list would raise IndexError.
    if isinstance(augmented_text, str):
        return augmented_text
    return augmented_text[0] if augmented_text else text

def back_translate(sequence, target_lang):
    """Translate `sequence` to `target_lang` and back to its detected language.

    This is a best-effort augmentation: whenever language detection or either
    translation fails, or the detected language is not one of the supported
    codes below, the input is returned unchanged.

    Args:
        sequence: The text to back-translate.
        target_lang: Language code passed as ``dest`` for the intermediate
            translation.

    Returns:
        The back-translated text, or `sequence` itself on any failure or
        unsupported language.
    """
    languages = {'en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el'}

    try:
        translator = Translator()

        # Detection hits the translation service too, so it lives inside the
        # try block: a failed detection must fall back like a failed translation.
        org_lang = translator.detect(sequence).lang

        # Compare case-insensitively because the detected code may be cased
        # differently from the list above (e.g. 'zh-CN'); a non-string result
        # is treated as unsupported.
        if not isinstance(org_lang, str) or org_lang.lower() not in languages:
            return sequence

        translated = translator.translate(sequence, dest=target_lang).text
        return translator.translate(translated, dest=org_lang).text
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # stop a long-running augmentation job.
        return sequence

# Runs back_translate over every entry of the 'text' column via a Dask bag
def back_translate_parallel(dataset, target_lang):
    """Back-translate the 'text' column of `dataset` through `target_lang`.

    Args:
        dataset: DataFrame with 'text' and 'class' columns.
        target_lang: Language code forwarded to ``back_translate``.

    Returns:
        A new DataFrame whose 'text' column holds the back-translated strings
        and whose 'class' column is taken from `dataset`.
    """
    working_copy = dataset.copy()
    source_texts = working_copy['text'].tolist()
    pending = bag.from_sequence(source_texts).map(back_translate, target_lang)

    # Show a progress bar while the bag is being evaluated
    with diagnostics.ProgressBar():
        translated_texts = pending.compute()

    # Pair the translated texts with the labels of the input rows
    return pd.DataFrame({"text": translated_texts, "class": working_copy['class']})

In [None]:
def _append_augmented_positives(frame, transform, limit=None):
    """Append a `transform`-ed copy of the first `limit` class-1 rows to `frame`."""
    augmented = frame[frame['class'] == 1][:limit].copy()
    augmented['text'] = augmented['text'].apply(transform)
    augmented['class'] = 1
    return pd.concat([frame, augmented], ignore_index=True)


def data_augmentation(pcl_df_train_train, target_lang='fr', back_translate_total=600,
                      back_translate_batch=100, swap_limit=1000, char_insertion_limit=1000):
    """Oversample the positive class (``class == 1``) with four text augmentations.

    The steps run in order and each one appends its output to the frame, so
    later steps also see rows produced by earlier ones:

    1. back-translation through `target_lang`, in batches,
    2. synonym replacement on every positive row,
    3. random word swap on the first `swap_limit` positive rows,
    4. character-level noise on the first `char_insertion_limit` positive rows.

    Args:
        pcl_df_train_train: DataFrame with 'text' and 'class' columns. It is
            not modified.
        target_lang: Intermediate language for back-translation.
        back_translate_total: Positive-row offset at which back-translation
            stops (offsets 0 .. total-1 are covered).
        back_translate_batch: Number of positive rows sent per
            back-translation batch.
        swap_limit: Maximum number of positive rows given a random swap.
        char_insertion_limit: Maximum number of positive rows given
            character-level noise.

    Returns:
        A new DataFrame containing the original rows followed by all
        augmented rows, with a fresh 0..n-1 index.
    """
    augmented_df = pcl_df_train_train

    ## Back translation
    for start in range(0, back_translate_total, back_translate_batch):
        # NOTE(review): positives are re-selected from the growing frame, so if
        # the input holds fewer than `back_translate_total` positives, later
        # batches pick up rows that were already back-translated.
        positives = augmented_df[augmented_df['class'] == 1]
        batch = positives.iloc[start:start + back_translate_batch].dropna()
        if batch.empty:
            # Nothing to translate for this offset; skip the pointless round trip.
            continue

        translated = back_translate_parallel(batch, target_lang)
        translated['class'] = 1
        augmented_df = pd.concat([augmented_df, translated], ignore_index=True)

    ## Synonym replacement (all positive rows)
    augmented_df = _append_augmented_positives(augmented_df, synonym_replacement)

    ## Random swap
    augmented_df = _append_augmented_positives(augmented_df, random_swap, swap_limit)

    ## Random char insertion
    augmented_df = _append_augmented_positives(augmented_df, random_char_insertion,
                                               char_insertion_limit)

    return augmented_df

## ChatGPT Text Generation

Generate text with ChatGPT. The prompts are the definitions of the three types of PCL (saviour, poet and expert).

In [None]:
import os
import time
from getpass import getpass

import openai
import pandas as pd

# Never store the OpenAI API key in the notebook. It is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
# NOTE(review): the key that used to be hardcoded here is exposed in the file
# history and should be revoked.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

class GenerateChatGPTtext():
    """Collect `num_samples` ChatGPT completions for one fixed prompt.

    Attributes:
        input: The prompt sent with every request.
        num_samples: How many completions ``generate_samples`` collects.
        api_key: The OpenAI API key, kept so the connection can be re-established.
    """

    def __init__(self, input,  num_samples, api_key):
        self.input = input
        self.num_samples = num_samples
        # Keep the key on the instance: re_establish_connection() reads it back.
        self.api_key = api_key
        openai.api_key = api_key

    def generate_text(self, input):
        """Send `input` as a single user message and return the reply text."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": input}],
        )
        return response["choices"][0]["message"]["content"]

    def generate_samples(self):
        """Request completions until `num_samples` have been collected.

        A failed request is reported and retried after a 3 second pause;
        successful requests are also followed by a 3 second pause.

        Returns:
            A list with `num_samples` completion strings.
        """
        results = []
        while len(results) < self.num_samples:
            try:
                output = self.generate_text(self.input)
            except Exception as error:
                # Exception (not a bare except) keeps Ctrl-C able to stop the
                # loop, and printing the error makes a permanent failure
                # (e.g. an invalid key) visible instead of a silent spin.
                print(f"ChatGPT request failed ({error!r}); retrying in 3 seconds")
                time.sleep(3)
                continue
            results.append(output)
            print(output)
            time.sleep(3)
        return results

    def re_establish_connection(self):
        """Re-apply the stored API key to the openai module and return it."""
        openai.api_key = self.api_key
        return openai.api_key

    def remove_example(self, results):
        """Strip a leading 'Example: ' from each entry of `results`.

        The list is modified in place and also returned.
        """
        prefix = 'Example: '
        for index, text in enumerate(results):
            if text.startswith(prefix):
                results[index] = text[len(prefix):]
        return results
        
# Prompts: one definition per PCL type, each ending with a request for an example.
saviour_input = "The saviour is defined as The community which the author and the majority of the audience belong to is presented in some way as saviours of those vulnerable or in need. The language used subtly positions the author in a better, more privileged situation than the vulnerable community. They express the will to help them, from their superior and advantageous position. Give an example of what the saviour would say"
poet_input = "The poet is a person who speaks in a patronising manner, The focus is not on the we (author and audience), but on the they (the individual or community referred to). The author uses a literary style to describe people or situations. They might, for example, use (or abuse) adjectives or rhetorical devices to either present a difficult situation as somehow beautiful, something to admire and learn from, or they might carefully detail its roughness to touch the heart of their audience. Give an example of what the poet would say"
expert_input = "The expert is a person who speaks in a patronising manner, The underlying message is that the privileged community, which the author and their audience belong to, knows better what the vulnerable community needs, how they are or what they should do to overcome their situation. Give an example of what the expert would say"


inputs = [saviour_input, poet_input, expert_input]
num_samples = 100
full_results = []

# Generate `num_samples` examples per prompt and strip any 'Example: ' prefix.
for prompt in inputs:
    gpt = GenerateChatGPTtext(prompt, num_samples, api_key)
    results = gpt.generate_samples()
    results = gpt.remove_example(results)
    full_results.extend(results)

# Every generated text is labelled as the positive class.
df = pd.DataFrame(full_results, columns=['text'])
df['class'] = 1

# save the dataframe to a csv file
df.to_csv('chatgpt_generated_text.csv', index=False)




## Reword Every PCL Train Example

In [1]:
import os
import time
from getpass import getpass

import openai
import pandas as pd

# Never store the OpenAI API key in the notebook. It is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
# NOTE(review): the key that used to be hardcoded here is exposed in the file
# history and should be revoked.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

class ReWordChatGPT():
    """Ask ChatGPT to reword every positive (``class == 1``) training example.

    Attributes:
        train_file: DataFrame loaded from the CSV path given to ``__init__``.
        train_list: The 'text' values of the rows whose 'class' equals 1.
    """

    def __init__(self,api_key, train_file):
        # The previous `self.input = input` stored the *builtin* input function
        # (there is no such parameter here) and was never read, so it is gone.
        openai.api_key = api_key
        self.train_file = pd.read_csv(train_file)
        self.train_list = list(self.train_file[self.train_file['class'] == 1]['text'])

    def generate_text(self, input):
        """Send `input` as a single user message and return the reply text."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": input}],
        )
        return response["choices"][0]["message"]["content"]

    def generate_random_reword_sample(self):
        """Reword every sample in the train data using the chatgpt model.

        Each sample is retried (after a 3 second pause) until a request
        succeeds; successful requests are also followed by a 3 second pause.

        Returns:
            A list with one reworded string per entry of ``train_list``, in
            the same order.
        """
        results = []
        for text in self.train_list:
            prompt = f"can you reword {text}"
            while True:  # keep trying until we get a result
                print('input: ', prompt)
                try:
                    output = self.generate_text(prompt)
                except Exception as error:
                    # Exception (not a bare except) keeps Ctrl-C able to stop
                    # the loop, and the message exposes persistent failures.
                    print(f"ChatGPT request failed ({error!r}); retrying in 3 seconds")
                    time.sleep(3)
                    continue
                break
            results.append(output)
            print('output: ', output)
            time.sleep(3)
        return results


# Forward slash instead of a backslash: '\p' is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
train_file = 'archive/pcl_df_train_train.csv'

reword = ReWordChatGPT(api_key, train_file)
results = reword.generate_random_reword_sample()

# Every reworded text comes from a positive example, so it keeps class 1.
df = pd.DataFrame(results, columns=['text'])
df['class'] = 1

# save the dataframe to a csv file
df.to_csv('chatgpt_reworded_text.csv', index=False)


input:  can you reword "As chief minister of Rajasthan , Shekhawat had introduced ' Antodaya Yojna ' to fulfil the foodgrain needs of the poor , Gadkari said and described Shekhawat as a "" generous person "" ."
output:  

Gadkari commended Shekhawat for implementing the 'Antodaya Yojna' program during his tenure as the chief minister of Rajasthan, which aimed to provide foodgrains for people in need. Additionally, Gadkari described Shekhawat as a kind-hearted individual.
input:  can you reword The underprivileged also benefit a great deal at this time when donations and presents are distributed to the homeless and orphanages .
output:  

Those who are less fortunate also reap significant advantages during the time when contributions and gifts are given to homeless individuals and orphanages.
input:  can you reword """ There are people who are struggling to make ends meet and it just seems ironic that in a country like Australia where we are so blessed with so much land and so much fer