In [4]:
from parrot import Parrot
import json
import torch
import warnings
import read_and_write_docs

import pandas as pd

warnings.filterwarnings("ignore")

In [2]:
#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
from parrot import Parrot
import json
import torch
import warnings
import read_and_write_docs

import pandas as pd

warnings.filterwarnings("ignore")

#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

def parrot_paraphrase(phrase, n_iterations, model, diverse=False):
    """
    Generates paraphrases for a given phrase using the Parrot paraphraser.

    Args:
        phrase (str): The phrase to be paraphrased.
        n_iterations (int): Number of iterations to generate paraphrases.
        diverse (bool, optional): Flag to enable diverse paraphrasing. Defaults to False.

    Returns:
        list: A list of unique paraphrases.

    Raises:
        ValueError: If `diverse` is not a boolean value.
    """
    
    # Set thresholds based on the diversity flag
    if diverse == False:
        diverse = False
        ad_thresh = 0.99
        fl_thresh = 0.9
    elif diverse == True:
        diverse = True
        ad_thresh = 0.7
        fl_thresh = 0.7
    else:
        raise ValueError("The 'diverse' argument must be a boolean value.")

    # Initialize the list to store paraphrases
    stored_phrases = []

    # Generate paraphrases for the given number of iterations
    for i in range(1, n_iterations):
        paraphrases = parrot.augment(input_phrase=phrase,
                                     use_gpu=True,
                                     diversity_ranker="levenshtein",
                                     do_diverse=diverse,
                                     max_return_phrases=100,
                                     max_length=1000,
                                     adequacy_threshold=ad_thresh,
                                     fluency_threshold=fl_thresh)

        # If paraphrases are generated, add them to the stored phrases list
        if paraphrases is not None:
            num_phrases = len(paraphrases)
            for j in range(1, num_phrases):
                paraphrase = paraphrases[j][0]
                stored_phrases.append(paraphrase)
        else:
            stored_phrases = []

    # Remove duplicates by converting the list to a set and back to a list
    result = list(set(stored_phrases))

    return result

def paraphrase_dataframe(df, save_location, n_iterations=10):
    """
    Paraphrases the text in the DataFrame and saves the results to JSONL files by document ID.

    Args:
        df (pd.DataFrame): The DataFrame containing the text to be paraphrased.
        save_location (str): The location to save the JSONL files.
        n_iterations (int, optional): Number of iterations to generate paraphrases. Defaults to 10.

    Returns:
        None
    """
    # Get unique document IDs
    unique_doc_ids = df['doc_id'].unique()

    # Loop through each document ID
    for doc_id in unique_doc_ids:
        # Filter the DataFrame by the current document ID
        doc_df = df[df['doc_id'] == doc_id]
        
        new_rows = []
        
        # Loop through each row in the filtered DataFrame
        for index, row in doc_df.iterrows():
            print(f"Paraphrasing sentence {index + 1} out of {len(doc_df)} for doc_id {doc_id}")  # Added print statement
            result = parrot_paraphrase(row['text'], n_iterations)
            
            # Add the paraphrases to the new rows list
            for paraphrase in result:
                new_row = {
                    'index': index,
                    'doc_id': row['doc_id'],
                    'author_id': row['author_id'],
                    'chunk_id': row['chunk_id'],
                    'gender': row['gender'],
                    'age': row['age'],
                    'topic': row['topic'],
                    'sign': row['sign'],
                    'date': row['date'],
                    'text': paraphrase
                }
                new_rows.append(new_row)
        
        # Save the paraphrased results to a JSONL file for the current document ID
        jsonl_path = f"{save_location}/doc_{doc_id}.jsonl"
        read_and_write_docs.save_as_jsonl(new_rows, jsonl_path)

    print("Paraphrasing complete.")

In [None]:
def paraphrase_dataframe(df, save_location, n_iterations=10):
  new_rows = []
  for index, row in df.iterrows():
    print(f"Paraphrasing sentence {index + 1} out of {len(df)}") # Added print statement
    result = parrot_paraphrase(row['text'], n_iterations)
    for paraphrase in result:
      new_row = {
          'index': index,
          'id': row['id'],
          'author_id': row['author_id'],
          'chunk_id': row['chunk_id'],
          'subchunk_id': row['subchunk_id'],
          'gender': row['gender'],
          'age': row['age'],
          'topic': row['topic'],
          'sign': row['sign'],
          'date': row['date'],
          'paraphrase': paraphrase
      }
      new_rows.append(new_row)

    # Save the DataFrame to a CSV file after each iteration
    pd.DataFrame(new_rows).to_csv(save_location, index=False)

  new_df = pd.DataFrame(new_rows)
  return new_df



In [15]:
def paraphrase_dataframe(df, save_location, n_iterations=10):
    """
    Paraphrases the text in the DataFrame and saves the results to JSONL files by document ID.

    Args:
        df (pd.DataFrame): The DataFrame containing the text to be paraphrased.
        save_location (str): The location to save the JSONL files.
        n_iterations (int, optional): Number of iterations to generate paraphrases. Defaults to 10.

    Returns:
        None
    """
    # Get unique document IDs
    unique_doc_ids = df['doc_id'].unique()

    # Loop through each document ID
    for doc_id in unique_doc_ids:
        # Filter the DataFrame by the current document ID
        doc_df = df[df['doc_id'] == doc_id]
        
        new_rows = []
        
        # Loop through each row in the filtered DataFrame
        for index, row in doc_df.iterrows():
            print(f"Paraphrasing sentence {index + 1} out of {len(doc_df)} for doc_id {doc_id}")  # Added print statement
            result = parrot_paraphrase(row['text'], n_iterations)
            
            # Add the paraphrases to the new rows list
            for paraphrase in result:
                new_row = {
                    'index': index,
                    'doc_id': row['doc_id'],
                    'author_id': row['author_id'],
                    'chunk_id': row['chunk_id'],
                    'gender': row['gender'],
                    'age': row['age'],
                    'topic': row['topic'],
                    'sign': row['sign'],
                    'date': row['date'],
                    'text': paraphrase
                }
                new_rows.append(new_row)
        
        # Save the paraphrased results to a JSONL file for the current document ID
        jsonl_path = f"{save_location}/doc_{doc_id}.jsonl"
        read_and_write_docs.save_as_jsonl(new_rows, jsonl_path)

    print("Paraphrasing complete.")

In [16]:
g_drive_base = "/Users/user/Library/CloudStorage/GoogleDrive-benjcross1995@gmail.com/My Drive/datasets/blogger_new_algorithm/"

In [17]:
base_file_loc = f"{g_drive_base}rephrased_preprocessed.jsonl"
save_folder = f"{g_drive_base}parrot_rephrased_diverse"

In [18]:
initial_df = read_and_write_docs.read_jsonl_file(base_file_loc)

In [19]:
initial_df

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id
0,18516,4160528,female,16,Student,Leo,"09,August,2004",Kelz~ We really got close this summer..,1
1,18516,4160528,female,16,Student,Leo,"09,August,2004",its gonna be kewl when u stay with me and i st...,2
2,18516,4160528,female,16,Student,Leo,"09,August,2004",Newayz..,3
3,18516,4160528,female,16,Student,Leo,"09,August,2004",races are the bomb especially that one day whe...,4
4,18516,4160528,female,16,Student,Leo,"09,August,2004",Itll all turn out for the best.,5
...,...,...,...,...,...,...,...,...,...
552,676573,2876684,male,24,Technology,Scorpio,"02,March,2004","Hello, my ID wasnt even CHECKED when I booked ...",15
553,676573,2876684,male,24,Technology,Scorpio,"02,March,2004",I felt like kicking up a fuss but the weird fe...,16
554,676573,2876684,male,24,Technology,Scorpio,"02,March,2004","Besides, I like to consider myself a patient p...",17
555,676573,2876684,male,24,Technology,Scorpio,"02,March,2004","Anyways, the testing officer wasnt too concern...",18


In [20]:
paraphrase_dataframe(initial_df, save_folder)

Paraphrasing sentence 1 out of 57 for doc_id 18516


AssertionError: Torch not compiled with CUDA enabled