In [1]:
import json
import time

from llama_cpp import Llama
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

In [2]:
%run "./google_drive.ipynb"

In [3]:
def get_raw_id(service, main_folder_name):
    """Function to get the id of the raw folder within the main folder on Google Drive"""
    
    # Get the files info from within the main folder
    main_folder_id = search_file_by_name(service, main_folder_name)['id']
    files_data = get_files_in_folder_recursive(service, main_folder_id, main_folder_name)
    files_data = split_name_column(files_data)

    # Get the folder data from within the file data
    folders = files_data[files_data['is_folder'] == True]

    # Split the data into the raw, error, and processed data
    raw_id = folders[folders['name'] == 'raw'].iloc[0,2]

    return raw_id

In [2]:
system_prompt = """
Given the sentence, generate as many paraphrased sentences as possible while preserving the original semantic meaning and style. 
Return the rephrased sentences in a python list format. Aim for AT LEAST TWENTY sentences. DO NOT INCLUDE ANY NOTES OR ADDITIONAL TEXT IN THE OUTPUT.

An example is below:
--------
Sentence: ```"Known for being very delicate, the skill could take a lifetime to master."```

Rephrased Sentences: ```["The skill is well known for its delicacy and could require a lifetime to perfect.", "The skill's reputation for delicateness suggests that it could take a whole lifetime to master.", "It may take a lifetime to master the skill, which is renowned for its delicacy.", "The delicacy of the skill means it could take a lifetime to master."]```
--------
Sentence: ```{original_user_supplied_sentence}```
"""

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{original_user_supplied_sentence}"),
    ]
)

In [3]:
def convert_to_phi(original_sentence,
                   prompt_input=final_prompt):

    messages = prompt_input.messages
    
    formatted_messages = ""

    for message in messages:
        if isinstance(message, SystemMessagePromptTemplate):
            formatted_messages += f"<|assistant|>\n{message.prompt.template.replace('\n', '')} <|end|>\n"
        elif isinstance(message, FewShotChatMessagePromptTemplate):
            formatted_messages += f"<|user|>\n{message.examples[0]['original_user_supplied_sentence'].replace('\n', '')} <|end|>\n"
            formatted_messages += f"<|assistant|>\n{message.examples[0]} <|end|>\n"
        elif isinstance(message, HumanMessagePromptTemplate):
            formatted_messages += f"<|user|>\n{message.prompt.template.replace('\n', '')} <|end|>\n"
    
    formatted_messages += f"<|assistant|>"

    formatted_prompt = formatted_messages.replace("<|user|>\n{original_user_supplied_sentence} <|end|>", f"<|user|>\n{original_sentence} <|end|>")
    
    return formatted_prompt
    

In [4]:
llm = Llama(
    model_path="C:/Users/benjc/Documents/models/Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
    n_threads=10, # Maximum I have is 12
    n_gpu_layers=-1, # The number of layers to offload to GPU, if you have GPU acceleration available.
    verbose=False,
    flash_attn=True
)

In [5]:
def phi_paraphrase(original_sentence,
                   n_runs=10,
                   llm=llm,
                   prompt_input=final_prompt):


    formatted_prompt = convert_to_phi(original_sentence, prompt_input=final_prompt)

    sentences = [original_sentence]
    new_sentence_amount = 1
    
    for i in range(1, n_runs + 1):

        if new_sentence_amount == 0:
            break
        print(f'  Iteration: {i}')
        attempts = 1
        
        while True:
            try:
                output_str = llm(formatted_prompt, max_tokens=1000, stop=["<|end|>"],
                                temperature=1)
                output_text = output_str['choices'][0]['text']

                # Find the index of the first '[' and the last ']'
                start_index = output_text.find('[')
                end_index = output_text.rfind(']')

                # Extract the content between the first '[' and the last ']'
                content_str = output_text[start_index+1:end_index]

                # Evaluate the content string as a Python expression to convert it into a list
                result_list = eval('[' + content_str + ']')

                break
            except Exception as e:
                print(f'    Attempt {attempts} failed: {str(e)}')
                attempts += 1  # Increment the number of attempts

                if attempts == 4:
                    print("3 Attempts Exceeded, Next Iteration.")
                    result_list = []
                    break

        new_sentence_amount = 0

        for result in result_list:
            if result not in sentences:
                sentences.append(result)
                new_sentence_amount += 1
        
    return sentences

In [6]:
def paraphrase_df(df, *args):
    doc_ids = []
    chunks = []
    rephrased_sentences = []
    
    n_rows = df['id'].count()

    
    for index, row in df.iterrows():
        row_num = index + 1
        print(f'Row {row_num} out of {n_rows}')
        doc_id = row['id']
        chunk = row['chunk_id']
        sentence = row['text']
        
        rephrased = phi_paraphrase(sentence, *args)
        num_sent = len(rephrased)
        
        # Extend lists with repeated doc_id and chunk_id
        doc_ids.extend([doc_id] * num_sent)
        chunks.extend([chunk] * num_sent)
        
        rephrased_sentences.extend(rephrased)

    # Construct DataFrame
    result_df = pd.DataFrame({
        'doc_id': doc_ids,
        'chunk_id': chunks,
        'sentence': rephrased_sentences
    })

    return result_df

In [7]:
%run "read_and_write_docs.py"

In [8]:
df = read_jsonl_file('../data/guardian_preprocessed.jsonl')

In [9]:
df

Unnamed: 0,index,id,chunk_id,author,topic,word_count,subchunk_id,input_length,subchunk,text
0,0,1,0,12,4,14,0,86,0,"DBC Pierre, Booker Prize-winner and author of ..."
1,1,1,1,12,4,33,0,186,0,The first rule of interviewing people in Irela...
2,2,1,2,12,4,44,0,265,0,It irks me when these literary bad boys (espec...
3,3,1,3,12,4,10,0,49,0,It irks me still further when they wont admit it.
4,4,1,4,12,4,13,0,69,0,"No, no, no, says Dirty But Clean (thats what h..."
...,...,...,...,...,...,...,...,...,...,...
3243,3243,63,4,9,4,8,0,43,0,He is the politics of substance made flesh.
3244,3244,63,5,9,4,18,0,98,0,His popularity is a vindication of those of us...
3245,3245,63,6,9,4,31,0,189,0,If he had not stooped to make that Cambridge U...
3246,3246,63,7,9,4,28,0,162,0,"As it is, he will have to be satisfied with hi..."


In [22]:
filtered_df = df[(df['id'] == 7)]

In [23]:
# filtered_df = df[(df['id'] == 2) & (df['chunk_id'] == 16)]

In [24]:
# filtered_df = df[(df['id'] == 2) & (df['chunk_id'] >= 52)]

In [25]:
# filtered_df = df[(df['id'] == 3) & (df['chunk_id'] >= 22)]

In [26]:
# filtered_df = df[(df['id'] == 4) & (df['chunk_id'] >= 73)]

In [27]:
#filtered_df = df[(df['id'] == 5) & (df['chunk_id'] >= 41)]

In [28]:
filtered_df

Unnamed: 0,index,id,chunk_id,author,topic,word_count,subchunk_id,input_length,subchunk,text
735,735,7,0,4,4,22,0,139,0,﻿That Christopher Meyers memoirs have been as ...
736,736,7,1,4,4,14,0,69,0,"But, read in the round, how does the book as a..."
737,737,7,2,4,4,16,0,101,0,My answer is that Meyers book is both better a...
738,738,7,3,4,4,29,0,166,0,"It is better because, contrary to the implicat..."
739,739,7,4,4,4,11,0,60,0,Meyers account of these events is painted on a...
740,740,7,5,4,4,22,0,111,0,His Washington years stretched from autumn 199...
741,741,7,6,4,4,16,0,84,0,The majority of his time in DC fell within the...
742,742,7,7,4,4,14,0,84,0,Indeed Bush barely makes an appearance in this...
743,743,7,8,4,4,43,0,230,0,"Seen in that light, this is an important book ..."
744,744,7,9,4,4,18,0,118,0,The strength of the Blair-Clinton and the Blai...


In [4]:
def paraphrase_df_save(df, base_save_loc, google_drive_main_folder, *args):

    # Connect to my Google Drive 
    service = connect_to_drive()

    # Get the id of the raw save location in chosen folder
    raw_folder_id = get_raw_id(service, google_drive_main_folder)
    
    docs = df['id'].unique().tolist()
    
    if 'subchunk_id' in df.columns:
        subchunk = True
    else:
        subchunk = False

    for doc in docs:
        start_time = time.time()
        doc_ids = []
        chunks = []
        subchunks = []
        rephrased_sentences = []
        filtered_df = df[df['id'] == doc]
        
        save_loc = f"{base_save_loc}doc_{doc}.jsonl"
        n_rows = filtered_df['id'].count()

        for index, row in filtered_df.iterrows():
            doc_id = row['id']
            chunk = row['chunk_id']
            max_chunk = filtered_df['chunk_id'].max()
            sentence = row['text']
                
            print(f'Doc: {doc_id} - Chunk: {chunk + 1} out of {max_chunk + 1}')

            rephrased = phi_paraphrase(sentence, *args)
            num_sent = len(rephrased)
        
            # Extend lists with repeated doc_id and chunk_id
            doc_ids.extend([doc_id] * num_sent)
            chunks.extend([chunk] * num_sent)
            if subchunk:
                s_chunk = row['subchunk_id']
                subchunks.extend([s_chunk] * num_sent)
                
            rephrased_sentences.extend(rephrased)

            if subchunk:
                raw_df = pd.DataFrame({
                    'doc_id': doc_ids,
                    'chunk_id': chunks,
                    'subchunk_id': subchunks,
                    'rephrased': rephrased_sentences
                })

                filtered_raw_df = raw_df[(raw_df['doc_id'] == doc_id)  &
                    (raw_df['chunk_id'] == chunk) &
                    (raw_df['subchunk_id'] == s_chunk)]
                
                temp_loc = f"{base_save_loc}temp.jsonl"
                google_drive_name = f"doc_{doc}_chunk_{chunk}_subchunk_{s_chunk}.jsonl"

            else:
                raw_df = pd.DataFrame({
                    'doc_id': doc_ids,
                    'chunk_id': chunks,
                    'rephrased': rephrased_sentences
                })

                filtered_raw_df = raw_df[(raw_df['doc_id'] == doc_id)  &
                    (raw_df['chunk_id'] == chunk)]
                
                temp_loc = f"{base_save_loc}temp.jsonl"
                google_drive_name = f"doc_{doc}_chunk_{chunk}.jsonl"

            try:
                save_as_jsonl(filtered_raw_df, temp_loc)
                upload_file(service, google_drive_name, temp_loc, parent_folder_id=raw_folder_id)
            except:
                pass
                
        if subchunk:
            # Construct DataFrame
            result_df = pd.DataFrame({
            'doc_id': doc_ids,
            'chunk_id': chunks,
            'subchunk_id': subchunks,
            'rephrased': rephrased_sentences
            })

            merged_df = result_df.merge(filtered_df, left_on=['doc_id', 'chunk_id', 'subchunk_id'],
                                        right_on=['id', 'chunk_id', 'subchunk_id'], how='left')

            selected_df = merged_df[['doc_id', 'chunk_id', 'subchunk_id', 'author', 'topic', 'text', 'rephrased']]
            
        else:
            # Construct DataFrame
            result_df = pd.DataFrame({
            'doc_id': doc_ids,
            'chunk_id': chunks,
            'rephrased': rephrased_sentences
            })

            merged_df = result_df.merge(filtered_df, left_on=['doc_id', 'chunk_id'],
                                        right_on=['id', 'chunk_id'], how='left')
            selected_df = merged_df[['doc_id', 'chunk_id', 'author', 'topic', 'text', 'rephrased']]

        # Filter out rows where text == rephrased
        filtered_df = selected_df[selected_df['text'] != selected_df['rephrased']]

        filtered_df['input_length'] = filtered_df['text'].apply(len)
        filtered_df['output_length'] = filtered_df['rephrased'].apply(len)

        save_as_jsonl(filtered_df, save_loc)
        
        end_time = time.time()
        time_taken_mins = (end_time - start_time) / 100
        time_taken_hours = round(time_taken_mins / 60, 2)
        
        print(f"Doc: {doc} Saved. Time Taken: {time_taken_hours} hours")

In [30]:
# paraphrase_df_save(df, base_save_loc = "../data/guardian_phi/")

In [31]:
paraphrase_df_save(filtered_df, base_save_loc = "../data/guardian_phi/")

Doc: 7 - Chunk: 1 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
  Iteration: 6
    Attempt 1 failed: invalid syntax. Perhaps you forgot a comma? (<string>, line 13)
  Iteration: 7
  Iteration: 8
  Iteration: 9
  Iteration: 10
Doc: 7 - Chunk: 2 out of 44
  Iteration: 1
  Iteration: 2
    Attempt 1 failed: invalid syntax (<string>, line 3)
  Iteration: 3
    Attempt 1 failed: invalid syntax (<string>, line 2)
    Attempt 2 failed: invalid syntax (<string>, line 3)
    Attempt 3 failed: invalid syntax (<string>, line 3)
3 Attempts Exceeded, Next Iteration.
Doc: 7 - Chunk: 3 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
  Iteration: 6
    Attempt 1 failed: unterminated string literal (detected at line 1) (<string>, line 1)
  Iteration: 7
  Iteration: 8
  Iteration: 9
    Attempt 1 failed: invalid syntax (<string>, line 3)
  Iteration: 10
Doc: 7 - Chunk: 4 out of 44
  Iteration: 1
  Iteration: 2
    Attempt 1 fail



    Attempt 1 failed: list indices must be integers or slices, not str
  Iteration: 7
  Iteration: 8
  Iteration: 9
  Iteration: 10
Doc: 7 - Chunk: 19 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
  Iteration: 6
  Iteration: 7
  Iteration: 8
  Iteration: 9
  Iteration: 10
Doc: 7 - Chunk: 20 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
  Iteration: 6
  Iteration: 7
    Attempt 1 failed: invalid syntax (<string>, line 10)
  Iteration: 8
    Attempt 1 failed: unterminated string literal (detected at line 1) (<string>, line 1)
  Iteration: 9
  Iteration: 10
Doc: 7 - Chunk: 21 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
    Attempt 1 failed: invalid syntax (<string>, line 3)
    Attempt 2 failed: invalid syntax. Perhaps you forgot a comma? (<string>, line 6)
  Iteration: 4
  Iteration: 5
    Attempt 1 failed: invalid syntax. Perhaps you forgot a comma? (<string>, line 15)
  Iteration: 6
  Iteration: 7




    Attempt 2 failed: list indices must be integers or slices, not str
  Iteration: 2
  Iteration: 3
    Attempt 1 failed: unexpected character after line continuation character (<string>, line 1)




    Attempt 2 failed: list indices must be integers or slices, not str
    Attempt 3 failed: unterminated string literal (detected at line 1) (<string>, line 1)
3 Attempts Exceeded, Next Iteration.
Doc: 7 - Chunk: 43 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
  Iteration: 6
  Iteration: 7
  Iteration: 8
  Iteration: 9
    Attempt 1 failed: unmatched ']' (<string>, line 11)
  Iteration: 10
Doc: 7 - Chunk: 44 out of 44
  Iteration: 1
  Iteration: 2
  Iteration: 3
  Iteration: 4
  Iteration: 5
    Attempt 1 failed: unterminated string literal (detected at line 6) (<string>, line 6)
  Iteration: 6
    Attempt 1 failed: unterminated string literal (detected at line 1) (<string>, line 1)
  Iteration: 7
    Attempt 1 failed: unterminated string literal (detected at line 1) (<string>, line 1)
    Attempt 2 failed: unterminated string literal (detected at line 1) (<string>, line 1)
    Attempt 3 failed: unterminated string literal (detected at line 1) (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['input_length'] = filtered_df['text'].apply(len)


TypeError: object of type 'ellipsis' has no len()