In [None]:
import ast
import json
import time

import pandas as pd
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from llama_cpp import Llama

In [None]:
%run "read_and_write_docs.py"

In [None]:
%run "./google_drive.ipynb"

In [None]:
def get_raw_id(service, main_folder_name):
    """Look up the 'raw' sub-folder inside the named main folder on Google Drive.

    Args:
        service: Drive service handle passed through to the Drive helpers.
        main_folder_name: Name of the top-level folder to search in.

    Returns:
        The value at column position 2 of the first folder row named 'raw'
        (presumably the Drive id column -- confirm against the output of
        ``split_name_column``).

    Raises:
        IndexError: If the listing contains no folder named 'raw'.
    """
    # Resolve the main folder, then list everything beneath it.
    main_folder = search_file_by_name(service, main_folder_name)
    listing = split_name_column(
        get_files_in_folder_recursive(service, main_folder['id'], main_folder_name)
    )

    # Keep only entries that are folders AND are called 'raw'.
    is_raw_folder = (listing['is_folder'] == True) & (listing['name'] == 'raw')

    return listing[is_raw_folder].iloc[0, 2]

In [None]:
# Instruction text for the model: asks for a Python list of paraphrases and
# includes one worked example. The {original_user_supplied_sentence} placeholder
# marks where the sentence to paraphrase belongs.
system_prompt = """
Given the sentence, generate as many paraphrased sentences as possible while preserving the original semantic meaning and style. 
Return the rephrased sentences in a python list format. Aim for AT LEAST TWENTY sentences. DO NOT INCLUDE ANY NOTES OR ADDITIONAL TEXT IN THE OUTPUT.

An example is below:
--------
Sentence: ```"Known for being very delicate, the skill could take a lifetime to master."```

Rephrased Sentences: ```["The skill is well known for its delicacy and could require a lifetime to perfect.", "The skill's reputation for delicateness suggests that it could take a whole lifetime to master.", "It may take a lifetime to master the skill, which is renowned for its delicacy.", "The delicacy of the skill means it could take a lifetime to master."]```
--------
Sentence: ```{original_user_supplied_sentence}```
"""

# Two-message chat template: the instruction text as the system message and the
# bare sentence placeholder as the human message.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{original_user_supplied_sentence}"),
    ]
)

In [None]:
def convert_to_phi(original_sentence,
                   prompt_input=final_prompt):
    """Render a LangChain chat prompt template as a Phi-3 style prompt string.

    Every message template in ``prompt_input.messages`` becomes one tagged turn
    (role tag, newline, the template text with its newlines removed, then
    ``<|end|>``). The string ends with an open ``<|assistant|>`` tag for the
    model to complete.

    Args:
        original_sentence: Text substituted for every
            ``{original_user_supplied_sentence}`` placeholder, in the system
            text as well as in the human turn.
        prompt_input: Object exposing a ``messages`` list of message prompt
            templates. Defaults to the module-level ``final_prompt``.

    Returns:
        str: The formatted prompt.
    """
    placeholder = "{original_user_supplied_sentence}"

    def _one_line(text):
        # Done outside the f-strings below: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        return text.replace("\n", "")

    turns = []

    for message in prompt_input.messages:
        if isinstance(message, SystemMessagePromptTemplate):
            # NOTE(review): the system text is emitted under the <|assistant|>
            # tag, exactly as before -- confirm this is the intended role tag
            # for the model in use.
            turns.append(f"<|assistant|>\n{_one_line(message.prompt.template)} <|end|>\n")
        elif isinstance(message, FewShotChatMessagePromptTemplate):
            # Only the first few-shot example is rendered.
            example = message.examples[0]
            turns.append(f"<|user|>\n{_one_line(example['original_user_supplied_sentence'])} <|end|>\n")
            turns.append(f"<|assistant|>\n{example} <|end|>\n")
        elif isinstance(message, HumanMessagePromptTemplate):
            turns.append(f"<|user|>\n{_one_line(message.prompt.template)} <|end|>\n")

    turns.append("<|assistant|>")

    # Fill the placeholder everywhere. Previously only the human turn was
    # substituted, so the system text reached the model with the literal
    # "{original_user_supplied_sentence}" still in it.
    return "".join(turns).replace(placeholder, str(original_sentence))
    

In [None]:
# Load the q4-quantised Phi-3 mini GGUF model through llama-cpp-python.
# NOTE(review): model_path is an absolute, machine-specific location; anyone
# else running this notebook has to edit it before this cell will work.
llm = Llama(
    model_path="C:/Users/benjc/Documents/models/Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
    n_threads=10, # Maximum I have is 12
    n_gpu_layers=-1, # The number of layers to offload to GPU, if you have GPU acceleration available.
    verbose=False,
    flash_attn=True
)

In [None]:
def _parse_paraphrase_list(output_text):
    """Parse the Python list literal embedded in raw model output.

    The text between the first '[' and the last ']' is treated as the body of
    a list literal.

    Raises:
        ValueError: If no bracketed span is present, or the span is not a
            plain literal.
        SyntaxError: If the span is not valid Python syntax.
    """
    start_index = output_text.find('[')
    end_index = output_text.rfind(']')
    if start_index == -1 or end_index <= start_index:
        raise ValueError("no bracketed list found in model output")

    content_str = output_text[start_index + 1:end_index]

    # Model output is untrusted text. ast.literal_eval accepts literals only,
    # whereas the eval() used previously would execute arbitrary expressions.
    return ast.literal_eval('[' + content_str + ']')


def phi_paraphrase(original_sentence,
                   n_runs=10,
                   llm=llm,
                   prompt_input=final_prompt,
                   max_attempts=3):
    """Collect distinct paraphrases of a sentence by sampling the model repeatedly.

    Args:
        original_sentence: Sentence to paraphrase; always the first element of
            the returned list.
        n_runs: Maximum number of sampling iterations.
        llm: Callable invoked as ``llm(prompt, max_tokens=..., stop=...,
            temperature=...)`` whose result exposes the generated text at
            ``['choices'][0]['text']``.
        prompt_input: Chat prompt template handed to ``convert_to_phi``.
        max_attempts: Number of generate-and-parse attempts per iteration
            before the iteration is given up.

    Returns:
        list: ``original_sentence`` followed by every distinct string
        paraphrase, in the order first seen.
    """
    # Honour the caller's template (the module-level final_prompt used to be
    # passed here regardless of the argument).
    formatted_prompt = convert_to_phi(original_sentence, prompt_input=prompt_input)

    sentences = [original_sentence]

    for i in range(1, n_runs + 1):
        print(f'  Iteration: {i}')

        result_list = []
        for attempt in range(1, max_attempts + 1):
            try:
                output = llm(formatted_prompt, max_tokens=1000, stop=["<|end|>"],
                             temperature=1)
                result_list = _parse_paraphrase_list(output['choices'][0]['text'])
                break
            except Exception as e:
                print(f'    Attempt {attempt} failed: {str(e)}')
        else:
            # Every attempt failed; result_list stays empty.
            print(f"{max_attempts} Attempts Exceeded, Next Iteration.")

        new_sentence_amount = 0
        for result in result_list:
            # Anything that is not a string cannot be a sentence; skip it.
            if isinstance(result, str) and result not in sentences:
                sentences.append(result)
                new_sentence_amount += 1

        # An iteration that adds nothing new ends the sampling early.
        if new_sentence_amount == 0:
            break

    return sentences

In [None]:
def paraphrase_df(df, *args):
    """Paraphrase the ``text`` of every row in ``df`` and stack the results.

    Args:
        df: DataFrame with ``id``, ``chunk_id`` and ``text`` columns.
        *args: Extra positional arguments forwarded to ``phi_paraphrase``
            after the sentence.

    Returns:
        pandas.DataFrame: One row per sentence returned by ``phi_paraphrase``,
        with columns ``doc_id``, ``chunk_id`` and ``sentence``.
    """
    doc_ids = []
    chunks = []
    rephrased_sentences = []

    n_rows = len(df)

    # Count rows positionally: the index labels of a filtered or re-indexed
    # frame need not start at 0 and need not even be numeric.
    for row_num, (_, row) in enumerate(df.iterrows(), start=1):
        print(f'Row {row_num} out of {n_rows}')
        doc_id = row['id']
        chunk = row['chunk_id']

        rephrased = phi_paraphrase(row['text'], *args)
        num_sent = len(rephrased)

        # Repeat the identifiers once per returned sentence
        doc_ids.extend([doc_id] * num_sent)
        chunks.extend([chunk] * num_sent)
        rephrased_sentences.extend(rephrased)

    # Construct DataFrame
    result_df = pd.DataFrame({
        'doc_id': doc_ids,
        'chunk_id': chunks,
        'sentence': rephrased_sentences
    })

    return result_df

In [None]:
# Load the preprocessed Guardian data. read_jsonl_file is not defined in this
# notebook -- presumably it comes from the %run of read_and_write_docs.py.
df = read_jsonl_file('../data/guardian_preprocessed.jsonl')

In [None]:
# Keep only the rows whose 'id' equals 7.
filtered_df = df[(df['id'] == 7)]

In [None]:
# filtered_df = df[(df['id'] == 2) & (df['chunk_id'] == 16)]

In [None]:
def paraphrase_df_save(df, base_save_loc, google_drive_main_folder, *args):
    """Paraphrase every row of ``df`` and upload one JSONL file per row to Google Drive.

    Each row's paraphrases are written to a local scratch file and then
    uploaded into the 'raw' folder of ``google_drive_main_folder``. A failed
    save or upload is reported and the remaining rows are still processed.

    Args:
        df: DataFrame with ``id``, ``chunk_id`` and ``text`` columns, and
            optionally ``subchunk_id``.
        base_save_loc: Prefix for the scratch file; ``temp.jsonl`` is appended
            directly, so a directory needs its trailing separator.
        google_drive_main_folder: Name of the Drive folder that contains the
            'raw' folder.
        *args: Extra positional arguments forwarded to ``phi_paraphrase``
            after the sentence.

    Returns:
        None
    """
    # Connect to my Google Drive
    service = connect_to_drive()

    # Get the id of the raw save location in chosen folder
    raw_folder_id = get_raw_id(service, google_drive_main_folder)

    has_subchunks = 'subchunk_id' in df.columns

    # Every row reuses the same local scratch file before upload.
    temp_loc = f"{base_save_loc}temp.jsonl"

    for doc in df['id'].unique().tolist():
        doc_rows = df[df['id'] == doc]
        max_chunk = doc_rows['chunk_id'].max()

        for _, row in doc_rows.iterrows():
            doc_id = row['id']
            chunk = row['chunk_id']

            print(f'Doc: {doc_id} - Chunk: {chunk + 1} out of {max_chunk + 1}')

            rephrased = phi_paraphrase(row['text'], *args)
            num_sent = len(rephrased)

            # Build the columns in output order; the identifiers are repeated
            # once per returned sentence.
            columns = {
                'doc_id': [doc_id] * num_sent,
                'chunk_id': [chunk] * num_sent,
            }
            google_drive_name = f"doc_{doc}_chunk_{chunk}"

            if has_subchunks:
                s_chunk = row['subchunk_id']
                columns['subchunk_id'] = [s_chunk] * num_sent
                google_drive_name += f"_subchunk_{s_chunk}"

            columns['rephrased'] = rephrased
            google_drive_name += ".jsonl"

            raw_df = pd.DataFrame(columns)

            try:
                save_as_jsonl(raw_df, temp_loc)
                upload_file(service, google_drive_name, temp_loc, parent_folder_id=raw_folder_id)
            except Exception as e:
                # Best effort: report the failure instead of hiding it, then
                # carry on with the next row.
                print(f'    Failed to save/upload {google_drive_name}: {e}')

In [None]:
# paraphrase_df_save(df, base_save_loc = "../data/guardian_phi/")