In [18]:
import ast
import json
import os
import time

import pandas as pd
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from llama_cpp import Llama

In [5]:
%run "read_and_write_docs.py"

In [6]:
%run "./google_drive.ipynb"

doc_28_chunk_56_subchunk_0.jsonl
    File with ID '1kbJx0UJldMz3jtNKQA_8hTrey_FJmg77' deleted successfully.
doc_28_chunk_58_subchunk_0.jsonl
    File with ID '17hpXTzZ-CBw2WpbTCAOyh-iiIDY6ZDoY' deleted successfully.
doc_28_chunk_59_subchunk_0.jsonl
    File with ID '17a5tPrY7rA0pAbfoVZnEB3TM4ryQU3SG' deleted successfully.
doc_28_chunk_60_subchunk_0.jsonl
    File with ID '15Ug5Q6araf-DcJss8Nx2T-JCS8vmGRo0' deleted successfully.
doc_28_chunk_61_subchunk_0.jsonl
    File with ID '191jdHLusfsYT43xxPw17tsHpnkP4rJ-D' deleted successfully.
doc_28_chunk_63_subchunk_0.jsonl
    File with ID '1o6vxYqDeA23n4ucogd5AynPPX_6q2fqG' deleted successfully.
doc_28_chunk_64_subchunk_0.jsonl
    File with ID '1Pndrm15s7Aoa4VIOdwtK3mg_k4iT93Kx' deleted successfully.
doc_28_chunk_65_subchunk_0.jsonl
    File with ID '1UzwWnHPM4j4UBJIhpY97RsH_lemM_JTD' deleted successfully.
doc_28_chunk_66_subchunk_0.jsonl
    File with ID '1w0kfV08kf88l5j9sw98Xa6VPQ4zMg8wf' deleted successfully.
doc_28_chunk_67_subchunk_0.j

In [7]:
def get_raw_id(service, main_folder_name):
    """Return the Google Drive id of the 'raw' folder inside the main folder.

    Args:
        service: Drive service handle, passed straight through to the Drive
            helper functions.
        main_folder_name: Name of the top-level Drive folder to search in.

    Returns:
        The value stored at column position 2 of the first listing row whose
        name is 'raw' (presumably the Drive id column - TODO confirm against
        the frame returned by split_name_column).

    Raises:
        IndexError: If the listing contains no folder named 'raw'.
    """
    # List everything under the main folder as a table.
    main_folder_id = search_file_by_name(service, main_folder_name)['id']
    files_data = get_files_in_folder_recursive(service, main_folder_id, main_folder_name)
    files_data = split_name_column(files_data)

    # Only folder entries are candidates.
    folders = files_data[files_data['is_folder'] == True]
    raw_folders = folders[folders['name'] == 'raw']

    # Fail with a readable message instead of pandas' generic
    # "single positional indexer is out-of-bounds".
    if raw_folders.empty:
        raise IndexError(
            f"No folder named 'raw' was found inside Drive folder '{main_folder_name}'"
        )

    return raw_folders.iloc[0, 2]

In [8]:
# Instruction text for the model: asks for a Python-list of paraphrases (at least
# twenty) and shows one worked example. The {original_user_supplied_sentence}
# placeholder appears at the end of this text as well as in the human message below.
system_prompt = """
Given the sentence, generate as many paraphrased sentences as possible while preserving the original semantic meaning and style. 
Return the rephrased sentences in a python list format. Aim for AT LEAST TWENTY sentences. DO NOT INCLUDE ANY NOTES OR ADDITIONAL TEXT IN THE OUTPUT.

An example is below:
--------
Sentence: ```"Known for being very delicate, the skill could take a lifetime to master."```

Rephrased Sentences: ```["The skill is well known for its delicacy and could require a lifetime to perfect.", "The skill's reputation for delicateness suggests that it could take a whole lifetime to master.", "It may take a lifetime to master the skill, which is renowned for its delicacy.", "The delicacy of the skill means it could take a lifetime to master."]```
--------
Sentence: ```{original_user_supplied_sentence}```
"""

# Two-message chat template: the instructions as the system message, followed by a
# human message that consists solely of the sentence placeholder.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{original_user_supplied_sentence}"),
    ]
)

In [9]:
def convert_to_phi(original_sentence,
                   prompt_input=final_prompt):
    """Render a chat prompt template as a single Phi-style tagged prompt string.

    Each message template in ``prompt_input.messages`` becomes one tagged turn
    terminated by ``<|end|>``; a trailing ``<|assistant|>`` tag is appended so
    the model continues from there. Every occurrence of the
    ``{original_user_supplied_sentence}`` placeholder (in the system text as
    well as in the human turn) is replaced by ``original_sentence``.

    Args:
        original_sentence: Text to paraphrase.
        prompt_input: Chat prompt template whose messages are rendered.

    Returns:
        The formatted prompt string.

    Notes:
        * Newlines are removed from system and human templates.
        * NOTE(review): the system message is emitted under the
          ``<|assistant|>`` tag, as in the original code - confirm this is the
          intended role tag for the model in use.
        * For few-shot messages only the first example is rendered, and its
          assistant turn is the string form of the whole example dict.
    """
    placeholder = "{original_user_supplied_sentence}"
    turns = []

    for message in prompt_input.messages:
        # Text is prepared outside the f-strings: a backslash inside an
        # f-string expression is a syntax error before Python 3.12.
        if isinstance(message, SystemMessagePromptTemplate):
            system_text = message.prompt.template.replace('\n', '')
            turns.append(f"<|assistant|>\n{system_text} <|end|>\n")
        elif isinstance(message, FewShotChatMessagePromptTemplate):
            example = message.examples[0]
            example_sentence = example['original_user_supplied_sentence'].replace('\n', '')
            turns.append(f"<|user|>\n{example_sentence} <|end|>\n")
            turns.append(f"<|assistant|>\n{example} <|end|>\n")
        elif isinstance(message, HumanMessagePromptTemplate):
            human_text = message.prompt.template.replace('\n', '')
            turns.append(f"<|user|>\n{human_text} <|end|>\n")

    # Open the assistant turn that the model is expected to complete.
    turns.append("<|assistant|>")

    # Fill the placeholder everywhere, so the model never sees the literal
    # "{original_user_supplied_sentence}" text.
    return "".join(turns).replace(placeholder, original_sentence)
    

In [10]:
# Load the quantised Phi-3 model. Set the PHI3_MODEL_PATH environment variable to
# point at the GGUF file on another machine; without it the original local path
# is used, so existing behaviour is unchanged.
llm = Llama(
    model_path=os.environ.get(
        "PHI3_MODEL_PATH",
        "C:/Users/benjc/Documents/models/Phi-3-mini-4k-instruct-q4.gguf",
    ),
    n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
    n_threads=10, # Maximum I have is 12
    n_gpu_layers=-1, # The number of layers to offload to GPU, if you have GPU acceleration available.
    verbose=False,
    flash_attn=True
)

In [11]:
def _extract_list_literal(output_text):
    """Parse the span from the first '[' to the last ']' as a Python list literal."""
    start_index = output_text.find('[')
    end_index = output_text.rfind(']')
    if start_index == -1 or end_index <= start_index:
        raise ValueError("no bracketed list found in model output")

    # literal_eval only accepts literals, so model output is never executed as code.
    parsed = ast.literal_eval(output_text[start_index:end_index + 1])
    if not isinstance(parsed, list):
        raise ValueError("model output did not parse to a list")
    return parsed


def phi_paraphrase(original_sentence,
                   n_runs=10,
                   llm=llm,
                   prompt_input=final_prompt):
    """Collect distinct paraphrases of a sentence by prompting the model repeatedly.

    The prompt is sent up to ``n_runs`` times. Each run gets up to three
    attempts to produce output that parses as a Python list; new string items
    are appended to the result. Sampling stops early as soon as a run adds
    nothing new.

    NOTE(review): a run whose three attempts all fail also adds nothing new, so
    it ends the sampling (matching the original behaviour) even though the
    printed message says "Next Iteration".

    Args:
        original_sentence: Sentence (or text chunk) to paraphrase.
        n_runs: Maximum number of sampling runs.
        llm: Callable model; called with the prompt, ``max_tokens``, ``stop``
            and ``temperature``, and expected to return a dict with
            ``['choices'][0]['text']``.
        prompt_input: Chat prompt template passed on to ``convert_to_phi``.

    Returns:
        List starting with ``original_sentence`` followed by the unique
        paraphrases in the order they were first produced.
    """
    # Honour the caller's template instead of always using the global one.
    formatted_prompt = convert_to_phi(original_sentence, prompt_input=prompt_input)

    sentences = [original_sentence]
    seen = {original_sentence}  # set mirror of `sentences` for fast duplicate checks
    max_attempts = 3

    for i in range(1, n_runs + 1):
        print(f'  Iteration: {i}')

        result_list = []
        for attempt in range(1, max_attempts + 1):
            try:
                output = llm(formatted_prompt, max_tokens=1000, stop=["<|end|>"],
                             temperature=1)
                result_list = _extract_list_literal(output['choices'][0]['text'])
                break
            except Exception as e:
                print(f'    Attempt {attempt} failed: {e}')
        else:
            # Every attempt raised; result_list stays empty.
            print("3 Attempts Exceeded, Next Iteration.")

        new_sentence_amount = 0
        for result in result_list:
            # Ignore anything that is not a plain string (e.g. nested lists).
            if isinstance(result, str) and result not in seen:
                seen.add(result)
                sentences.append(result)
                new_sentence_amount += 1

        # Stop once a run contributes nothing new.
        if new_sentence_amount == 0:
            break

    return sentences

In [12]:
def paraphrase_df(df, *args):
    """Paraphrase the 'text' of every row and return the results in long format.

    Args:
        df: DataFrame with 'id', 'chunk_id' and 'text' columns.
        *args: Extra positional arguments forwarded to ``phi_paraphrase``
            after the sentence.

    Returns:
        DataFrame with columns 'doc_id', 'chunk_id' and 'sentence', holding one
        row per sentence returned by ``phi_paraphrase`` for each input row.
    """
    doc_ids = []
    chunks = []
    rephrased_sentences = []

    n_rows = len(df)

    # Count rows by position: the index labels are not 0..n-1 once the frame
    # has been filtered, so "index + 1" would report the wrong progress.
    for row_num, (_, row) in enumerate(df.iterrows(), start=1):
        print(f'Row {row_num} out of {n_rows}')

        rephrased = phi_paraphrase(row['text'], *args)
        num_sent = len(rephrased)

        # Repeat the identifiers once per returned sentence.
        doc_ids.extend([row['id']] * num_sent)
        chunks.extend([row['chunk_id']] * num_sent)
        rephrased_sentences.extend(rephrased)

    return pd.DataFrame({
        'doc_id': doc_ids,
        'chunk_id': chunks,
        'sentence': rephrased_sentences
    })

In [13]:
# Load the chunked Guardian corpus into a DataFrame. read_jsonl_file is not defined
# in this notebook - presumably it comes from the %run of read_and_write_docs.py (confirm).
df = read_jsonl_file('../data/guardian_chunked_impostor.jsonl')

In [None]:
# filtered_df = df[(df['id'] >= 19)]

In [None]:
# filtered_df = df[(df['id'] == 18) & (df['chunk_id'] >= 46)]

In [17]:
# Display the loaded corpus table to inspect its columns.
df

Unnamed: 0,index,id,chunk_id,author,topic,word_count,subchunk_id,input_length,text
0,0,1,0,12,4,259,0,1435,"DBC Pierre, Booker Prize-winner and author of ..."
1,1,1,1,12,4,271,0,1471,The first rule of interviewing people in Irela...
2,2,1,2,12,4,293,0,1541,It irks me when these literary bad boys (espec...
3,3,1,3,12,4,256,0,1315,It irks me still further when they wont admit ...
4,4,1,4,12,4,252,0,1291,"No, no, no, says Dirty But Clean (thats what h..."
...,...,...,...,...,...,...,...,...,...
2512,3226,62,31,9,4,295,0,1775,"But although, as this biography makes clear, F..."
2513,3227,62,32,9,4,264,0,1589,"And that, I suspect, is true. He was for the p..."
2514,3228,62,33,9,4,258,0,1559,"He was for the pen, not the sword, and believe..."
2515,3229,62,34,9,4,267,0,1636,"Foot was the greatest polemicist of his day, w..."


In [23]:
# Count rows per document id and rank documents from fewest to most rows.
grouped_df = df.groupby('id').size().reset_index(name='count')
grouped_df = grouped_df.sort_values(by='count', ascending=True)

# Keep the (up to) ten documents around the median size. The start is clamped at 0:
# with fewer than ten documents a negative start would wrap around and select rows
# from the end of the ranking instead.
middle = len(grouped_df) // 2
grouped_df = grouped_df.iloc[max(middle - 5, 0):middle + 5]
sample_list = grouped_df['id'].tolist()

In [24]:
# Show the document ids selected for the sample.
sample_list

[40, 7, 26, 43, 32, 42, 62, 54, 50, 31]

In [26]:
# Keep only the rows that belong to the sampled document ids.
filtered_df = df[df['id'].isin(sample_list)]

In [27]:
# Display the sampled subset of the corpus.
filtered_df

Unnamed: 0,index,id,chunk_id,author,topic,word_count,subchunk_id,input_length,text
657,735,7,0,4,4,265,0,1487,﻿That Christopher Meyers memoirs have been as ...
658,736,7,1,4,4,283,0,1581,"But, read in the round, how does the book as a..."
659,737,7,2,4,4,269,0,1511,My answer is that Meyers book is both better a...
660,738,7,3,4,4,253,0,1409,"It is better because, contrary to the implicat..."
661,739,7,4,4,4,266,0,1500,Meyers account of these events is painted on a...
...,...,...,...,...,...,...,...,...,...
2512,3226,62,31,9,4,295,0,1775,"But although, as this biography makes clear, F..."
2513,3227,62,32,9,4,264,0,1589,"And that, I suspect, is true. He was for the p..."
2514,3228,62,33,9,4,258,0,1559,"He was for the pen, not the sword, and believe..."
2515,3229,62,34,9,4,267,0,1636,"Foot was the greatest polemicist of his day, w..."


In [15]:
# Show the value at column position 8 (the 'text' column, going by the table
# displayed above) for the first row of filtered_df.
# NOTE(review): this cell's execution count (15) is lower than that of the cell that
# builds filtered_df (26), so the recorded output appears to come from an earlier
# filtered_df - re-run after a kernel restart to refresh it.
filtered_df.iloc[0,8]

'DBC Pierre, Booker Prize-winner and author of Vernon God Little, has moved to Ireland. The first rule of interviewing people in Ireland, before Wear thermal pants and Remember some euros, is this: if theyre a writer, an artist or a composer, theyre there for the tax break. It irks me when these literary bad boys (especially Michel Houellebecq, who used to call himself a sodding communist) uproot themselves just, it seems fair to assume, to avoid a basic system of sharing stuff that your average apolitical cabbie seems able to manage. It irks me still further when they wont admit it. No, no, no, says Dirty But Clean (thats what his initials stand for). No. In the first instance, it was because I was in south London, and with the advance from Vernon I thought I should make something of it. I did, slowly, pay back my creditors. But I had to move somewhere I could afford to buy a place, and not fritter the rest of it in Soho. Not tax-related at all? No. Also, in Balham, the property boom 

In [16]:
# Show the value at column position 8 for the second row of filtered_df.
# NOTE(review): execution count 16 predates the cell that builds filtered_df (26),
# so the recorded output may be stale - re-run to refresh it.
filtered_df.iloc[1,8]

'The first rule of interviewing people in Ireland, before Wear thermal pants and Remember some euros, is this: if theyre a writer, an artist or a composer, theyre there for the tax break. It irks me when these literary bad boys (especially Michel Houellebecq, who used to call himself a sodding communist) uproot themselves just, it seems fair to assume, to avoid a basic system of sharing stuff that your average apolitical cabbie seems able to manage. It irks me still further when they wont admit it. No, no, no, says Dirty But Clean (thats what his initials stand for). No. In the first instance, it was because I was in south London, and with the advance from Vernon I thought I should make something of it. I did, slowly, pay back my creditors. But I had to move somewhere I could afford to buy a place, and not fritter the rest of it in Soho. Not tax-related at all? No. Also, in Balham, the property boom spread from Clapham, and people started getting really uptight. The neighbours started 

In [28]:
def paraphrase_df_save(df, base_save_loc, google_drive_main_folder, *args):
    """Paraphrase every row of ``df`` and upload one JSONL file per row to Google Drive.

    For each row the text is paraphrased with ``phi_paraphrase``, the result is
    written to a local temporary file (overwritten on every row) and that file
    is uploaded into the 'raw' folder of ``google_drive_main_folder``.

    Args:
        df: DataFrame with 'id', 'chunk_id' and 'text' columns, and optionally
            'subchunk_id'.
        base_save_loc: Local directory for the temporary ``temp.jsonl`` file.
            It is created if missing; a trailing separator is optional.
        google_drive_main_folder: Name of the Drive folder that contains the
            'raw' upload folder.
        *args: Extra positional arguments forwarded to ``phi_paraphrase``
            after the sentence.

    Returns:
        None. Uploaded files are named ``doc_<id>_chunk_<chunk>.jsonl`` or
        ``doc_<id>_chunk_<chunk>_subchunk_<subchunk>.jsonl`` and hold the
        columns 'doc_id', 'chunk_id', ['subchunk_id',] 'rephrased'.
    """
    # Connect to Google Drive and locate the upload folder.
    service = connect_to_drive()
    raw_folder_id = get_raw_id(service, google_drive_main_folder)

    has_subchunks = 'subchunk_id' in df.columns

    # Join the temp path properly so a base location without a trailing
    # separator still points inside that directory.
    if base_save_loc:
        os.makedirs(base_save_loc, exist_ok=True)
    temp_loc = os.path.join(base_save_loc, "temp.jsonl")

    for doc in df['id'].unique().tolist():
        doc_rows = df[df['id'] == doc]
        # Highest chunk id of this document; only used for the progress message.
        max_chunk = doc_rows['chunk_id'].max()

        for _, row in doc_rows.iterrows():
            doc_id = row['id']
            chunk = row['chunk_id']

            print(f'Doc: {doc_id} - Chunk: {chunk + 1} out of {max_chunk + 1}')

            rephrased = phi_paraphrase(row['text'], *args)
            num_sent = len(rephrased)

            # One output row per sentence, with the identifiers repeated.
            columns = {
                'doc_id': [doc_id] * num_sent,
                'chunk_id': [chunk] * num_sent,
            }
            google_drive_name = f"doc_{doc}_chunk_{chunk}"

            if has_subchunks:
                s_chunk = row['subchunk_id']
                columns['subchunk_id'] = [s_chunk] * num_sent
                google_drive_name += f"_subchunk_{s_chunk}"

            columns['rephrased'] = rephrased
            google_drive_name += ".jsonl"

            raw_df = pd.DataFrame(columns)

            # Best effort: one failed save/upload must not stop the whole run,
            # but it is reported rather than silently dropped. Catching
            # Exception (not everything) lets Ctrl-C interrupt the loop.
            try:
                save_as_jsonl(raw_df, temp_loc)
                upload_file(service, google_drive_name, temp_loc, parent_folder_id=raw_folder_id)
            except Exception as exc:
                print(f'  Could not save/upload {google_drive_name}: {exc}')

In [None]:
# paraphrase_df_save(filtered_df, base_save_loc = "../data/guardian_phi/", google_drive_main_folder="guardian_phi")

In [29]:
# Paraphrase every chunk of the sampled documents and upload the results to the
# 'raw' folder of the "guardian_phi_chunked" Drive folder. This issues many model
# calls; the run recorded below ended with a KeyboardInterrupt.
paraphrase_df_save(filtered_df, base_save_loc="../data/guardian_phi_chunked", google_drive_main_folder="guardian_phi_chunked")

Doc: 7 - Chunk: 1 out of 34
  Iteration: 1
    Attempt 1 failed: invalid syntax (<string>, line 2)
    Attempt 2 failed: invalid syntax (<string>, line 3)
    Attempt 3 failed: invalid syntax (<string>, line 3)
3 Attempts Exceeded, Next Iteration.
Doc: 7 - Chunk: 2 out of 34
  Iteration: 1
    Attempt 1 failed: invalid syntax (<string>, line 3)
  Iteration: 2
    Attempt 1 failed: invalid syntax (<string>, line 4)
  Iteration: 3
    Attempt 1 failed: invalid syntax (<string>, line 3)
    Attempt 2 failed: unterminated string literal (detected at line 21) (<string>, line 21)
  Iteration: 4
  Iteration: 5
    Attempt 1 failed: invalid syntax (<string>, line 3)
    Attempt 2 failed: invalid syntax (<string>, line 3)
    Attempt 3 failed: invalid syntax (<string>, line 3)
3 Attempts Exceeded, Next Iteration.
Doc: 7 - Chunk: 3 out of 34
  Iteration: 1
    Attempt 1 failed: invalid syntax (<string>, line 8)
    Attempt 2 failed: unterminated string literal (detected at line 1) (<string>, lin

KeyboardInterrupt: 