In [None]:
import pickle
import os.path
import re
import io
import json

import pandas as pd

from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [None]:
%run "./read_and_write_docs.py"

In [None]:
# Define the SCOPES. If modifying it, delete the token.pickle file.
SCOPES = ['https://www.googleapis.com/auth/drive']

In [None]:
def get_credentials():
    """Get user credentials or create new ones if needed."""
    creds = None

    # Check if file token.pickle exists
    if os.path.exists('token.pickle'):
        # Read the token from the file
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If no valid credentials are available, request the user to log in.
    if not creds or not creds.valid:
        # If token is expired, refresh it, else, request a new one.
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                '../google_credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the access token in token.pickle file for future usage
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds


def connect_to_drive():
    """Connect to the Google Drive API service."""
    # Get user credentials
    creds = get_credentials()
    # Connect to the API service
    service = build('drive', 'v3', credentials=creds)
    return service


def get_file_list(service, N):
    """Get a list of first N files or folders from Google Drive."""
    # Request a list of first N files or folders with name and id from the API.
    resource = service.files()
    result = resource.list(pageSize=N, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    # Return the result dictionary containing the information about the files
    return result

def search_file_by_name(service, file_name):
    """Search for a file by name and return its data."""
    # Define the query to search for the file by name
    query = f"name='{file_name}'"
    
    # Request files matching the search query
    resource = service.files()
    result = resource.list(q=query, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    
    # Extract the list of files from the result
    files = result.get('files', [])
    
    if files:
        # Return the data of the first file found
        return files[0]
    else:
        # If no file is found, return None
        return None

def get_files_in_folder(service, parent_folder_id):
    """Get a list of files within a specific parent folder, handling pagination."""
    files = []
    page_token = None

    while True:
        # Define the query to retrieve files within the parent folder
        query = f"'{parent_folder_id}' in parents"
        
        # Request files matching the query
        resource = service.files()
        result = resource.list(
            q=query, 
            fields="nextPageToken, files(id, kind, name, fullFileExtension, parents)",
            pageSize=1000, 
            pageToken=page_token
        ).execute()
        
        # Extract the list of files from the result
        items = result.get('files', [])
        files.extend(items)
        
        # Check if there are more pages
        page_token = result.get('nextPageToken')
        if not page_token:
            break

    return files

def get_files_in_folder_recursive(service, parent_folder_id, parent_folder_name=None):
    """Recursively get all files and folders within a parent folder."""

    # Initialize an empty list to store file and folder data
    files_data = []

    # Initialize a stack to store folders to be processed
    folders_to_process = [(parent_folder_id, parent_folder_name)]
    
    # Process folders until the stack is empty
    while folders_to_process:
        # Pop the last folder from the stack
        folder_id, folder_name = folders_to_process.pop()
        
        # Get a list of files and folders within the current folder
        files_in_folder = get_files_in_folder(service, folder_id)
        
        # Iterate through the list of files and folders
        for file_data in files_in_folder:
            # Append the file or folder data to the list
            files_data.append({
                'parent_folder_id': folder_id,
                'parent_folder_name': folder_name,
                'id': file_data['id'],
                'name': file_data['name'],
                'kind': file_data['kind'],
                'file_extension': file_data.get('fullFileExtension', ''),
                'is_folder': file_data.get('fullFileExtension') is None
            })
            
            # If the item is a folder, add it to the stack to be processed later
            if file_data.get('fullFileExtension') is None:
                subfolder_id = file_data['id']
                subfolder_name = file_data['name']
                folders_to_process.append((subfolder_id, subfolder_name))

    files_df = pd.DataFrame(files_data)
    
    return files_df

def split_name_column(df):
    """Function to extract doc, chunk, and subchunk"""
    
    def extract_parts(name):
        """Extract the doc, chunk, and subchunk"""
        
        match = re.match(r'doc_(\d+)_chunk_(\d+)(?:_subchunk_(\d+))?.jsonl', name)
        if match:
            doc = int(match.group(1))
            chunk = int(match.group(2))
            subchunk = int(match.group(3)) if match.group(3) else 0
            return pd.Series([doc, chunk, subchunk])
        else:
            return pd.Series([None, None, None])

    # Apply the function to the 'name' column
    df[['doc', 'chunk', 'subchunk']] = df['name'].apply(extract_parts)

    df = df.sort_values(by=['doc', 'chunk', 'subchunk'])
    
    return df

def read_google_drive_jsonl(service, file_id):
    """Read the document with the designated file_id"""
    
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    
    done = False
    while not done:
        status, done = downloader.next_chunk()
    
    # Move the cursor to the beginning of the BytesIO object
    fh.seek(0)
    
    # Read the file content into memory
    jsonl_content = fh.read().decode('utf-8')
    
    # Split the content into individual JSON lines
    json_lines = jsonl_content.strip().split('\n')
    
    # Parse each line as a JSON object
    json_objects = [json.loads(line) for line in json_lines]
    
    json_df = pd.DataFrame(json_objects)

    return json_df

def move_file_to_folder(service, file_id, folder_id):
    """Move specified file to the specified folder.
        Args:
            service: service client
            file_id: Id of the file to move.
            folder_id: Id of the folder
    """
    try:
        # Retrieve the existing parents to remove
        file = service.files().get(fileId=file_id, fields="parents").execute()
        previous_parents = ",".join(file.get("parents"))
        # Move the file to the new folder
        file = (
            service.files()
            .update(
                fileId=file_id,
                addParents=folder_id,
                removeParents=previous_parents,
                fields="id, parents",
            )
            .execute()
        )
    except HttpError as error:
        print(f"An error occurred: {error}")

def upload_file(service, file_name, file_path, parent_folder_id=None):
    """Upload a file to Google Drive."""
    file_metadata = {
        'name': file_name
    }
    if parent_folder_id:
        file_metadata['parents'] = [parent_folder_id]

    media = MediaFileUpload(file_path, resumable=True)

    file = service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id, name, parents'
    ).execute()

def delete_file(service, file_id):
    """Delete a file from Google Drive."""
    try:
        service.files().delete(fileId=file_id).execute()
        print(f"    File with ID '{file_id}' deleted successfully.")
    except Exception as e:
        print(f"    An error occurred: {e}")

def read_raw_df(service, df, error_folder_id):

    df = df[df['file_extension'] == 'jsonl']
    num_rows = df.shape[0]
    columns = ['doc_id', 'chunk_id', 'subchunk_id', 'rephrased']
    result_df = pd.DataFrame(columns=columns)
    
    for i in range(0, num_rows):
        doc_id = df.iloc[i, 2]
        doc_num = df.iloc[i, 7]
        chunk_num = df.iloc[i, 8]
        subchunk_num = df.iloc[i, 9]

        print(df.iloc[i, 3])
        try:
            doc_data = read_google_drive_jsonl(service, doc_id)
        
            if 'subchunk_id' not in doc_data.columns:
                # Get the location of 'chunk_id' and insert dummy subchunk
                chunk_index = doc_data.columns.get_loc('chunk_id')
                doc_data.insert(chunk_index + 1, 'subchunk_id', 0)
    
            # Filter the data to make sure we don't duplicate in iterations
            filtered_data = doc_data[(doc_data['doc_id'] == doc_num) &
                (doc_data['chunk_id'] == chunk_num) &
                (doc_data['subchunk_id'] == subchunk_num)]

            delete_file(service, doc_id)
            
        except:
            print("Parsing Error")
            move_file_to_folder(service, doc_id, error_folder_id)

        result_df = pd.concat([result_df, filtered_data], ignore_index=True)

    return result_df

def update_files(main_folder_name, save_location, filename="rephrased.jsonl"):
    """Update the files in the google drive with any new raw files.
    
        Args:
            main_folder_name: The folder which contains the error, raw and processed folders
            save_location: The local save location.
            filename: The filename on Google Drive
            
    """

    # Connect to the google drive location
    service = connect_to_drive()

    # Get the files info from within the main folder
    main_folder_id = search_file_by_name(service, main_folder_name)['id']
    files_data = get_files_in_folder_recursive(service, main_folder_id, main_folder_name)
    files_data = split_name_column(files_data)

    # Get the folder data from within the file data
    folders = files_data[files_data['is_folder'] == True]

    # Split the data into the raw, error, and processed data
    raw_id = folders[folders['name'] == 'raw'].iloc[0,2]
    error_id = folders[folders['name'] == 'errors'].iloc[0,2]
    processed_id = folders[folders['name'] == 'processed'].iloc[0,2]

    # Return only the data in the raw folder
    raw_files = files_data[(files_data['parent_folder_id'] == raw_id) &
        (files_data['file_extension'] == 'jsonl')]

    # Run the code to pull new data
    old_data = read_jsonl_file(save_location)
    new_data = read_raw_df(service, raw_files, error_id)
    new_rows = new_data.shape[0]

    # If we have new rows then save to local location and Google Drive
    if new_rows > 0:
        print("New Rows Added")
        result_df = pd.concat([old_data, new_data], ignore_index=True)
    
        save_as_jsonl(result_df, save_location)
        upload_file(service, filename, save_location, parent_folder_id=processed_id)
    else:
        print("No New Data")

def delete_trash():
    """Delete all items in the trash permenantly"""
    service = connect_to_drive()
    service.files().emptyTrash().execute()

In [None]:
# update_files('guardian_phi', "../data/guardian_phi/rephrased.jsonl")

In [6]:
update_files('guardian_phi_chunked', "../data/guardian_phi_chunked/rephrased.jsonl")

FileNotFoundError: [Errno 2] No such file or directory: '../data/guardian_phi_chunked/rephrased.jsonl'