In [60]:
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Define the SCOPES. If modifying it,
# delete the token.pickle file.
SCOPES = ['https://www.googleapis.com/auth/drive']


def get_credentials():
    """Get user credentials or create new ones if needed."""
    creds = None

    # Check if file token.pickle exists
    if os.path.exists('token.pickle'):
        # Read the token from the file
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If no valid credentials are available, request the user to log in.
    if not creds or not creds.valid:
        # If token is expired, refresh it, else, request a new one.
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                '../google_credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the access token in token.pickle file for future usage
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds


def connect_to_drive():
    """Connect to the Google Drive API service."""
    # Get user credentials
    creds = get_credentials()
    # Connect to the API service
    service = build('drive', 'v3', credentials=creds)
    return service


def get_file_list(service, N):
    """Get a list of first N files or folders from Google Drive."""
    # Request a list of first N files or folders with name and id from the API.
    resource = service.files()
    result = resource.list(pageSize=N, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    # Return the result dictionary containing the information about the files
    return result

def search_file_by_name(service, file_name):
    """Search for a file by name and return its data."""
    # Define the query to search for the file by name
    query = f"name='{file_name}'"
    
    # Request files matching the search query
    resource = service.files()
    result = resource.list(q=query, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    
    # Extract the list of files from the result
    files = result.get('files', [])
    
    if files:
        # Return the data of the first file found
        return files[0]
    else:
        # If no file is found, return None
        return None

def get_files_in_folder(service, parent_folder_id):
    """Get a list of files within a specific parent folder."""
    # Define the query to retrieve files within the parent folder
    query = f"'{parent_folder_id}' in parents"
    
    # Request files matching the query
    resource = service.files()
    result = resource.list(q=query, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    
    # Extract the list of files from the result
    files = result.get('files', [])
    
    return files
    
def print_file_names(file_list):
    """Print names of files in the provided file list."""
    # Extract the list from the dictionary
    files = file_list.get('files')
    print(files)
    # Print every file's name
    for file in files:
        print(file['name'])

In [51]:
service = connect_to_drive()

In [52]:
result_dict = get_file_list(service, 500)

In [58]:
datasets_id = search_file_by_name(service, 'datasets')['id']

In [64]:
datasets_files = get_files_in_folder(service, datasets_id)

In [71]:
guardian_id = [file_data for file_data in datasets_files if file_data['name'] == 'guardian'][0]['id']

In [72]:
guardian_files = get_files_in_folder(service, guardian_id)

In [73]:
guardian_files

[{'kind': 'drive#file',
  'parents': ['1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5'],
  'id': '1woC5Xhs0kG44lAFTLPbiSimSTSiUfJEa',
  'name': 'guardian_preprocessed.jsonl',
  'fullFileExtension': 'jsonl'},
 {'kind': 'drive#file',
  'parents': ['1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5'],
  'id': '1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk',
  'name': 'guardian_phi'}]

In [53]:
print_file_names(result_dict)

[{'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '17K45HhGZdVrt9M--nr7HtFKCfra-S7_z', 'name': 'doc_1_chunk_181.jsonl', 'fullFileExtension': 'jsonl'}, {'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '1n4mlNZpXmqJYtn0gfyftjIH63VG7B51u', 'name': 'doc_1_chunk_157.jsonl', 'fullFileExtension': 'jsonl'}, {'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '132e4HjogHtEec8gdd13bU_5ZpjfsX9iB', 'name': 'doc_1_chunk_177.jsonl', 'fullFileExtension': 'jsonl'}, {'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '1TgaKG5InoSlblWPTbsLvvxPZMgboST9a', 'name': 'doc_1_chunk_156.jsonl', 'fullFileExtension': 'jsonl'}, {'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '11lnPvsxr0WIgrzovYSTcNGTvRemiTw1n', 'name': 'doc_1_chunk_165.jsonl', 'fullFileExtension': 'jsonl'}, {'kind': 'drive#file', 'parents': ['1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H'], 'id': '1UfLqAyHHPcPb4Q4nnP

In [77]:
import pandas as pd

def get_files_in_folder_recursive(service, parent_folder_id, parent_folder_name=None):
    """Recursively get all files and folders within a parent folder."""
    # Initialize an empty list to store file and folder data
    files_data = []
    
    # Get a list of files and folders within the parent folder
    files_in_folder = get_files_in_folder(service, parent_folder_id)
    
    # Iterate through the list of files and folders
    for file_data in files_in_folder:
        # Append the file or folder data to the list
        files_data.append({
            'Parent Folder ID': parent_folder_id,
            'Parent Folder Name': parent_folder_name,
            'ID': file_data['id'],
            'Name': file_data['name'],
            'Kind': file_data['kind'],
            'File Extension': file_data.get('fullFileExtension', ''),
            'Is Folder': file_data.get('fullFileExtension') is None
        })
        
        # If the item is a folder, recursively get files and folders within it
        if file_data['kind'] == 'drive#folder':
            folder_id = file_data['id']
            folder_name = file_data['name']
            files_data.extend(get_files_in_folder_recursive(service, folder_id, folder_name))
    
    return files_data

In [80]:
files_data = get_files_in_folder_recursive(service, datasets_id, 'datasets')

In [81]:
files_data

[{'Parent Folder ID': '1pT_SGjJ7CDVsONq932Y-5ONVE4q21paj',
  'Parent Folder Name': 'datasets',
  'ID': '1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5',
  'Name': 'guardian',
  'Kind': 'drive#file',
  'File Extension': '',
  'Is Folder': True}]

In [87]:
import pandas as pd

def get_files_in_folder_recursive(service, parent_folder_id, parent_folder_name=None):
    """Recursively get all files and folders within a parent folder."""
    # Initialize an empty list to store file and folder data
    files_data = []
    
    # Initialize a stack to store folders to be processed
    folders_to_process = [(parent_folder_id, parent_folder_name)]
    
    # Process folders until the stack is empty
    while folders_to_process:
        # Pop the last folder from the stack
        folder_id, folder_name = folders_to_process.pop()
        
        # Get a list of files and folders within the current folder
        files_in_folder = get_files_in_folder(service, folder_id)
        
        # Iterate through the list of files and folders
        for file_data in files_in_folder:
            # Append the file or folder data to the list
            files_data.append({
                'parent_folder_id': folder_id,
                'parent_folder_name': folder_name,
                'id': file_data['id'],
                'name': file_data['name'],
                'kind': file_data['kind'],
                'file_extension': file_data.get('fullFileExtension', ''),
                'is_folder': file_data.get('fullFileExtension') is None
            })
            
            # If the item is a folder, add it to the stack to be processed later
            if file_data.get('fullFileExtension') is None:
                subfolder_id = file_data['id']
                subfolder_name = file_data['name']
                folders_to_process.append((subfolder_id, subfolder_name))
    
    return files_data

In [88]:
files_data = get_files_in_folder_recursive(service, datasets_id, 'datasets')

In [89]:
files_data

[{'parent_folder_id': '1pT_SGjJ7CDVsONq932Y-5ONVE4q21paj',
  'parent_folder_name': 'datasets',
  'id': '1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5',
  'name': 'guardian',
  'kind': 'drive#file',
  'file_extension': '',
  'is_folder': True},
 {'parent_folder_id': '1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5',
  'parent_folder_name': 'guardian',
  'id': '1woC5Xhs0kG44lAFTLPbiSimSTSiUfJEa',
  'name': 'guardian_preprocessed.jsonl',
  'kind': 'drive#file',
  'file_extension': 'jsonl',
  'is_folder': False},
 {'parent_folder_id': '1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5',
  'parent_folder_name': 'guardian',
  'id': '1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk',
  'name': 'guardian_phi',
  'kind': 'drive#file',
  'file_extension': '',
  'is_folder': True},
 {'parent_folder_id': '1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk',
  'parent_folder_name': 'guardian_phi',
  'id': '1-nR_X8osU42pgEQGjCQDVgyf-f_PMXUx',
  'name': 'doc_1.jsonl',
  'kind': 'drive#file',
  'file_extension': 'jsonl',
  'is_folder': False},
 {'parent_folder_id': '1b7_fUs

In [90]:
files_df = pd.DataFrame(files_data)

In [None]:
files_df[files_df['