In [60]:
import pickle
import os.path

import pandas as pd

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Define the SCOPES. If modifying it,
# delete the token.pickle file.
SCOPES = ['https://www.googleapis.com/auth/drive']


def get_credentials():
    """Get user credentials or create new ones if needed."""
    creds = None

    # Check if file token.pickle exists
    if os.path.exists('token.pickle'):
        # Read the token from the file
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If no valid credentials are available, request the user to log in.
    if not creds or not creds.valid:
        # If token is expired, refresh it, else, request a new one.
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                '../google_credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the access token in token.pickle file for future usage
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds


def connect_to_drive():
    """Connect to the Google Drive API service."""
    # Get user credentials
    creds = get_credentials()
    # Connect to the API service
    service = build('drive', 'v3', credentials=creds)
    return service


def get_file_list(service, N):
    """Get a list of first N files or folders from Google Drive."""
    # Request a list of first N files or folders with name and id from the API.
    resource = service.files()
    result = resource.list(pageSize=N, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    # Return the result dictionary containing the information about the files
    return result

def search_file_by_name(service, file_name):
    """Search for a file by name and return its data."""
    # Define the query to search for the file by name
    query = f"name='{file_name}'"
    
    # Request files matching the search query
    resource = service.files()
    result = resource.list(q=query, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    
    # Extract the list of files from the result
    files = result.get('files', [])
    
    if files:
        # Return the data of the first file found
        return files[0]
    else:
        # If no file is found, return None
        return None

def get_files_in_folder(service, parent_folder_id):
    """Get a list of files within a specific parent folder."""
    # Define the query to retrieve files within the parent folder
    query = f"'{parent_folder_id}' in parents"
    
    # Request files matching the query
    resource = service.files()
    result = resource.list(q=query, fields="files(id, kind, name, fullFileExtension, parents)").execute()
    
    # Extract the list of files from the result
    files = result.get('files', [])
    
    return files
    
def print_file_names(file_list):
    """Print names of files in the provided file list."""
    # Extract the list from the dictionary
    files = file_list.get('files')
    print(files)
    # Print every file's name
    for file in files:
        print(file['name'])

In [4]:
service = connect_to_drive()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=655146860231-vr0gcnd196gnbth7hc4kgk3dnbig6lfi.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A52737%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=4QKP67QZBTdf6hVQtZrNRrzlr30y5F&access_type=offline


In [61]:
result_dict = get_file_list(service, 500)

In [62]:
datasets_id = search_file_by_name(service, 'datasets')['id']

In [63]:
search_file_by_name(service, 'datasets')

{'kind': 'drive#file',
 'parents': ['0APu_EJrjDYF6Uk9PVA'],
 'id': '1pT_SGjJ7CDVsONq932Y-5ONVE4q21paj',
 'name': 'datasets'}

In [64]:
datasets_files = get_files_in_folder(service, datasets_id)

In [65]:
guardian_id = [file_data for file_data in datasets_files if file_data['name'] == 'guardian'][0]['id']

In [66]:
guardian_files = get_files_in_folder(service, guardian_id)

In [68]:
import pandas as pd

def get_files_in_folder_recursive(service, parent_folder_id, parent_folder_name=None):
    """Recursively get all files and folders within a parent folder."""

    # Initialize an empty list to store file and folder data
    files_data = []

    # Initialize a stack to store folders to be processed
    folders_to_process = [(parent_folder_id, parent_folder_name)]
    
    # Process folders until the stack is empty
    while folders_to_process:
        # Pop the last folder from the stack
        folder_id, folder_name = folders_to_process.pop()
        
        # Get a list of files and folders within the current folder
        files_in_folder = get_files_in_folder(service, folder_id)
        
        # Iterate through the list of files and folders
        for file_data in files_in_folder:
            # Append the file or folder data to the list
            files_data.append({
                'parent_folder_id': folder_id,
                'parent_folder_name': folder_name,
                'id': file_data['id'],
                'name': file_data['name'],
                'kind': file_data['kind'],
                'file_extension': file_data.get('fullFileExtension', ''),
                'is_folder': file_data.get('fullFileExtension') is None
            })
            
            # If the item is a folder, add it to the stack to be processed later
            if file_data.get('fullFileExtension') is None:
                subfolder_id = file_data['id']
                subfolder_name = file_data['name']
                folders_to_process.append((subfolder_id, subfolder_name))

    files_df = pd.DataFrame(files_data)
    
    return files_df

In [69]:
files_data = get_files_in_folder_recursive(service, datasets_id, 'datasets')

In [70]:
files_data

Unnamed: 0,parent_folder_id,parent_folder_name,id,name,kind,file_extension,is_folder
0,1pT_SGjJ7CDVsONq932Y-5ONVE4q21paj,datasets,1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5,guardian,drive#file,,True
1,1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5,guardian,1woC5Xhs0kG44lAFTLPbiSimSTSiUfJEa,guardian_preprocessed.jsonl,drive#file,jsonl,False
2,1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5,guardian,1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk,guardian_phi,drive#file,,True
3,1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk,guardian_phi,1-nR_X8osU42pgEQGjCQDVgyf-f_PMXUx,doc_1.jsonl,drive#file,jsonl,False
4,1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk,guardian_phi,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,drive#file,,True
...,...,...,...,...,...,...,...
100,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,17lNIMPJLbGSFDziFRH0yfIaLjXSPYHDJ,doc_1_chunk_130.jsonl,drive#file,jsonl,False
101,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,1P6nJaIBXYKXK1sB4SZ41JgYlX5uUmmEf,doc_1_chunk_134.jsonl,drive#file,jsonl,False
102,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,1qFstoCbkeZkuL9Zp_Ufb6ECaa4hA75ni,doc_1_chunk_132.jsonl,drive#file,jsonl,False
103,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,1r8y2TCCCCo8oNKxN8WuLqmLN2F_4KrL1,doc_1_chunk_132.jsonl,drive#file,jsonl,False


In [71]:
files_df[files_df['name'] == 'datasets']

Unnamed: 0,parent_folder_id,parent_folder_name,id,name,kind,file_extension,is_folder


In [72]:
files_df[files_df['is_folder'] == True]

Unnamed: 0,parent_folder_id,parent_folder_name,id,name,kind,file_extension,is_folder
0,1pT_SGjJ7CDVsONq932Y-5ONVE4q21paj,datasets,1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5,guardian,drive#file,,True
2,1m9Ne7xC3YBPvdTWQpnmhNtICffIZeli5,guardian,1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk,guardian_phi,drive#file,,True
4,1b7_fUsIPhF4XLOhnE6_U4nI6MBxkE_Fk,guardian_phi,1Xr_FrDDnyOnFpgkhXggzlfAzYSVHhu7H,raw,drive#file,,True
