## Script for downloading image IDs from a shared Google Drive

In this notebook we first copy the FFHQ dataset from the original shared folder into our own Drive. This gives us a permanent copy of the files, so the dataset cannot change underneath us while we are still conducting the experiment.

We then iterate over all the images and add their Google Drive IDs to a dataframe.

In [1]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

import pandas as pd

In [2]:
import io
import json

import numpy as np
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
from PIL import Image


class GDrive_Handler:
    """
    Small wrapper that authenticates against Google Drive and exposes a v3 client.

    @arg scopes: The scopes of the API service
    @arg credentials_path: Client credentials.
    @arg write_new_token: If set to True it will write a new access token based off of the scopes. This will open a pop-up window requesting authorization of your email.

    If set to false, it will use the old access token, which were created using the previous scopes.

    After construction, ``self.credentials`` holds the credentials object and
    ``self.service`` the Drive v3 client built from it.
    """

    def __init__(self,scopes,credentials_path,write_new_token = True):
        self.credentials = self.get_drive_credentials(credentials_path,scopes,write_new_token=write_new_token)
        self.service = build("drive", "v3", credentials=self.credentials)

    def get_drive_credentials(self,credentials_path,SCOPES,write_new_token = True):
        """Return usable Drive credentials, running the OAuth flow if needed.

        @arg credentials_path: Path to the OAuth client-secrets file used by the
            interactive flow.
        @arg SCOPES: Scopes to request / to load the cached token with.
        @arg write_new_token: When True the cached ``token.json`` is ignored and
            the interactive flow is always run.

        @return: The credentials object. It is returned both when a cached
            token was still valid and when a token was refreshed or newly
            created (in the latter two cases ``token.json`` is rewritten).
        """
        creds = None
        # The file token.json stores the user's access and refresh tokens, and is
        # created automatically when the authorization flow completes for the first
        # time.
        if os.path.exists("token.json") and not write_new_token:
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    credentials_path, SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with open("token.json", "w") as token:
                token.write(creds.to_json())

        # Must sit outside the branch above: a still-valid cached token skips
        # that branch and would otherwise make this method return None.
        return creds

    def download_file(self, real_file_id):
        """Download a Drive file and decode it as an image.

        @arg real_file_id: Drive ID of the file to download.

        @return: The image as a numpy array, or None when the Drive API
            answered with an HttpError (details are printed).
        """
        try:
            request = self.service.files().get_media(fileId=real_file_id)
            buffer = io.BytesIO()
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()
                print(f"Download {int(status.progress() * 100)}%.")

            # Convert the downloaded bytes to an image
            buffer.seek(0)  # Go to the beginning of the IO object
            image = Image.open(buffer)

            # Convert the image to a numpy array
            return np.array(image)

        except HttpError as error:
            # Print more detailed error information
            print(f"An error occurred: {error}")
            if error.resp.status in [403, 500, 503]:
                print(f"Reason: {error.resp.reason}")
                # The response body lives on the HttpError itself (`content`,
                # raw bytes); the response object has no `body` attribute.
                print(f"Body: {error.content}")
                # Try to parse and print out detailed error message
                try:
                    error_details = json.loads(error.content.decode("utf-8"))
                    print(json.dumps(error_details, indent=2))
                except ValueError:
                    # Covers both invalid JSON and undecodable bytes.
                    print("Could not parse error details.")
            return None

In [9]:
# If modifying these scopes, delete the file token.json.
# NOTE(review): drive.readonly only allows listing/downloading. The folder-copy
# cell further down creates folders and copies files, which presumably needs a
# write-capable scope such as https://www.googleapis.com/auth/drive — confirm.
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
# The client-secrets location can be overridden per machine through the
# GDRIVE_CREDENTIALS_PATH environment variable; the default is the original path.
creds_path = os.environ.get(
    "GDRIVE_CREDENTIALS_PATH",
    r"C:\Users\DripTooHard\PycharmProjects\taming-transformers2\scripts\google-drive\credentials.json",
)
drive_api = GDrive_Handler(scopes=SCOPES,credentials_path=creds_path,write_new_token=False)

# Later cells pass `service` to the helper functions, so it must be defined here.
service = drive_api.service

In [31]:
# Drive ID of the root folder that later cells copy from and scan for images.
top_path_id = "1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL"
# Module-level placeholders: no page token yet and an empty file list.
page_token, files = None, list()


In [7]:
import json

def load_json(file_path):
    """Read a UTF-8 JSON file and return the parsed content.

    Any failure (missing file, malformed JSON, or any other exception) is
    reported with a printed message and results in ``None`` being returned.
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"The file at {file_path} was not found.")
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file at {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return parsed

# Example usage: check that the cached token file can be read.
file_path = 'token.json'
data = load_json(file_path)

if data is not None:
    print("JSON data loaded successfully:")
    # Never echo live credentials into the notebook output: the saved notebook
    # would leak them. Show the structure with the secret values masked.
    _SENSITIVE_KEYS = ('token', 'refresh_token', 'client_secret')
    if isinstance(data, dict):
        print({
            key: ('<redacted>' if key in _SENSITIVE_KEYS else value)
            for key, value in data.items()
        })
    else:
        print(f"<{type(data).__name__} content not shown>")

JSON data loaded successfully:
{'token': '<redacted>', 'refresh_token': '<redacted>', 'token_uri': 'https://oauth2.googleapis.com/token', 'client_id': '100836400561-2fs1u2j0ietgp5o5vbf2phgnnrcl29ka.apps.googleusercontent.com', 'client_secret': '<redacted>', 'scopes': ['https://www.googleapis.com/auth/drive'], 'expiry': '2023-11-28T01:08:01.636494Z'}


In [59]:
def copy_folder(service, source_folder_id, destination_folder_id):
    """Copy all contents from the source folder to the destination folder.

    Sub-folders are matched by name in the destination (created when missing)
    and copied recursively; files are copied via ``copy_file_to_folder``.

    Args:
        service: Drive v3 client.
        source_folder_id: ID of the folder whose children are copied.
        destination_folder_id: ID of the folder that receives the copies.

    Returns:
        ``destination_folder_id``, so callers can report where the copy lives.
    """
    # Collect every child first. The listing is paginated, and reading only
    # the first page would silently drop the rest of a large folder.
    children = []
    page_token = None
    while True:
        results = service.files().list(
            # Skip trashed items, in line with the existence checks used for
            # the destination side.
            q=f"'{source_folder_id}' in parents and trashed = false",
            pageSize=1000,
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token,
        ).execute()
        children.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break

    for file in children:
        if file['mimeType'] == 'application/vnd.google-apps.folder':
            # Check if the subfolder already exists in the destination
            subfolder_id = folder_exists(service, file['name'], destination_folder_id)
            if not subfolder_id:
                # Create a new subfolder in the destination
                subfolder_id = create_folder(service, file['name'], destination_folder_id)
            # Recursively copy the contents of the subfolder
            copy_folder(service, file['id'], subfolder_id)
        else:
            # Copy file to the destination folder
            copy_file_to_folder(service, file['id'], file['name'], destination_folder_id)

    return destination_folder_id

def create_folder(service, folder_name, parent_folder_id):
    """Create a Drive folder named ``folder_name`` inside ``parent_folder_id``.

    Returns the ``id`` field of the API response for the new folder.
    """
    metadata = dict(
        name=folder_name,
        mimeType='application/vnd.google-apps.folder',
        parents=[parent_folder_id],
    )
    # Only the id is requested back from the API.
    request = service.files().create(body=metadata, fields='id')
    created = request.execute()
    return created.get('id')

def folder_exists(service, folder_name, parent_folder_id):
    """Check if a folder with the specified name exists under the parent folder.

    Args:
        service: Drive v3 client.
        folder_name: Exact folder name to look for.
        parent_folder_id: ID of the folder to search in (direct children only).

    Returns:
        The ID of the first matching, non-trashed folder, or ``None`` when no
        folder matches. ``None`` is also returned when the lookup raises; the
        error is printed, so callers cannot tell "missing" from "lookup failed".
    """
    # Drive query strings are single-quoted, so backslashes and single quotes
    # inside the name must be escaped or the query becomes malformed.
    safe_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = f"name = '{safe_name}' and '{parent_folder_id}' in parents and mimeType = 'application/vnd.google-apps.folder' and trashed = false"
    print(f"Executing query: {query}")  # Debugging line
    try:
        response = service.files().list(q=query, fields="files(id)").execute()
        files = response.get('files', [])
        return files[0]['id'] if files else None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None




In [60]:
def file_exists(service, file_name, parent_folder_id):
    """Check if a file with the specified name exists under the parent folder.

    Args:
        service: Drive v3 client.
        file_name: Exact file name to look for.
        parent_folder_id: ID of the folder to search in (direct children only).

    Returns:
        The ID of the first matching, non-trashed, non-folder item, or ``None``
        when nothing matches.
    """
    # Drive query strings are single-quoted, so backslashes and single quotes
    # inside the name must be escaped or the query becomes malformed.
    safe_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
    query = f"name = '{safe_name}' and '{parent_folder_id}' in parents and mimeType != 'application/vnd.google-apps.folder' and trashed = false"
    response = service.files().list(q=query, fields="files(id)").execute()
    files = response.get('files', [])
    return files[0]['id'] if files else None

def copy_file_to_folder(service, file_id, file_name, parent_folder_id):
    """Copy a file to the specified folder if it doesn't already exist there.

    Args:
        service: Drive v3 client.
        file_id: ID of the file to copy.
        file_name: Name used both for the existence check and for the copy.
        parent_folder_id: ID of the destination folder.

    The copy is skipped when ``file_exists`` finds ``file_name`` in the
    destination folder.
    """
    if not file_exists(service, file_name, parent_folder_id):
        # Pin the copy's name explicitly, so the name-based existence check
        # above matches on a re-run whatever name the API would pick by default.
        file_metadata = {'name': file_name, 'parents': [parent_folder_id]}
        service.files().copy(fileId=file_id, body=file_metadata).execute()

def copy_folder_contents(service, source_folder_id, destination_folder_id):
    """Copy all contents of a folder (both files and subfolders) to the destination folder.

    Sub-folders are matched by name in the destination (created when missing)
    and copied recursively; files are copied via ``copy_file_to_folder``.

    Args:
        service: Drive v3 client.
        source_folder_id: ID of the folder whose children are copied.
        destination_folder_id: ID of the folder that receives the copies.

    Returns:
        ``destination_folder_id``.
    """
    # Collect every child first. The listing is paginated, and reading only
    # the first page would silently drop the rest of a large folder.
    children = []
    page_token = None
    while True:
        results = service.files().list(
            # Skip trashed items, in line with the existence checks used for
            # the destination side.
            q=f"'{source_folder_id}' in parents and trashed = false",
            pageSize=1000,
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token,
        ).execute()
        children.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break

    for file in children:
        if file['mimeType'] == 'application/vnd.google-apps.folder':
            # Check if the subfolder already exists in the destination
            subfolder_id = folder_exists(service, file['name'], destination_folder_id)
            if not subfolder_id:
                # Create a new subfolder in the destination
                subfolder_id = create_folder(service, file['name'], destination_folder_id)
            # Recursively copy the contents of the subfolder
            copy_folder_contents(service, file['id'], subfolder_id)
        else:
            # Copy file to the destination folder
            copy_file_to_folder(service, file['id'], file['name'], destination_folder_id)

    return destination_folder_id


In [None]:
# Example usage
origin_folder_id = top_path_id  # Replace with the ID of the folder you want to copy
# Despite its name, this variable holds the Drive ID of the destination folder,
# not a display name: copy_folder uses it as the parent of everything it copies.
destination_folder_name = '1zGn2jaTuM0tJA6jWD8uEUgFeQ4YeUWxT'  # Replace with the ID of your destination folder
# Use the client owned by drive_api; a bare `service` name may not be defined.
# The `or` fallback covers a copy_folder that returns nothing.
copied_folder_id = copy_folder(drive_api.service, origin_folder_id, destination_folder_name) or destination_folder_name
print(f"Folder copied successfully. New folder ID: {copied_folder_id}")


In [100]:
def get_all_files_at_layer(service, layer_id):
    """Collect every item that is a direct child of the folder ``layer_id``.

    Follows ``nextPageToken`` until the listing is exhausted and returns the
    accumulated list of file dicts (name, id, size, mimeType as requested).
    """
    collected = []
    next_token = None
    more_pages = True
    while more_pages:
        listing = service.files().list(
            pageSize=1000,
            q=f"'{layer_id}' in parents",
            fields="nextPageToken, files(name, id, size, mimeType)",
            pageToken=next_token
        ).execute()
        collected.extend(listing.get("files", []))
        next_token = listing.get("nextPageToken", None)
        # A missing or empty token means the last page has been read.
        more_pages = bool(next_token)
    return collected


def get_separated_images_folders_at_layer(service, layer_id, image_mime_types):
    """Split the direct children of ``layer_id`` into folders and images.

    An item counts as an image when its mimeType is in ``image_mime_types`` and
    as a folder when its mimeType is the Drive folder type; anything else is
    dropped. Sub-folders are not searched.

    Returns a ``(folders, images)`` tuple of lists.
    """
    folder_mime = 'application/vnd.google-apps.folder'
    folders = []
    images = []
    for entry in get_all_files_at_layer(service, layer_id):
        kind = entry['mimeType']
        # The image check comes first, so it wins if both would match.
        if kind in image_mime_types:
            bucket = images
        elif kind == folder_mime:
            bucket = folders
        else:
            continue
        bucket.append(entry)
    return folders, images


def get_all_image_files(service,top_layer_id,image_mime_types):
    """Collect all image files in ``top_layer_id`` and every folder below it.

    A stack of pending folders is processed until it is empty; each folder
    contributes the files whose mimeType is in ``image_mime_types`` and pushes
    its sub-folders onto the stack. Returns the accumulated list of file dicts.
    """
    pending = [top_layer_id]
    found = []

    while len(pending) > 0:
        current = pending.pop()
        # The seed entry is a plain ID string; later entries are file dicts
        # returned by the listing, from which the ID is read.
        current_id = current if type(current) is str else current.get("id")
        sub_folders, layer_images = get_separated_images_folders_at_layer(
            service, current_id, image_mime_types
        )
        found.extend(layer_images)
        pending.extend(sub_folders)
    return found

def create_dataset_file(service,top_layer_id,image_mime_types,destination_path):
    """Write a CSV listing every image file found below ``top_layer_id``.

    Each row is one file dict returned by ``get_all_image_files``, plus an
    ``attributes`` column that holds the literal string "[]" for every row.
    The frame is written to ``destination_path`` with pandas' default CSV
    settings.
    """
    listing = get_all_image_files(service,top_layer_id,image_mime_types)
    dataset = pd.DataFrame.from_dict(listing).assign(attributes="[]")
    dataset.to_csv(destination_path)



In [98]:
# Collect every image below the top-level folder.
# Drive reports JPEG files as "image/jpeg"; "image/jpg" is kept so the filter
# still accepts everything it accepted before.
# The client is taken from drive_api; a bare `service` name may not be defined.
images = get_all_image_files(drive_api.service,top_path_id,["image/png","image/jpeg","image/jpg"])
# Show a summary and a small sample instead of dumping the whole listing.
print(f"Found {len(images)} image files.")
print(images[:5])

KeyboardInterrupt: 

In [101]:
# Write the image listing to CSV. The client is taken from drive_api (a bare
# `service` name may not be defined), and "image/jpeg" is included because that
# is the MIME type Drive reports for JPEG files.
create_dataset_file(drive_api.service,top_path_id,["image/png","image/jpeg","image/jpg"],"FFHQimages.csv")