# Google API for Drive

## Move zero-size non-JPG media files at the Drive root into a folder

In [1]:
from __future__ import print_function
import os, pandas as pd, pickle
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [2]:
# OAuth scopes requested for the credentials used in this notebook.
# If modifying these scopes, delete the file token.json.

SCOPES = [
    'https://www.googleapis.com/auth/drive.metadata.readonly',
    'https://www.googleapis.com/auth/drive.file',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/drive.appdata']

# The credentials are requested with these scopes only.

In [3]:
"""Authorization"""

creds = None
# token.json stores the user's access and refresh tokens; it is written at the
# end of this cell whenever the credentials had to be refreshed or re-created.
if os.path.exists('token.json'):
    try:
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    except Exception as exc:
        # An unreadable or malformed token file is not fatal: creds stays None
        # and the interactive login below runs instead. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate and
        # reports why the cached token was rejected.
        print(f'token read failed: {exc}')
    else:
        print('authentication exists')
# If there are no (valid) credentials available, refresh them or let the
# user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        print('getting authenticated')
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'client_secret.json', SCOPES)
        creds = flow.run_local_server(port=44599)
    # Save the credentials for the next run
    with open('token.json', 'w') as token:
        token.write(creds.to_json())

authentication exists


In [4]:
# Start Service: build the Drive v3 API client using the credentials in `creds`.
service = build('drive', 'v3', credentials=creds)

In [25]:
%%time

"""
find all files in root
    - in parent folder, id = "kfgodr8g9IUHmojnJI898uUhiu"
    - having quotaBytesUsed = 0
"""

parent_folder_id = "kfgodr8g9IUHmojnJI898uUhiu"
# Per-file metadata requested from the API; the same names are used as the
# DataFrame columns so every column exists even when the listing is empty or
# a field is absent from every returned file.
file_fields = ['id', 'name', 'fullFileExtension', 'quotaBytesUsed', 'size',
               'webViewLink', 'parents']
# Built once: the field selector does not change between pages.
list_fields = f"nextPageToken, files({', '.join(file_fields)})"
page_token = None  # token of the page to fetch; None requests the first page
searched_files = []  # metadata dicts accumulated across all pages

# Only the parent-folder query is sent here; no quotaBytesUsed condition is
# part of the request.
# NOTE(review): the query has no `trashed = false` clause -- confirm that
# trashed items should be included.
while True:
    response = service.files().list(q=f"'{parent_folder_id}' in parents",
                                    spaces='drive',
                                    pageSize=1000,
                                    fields=list_fields,
                                    pageToken=page_token).execute()
    files = response.get('files', [])
    searched_files.extend(files)
    # A missing nextPageToken means the last page has been read.
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break
print(f"Total files searched: {len(searched_files)}")

df = pd.DataFrame(data=searched_files, columns=file_fields)

Total files searched: 32
CPU times: user 11.8 ms, sys: 0 ns, total: 11.8 ms
Wall time: 483 ms


In [7]:
# Keep the rows of `df` that use no quota and whose extension is one of the
# listed media extensions (matched exactly, case-sensitive).
media_extensions = ['HEIC', 'PNG', 'MOV', 'png', 'mov', '3gp', 'TRIM.MOV', 'gif', 'MPG', 'MP4']
uses_no_quota = df['quotaBytesUsed'] == '0'
is_listed_media = df['fullFileExtension'].isin(media_extensions)
has_extension = ~df['fullFileExtension'].isnull()
df_files_root_nosize = df[uses_no_quota & is_listed_media & has_extension]

In [8]:
df_files_root_nosize.shape
#8084

(0, 7)

In [9]:
df_files_root_nosize.fullFileExtension.value_counts()

Series([], Name: count, dtype: int64)

In [10]:
# df_files_root_nosize.fullFileExtension.value_counts()

In [11]:
#df_files_root_nosize.to_csv('df_files_root_nosize-other-media-4.csv')

# Batch Operation

In [12]:
#batch = service.new_batch_http_request()

In [13]:
# create a folder

# file_metadata = {
#     'name': 'zero-non-jpg',
#     'mimeType': 'application/vnd.google-apps.folder'
# }
# #file = service.files().create(body=file_metadata, fields='id').execute()
# print(F'Folder ID: "{file.get("id")}".')
# folder_id = file.get("id")

In [14]:
# Destination folder ID; move_add_batch passes it as `addParents`.
# NOTE(review): this is the same ID as `parent_folder_id` used in the listing
# query -- confirm it is the intended destination.
folder_id = 'kfgodr8g9IUHmojnJI898uUhiu'

In [15]:
# count = 0
# error = 0
def move_files_callback(request_id, response, exception):
    """Report the outcome of a single request from a batched move.

    Intended to be passed as the ``callback`` of a batch HTTP request.

    Args:
        request_id: Identifier of the individual request within its batch.
        response: Response for the request; not used.
        exception: Error raised for the request, or ``None`` on success.

    Returns:
        None. A failure is printed together with the request id and the
        error so the affected move can be identified; a success prints
        nothing.
    """
    if exception is not None:
        print(f'Exception occurred for request {request_id}: {exception}')

In [16]:


def move_add_batch(x, batch):
    """Add to ``batch`` an update request that re-parents one file.

    ``x`` supplies the file's ``id`` and its current ``parents``. All current
    parents are removed and the module-level ``folder_id`` is added. The
    request is not executed here; it is only added to ``batch``.
    """
    current_parents = ",".join(x.parents)
    move_request = service.files().update(
        fileId=x.id,
        addParents=folder_id,
        removeParents=current_parents,
        fields='id',
    )
    batch.add(move_request)

In [17]:
# temp = df_files_root_nosize.iloc[700:1000].apply(lambda x: move_add_batch(x), axis=1)


In [18]:
%%time
# batch.execute()

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs


In [19]:
# \print(count)

In [20]:
# Group the pending moves into batch requests. The Drive API documents a limit
# of 100 calls per batch request, so each batch holds at most that many.
# The batches are only built and collected in `batches` here; they are not
# executed in this cell.
MAX_CALLS_PER_BATCH = 100
df_len = df_files_root_nosize.shape[0]
print(df_len)
batches = []
for i in range(0, df_len, MAX_CALLS_PER_BATCH):
    print(f'Iteration: {i}')
    batch = service.new_batch_http_request(callback=move_files_callback)
    # iloc slicing clips at the end of the frame, so the final (shorter)
    # chunk needs no special case.
    chunk = df_files_root_nosize.iloc[i:i + MAX_CALLS_PER_BATCH]
    for _, row in chunk.iterrows():
        move_add_batch(row, batch)
    batches.append(batch)
    

0


In [21]:
len(batches)

0

In [22]:
import time

In [23]:
time.time()

1685297057.689125

In [24]:
%%time

# Execute the prepared batches one after another, printing the wall-clock
# time each one took.
for pending_batch in batches:
    started_at = time.time()
    pending_batch.execute()
    elapsed = time.time() - started_at
    print(f'time taken: {elapsed}')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs
