In [None]:
import io
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.cloud import storage
from google.oauth2 import service_account
import json
from pyspark.sql import SparkSession
import glob

credentials = {
  "type": "service_account",
  "project_id": "cedar-pottery-388916",
  "private_key_id": "SECRET",
  "private_key": "SECRET",
  "client_email": "drive-895@cedar-pottery-388916.iam.gserviceaccount.com",
  "client_id": "113489151660006131618",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/drive-895%40cedar-pottery-388916.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}

credentials = service_account.Credentials.from_service_account_info(credentials)
drive_service = build('drive', 'v3', credentials=credentials)

# Define the project ID
project_id = 'cedar-pottery-388916'

# Define the destination bucket name in Google Cloud Storage
bucket_name = 'bucket123321'

In [2]:
# Retrieve directory and file information recursively
def get_directory_and_file_info(service, directory_id, parent_directory_name=""):
    file_info_list = []

    # Query for files in the current directory
    query = f"'{directory_id}' in parents"
    response = service.files().list(q=query, fields='files(id, name, mimeType, modifiedTime), nextPageToken').execute()

    # Iterate over the files in the current directory
    for file in response.get('files', []):
        if file.get('mimeType') == 'application/vnd.google-apps.folder':
            # If the file is a subdirectory, recursively call the function
            directory_name = parent_directory_name + file['name'] + "/"
            file_info_list.extend(get_directory_and_file_info(service, file['id'], directory_name))
        else:
            # If the file is not a subdirectory, add the file information to the list
            file_info = {
                "Name": parent_directory_name + file['name'],
                "ID": file['id'],
                "Last Modified Date": file.get('modifiedTime', '')
            }
            file_info_list.append(file_info)

    return file_info_list

# ID of the top-level directory
top_directory_id = '1wGNQYA3f_PJpUkarEes0txvYpQ2k1ctB'

# Retrieve directory and file information recursively
file_info_list = get_directory_and_file_info(drive_service, top_directory_id)

# Convert the file information list to JSON string
file_info_json = json.dumps(file_info_list, indent=4)

# Print the file information in JSON-like format
print(file_info_json)

[
    {
        "Name": "12.json",
        "ID": "1Of9OzNb9tSoghB2sk3rS5OoLjtia2BzX",
        "Last Modified Date": "2023-06-15T23:43:55.000Z"
    },
    {
        "Name": "11.json",
        "ID": "1ZQxtdJSx9Pgu0XM2Zhn8Jts4Mw6J1x9i",
        "Last Modified Date": "2023-05-29T20:53:25.000Z"
    },
    {
        "Name": "10.json",
        "ID": "1UjTwRPYVlx_tqFOPCcRr9WSmNNtSWgnb",
        "Last Modified Date": "2023-05-29T20:37:08.000Z"
    },
    {
        "Name": "7.json",
        "ID": "1RRbDaqhj7mUE63FeBNIZnMr2I6tsdpDl",
        "Last Modified Date": "2023-05-29T20:29:40.000Z"
    },
    {
        "Name": "9.json",
        "ID": "1QjRIUj3UxPAC0gLXmEo21gu_OebwiqBX",
        "Last Modified Date": "2023-02-03T02:42:56.000Z"
    },
    {
        "Name": "8.json",
        "ID": "1dYAvdjpLV9GZFejB2hbsIsNjr0-LKI2b",
        "Last Modified Date": "2023-02-03T02:42:42.000Z"
    },
    {
        "Name": "6.json",
        "ID": "1-FRQ8PgfXal_CgEUDlFNpi4s1X2Ylykt",
        "Last Modified Date": 

In [3]:
# Read the content of the file in GCS
def read_file_from_gcs(file_path):
    # Instantiate a client
    client = storage.Client()

    # Get the bucket and blob names from the file path
    bucket_name, blob_name = file_path.split("/", 3)[2:]

    # Get the bucket
    bucket = client.bucket(bucket_name)

    # Get the blob
    blob = bucket.blob(blob_name)

    # Download the file contents
    content = blob.download_as_text()
    return content

# Compare the file in GCS with the result from the code above
def compare_file_with_result(gcs_file_path, result):
    # Read the content of the file in GCS
    gcs_content = read_file_from_gcs(gcs_file_path)

    # Parse the JSON content of the GCS file
    gcs_data = json.loads(gcs_content)

    # Compare the results
    if len(gcs_data) != len(result):
        return False

    for gcs_file, code_file in zip(gcs_data, result):
        if gcs_file != code_file:
            return False
    return True

gcs_registers_file_path = "gs://bucket123321/incremental_load/registers.json"

# Assuming you have the 'result' variable containing the result from the code above
comparison_result = compare_file_with_result(gcs_registers_file_path, file_info_list)

# Print the result
if comparison_result:
    print("-----------------------------------------------------")
    print("The GCS file is the same as the result from the code.")
    print("-----------------------------------------------------")
else:
    print("-----------------------------------------------------")
    print("The GCS file is different from the result from the code.")
    print("-----------------------------------------------------")

content = read_file_from_gcs(gcs_registers_file_path)
print(content)

-----------------------------------------------------
The GCS file is different from the result from the code.
-----------------------------------------------------
[
    {
        "Name": "11.json",
        "ID": "1ZQxtdJSx9Pgu0XM2Zhn8Jts4Mw6J1x9i",
        "Last Modified Date": ""
    },
    {
        "Name": "10.json",
        "ID": "1UjTwRPYVlx_tqFOPCcRr9WSmNNtSWgnb",
        "Last Modified Date": ""
    },
    {
        "Name": "7.json",
        "ID": "1RRbDaqhj7mUE63FeBNIZnMr2I6tsdpDl",
        "Last Modified Date": ""
    },
    {
        "Name": "9.json",
        "ID": "1QjRIUj3UxPAC0gLXmEo21gu_OebwiqBX",
        "Last Modified Date": ""
    },
    {
        "Name": "8.json",
        "ID": "1dYAvdjpLV9GZFejB2hbsIsNjr0-LKI2b",
        "Last Modified Date": ""
    },
    {
        "Name": "6.json",
        "ID": "1-FRQ8PgfXal_CgEUDlFNpi4s1X2Ylykt",
        "Last Modified Date": ""
    },
    {
        "Name": "5.json",
        "ID": "1CgzDqnbNuPhTNQT6ORLzYTCxlVKiwIVF",
        "L

In [4]:
# Parse the JSON string into a list of dictionaries
file_info_json = json.loads(file_info_json)
content = json.loads(content)

In [5]:
new_elements = []
file_names = [element['Name'] for element in content]
for element in file_info_json:
    if element['Name'] not in file_names:
        new_elements.append(element)
print(new_elements)

[{'Name': '12.json', 'ID': '1Of9OzNb9tSoghB2sk3rS5OoLjtia2BzX', 'Last Modified Date': '2023-06-15T23:43:55.000Z'}]


In [6]:
new_elements[0]['ID']

'1Of9OzNb9tSoghB2sk3rS5OoLjtia2BzX'

In [7]:
# Create a client for Google Cloud Storage directly using the credentials
client = storage.Client(credentials=credentials, project=project_id)

# Define the target directory path within the bucket
target_directory = "gmap_metadata/new_data/"

# Iterate over the file URLs and names simultaneously
for element in new_elements:

    # Download the file from Google Drive
    request = drive_service.files().get_media(fileId=element['ID'])
    file_stream = io.BytesIO()
    downloader = MediaIoBaseDownload(file_stream, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()

    # Upload the file to Google Cloud Storage
    target_blob_name = target_directory + element['Name']
    blob = client.bucket(bucket_name).blob(target_blob_name)
    file_stream.seek(0)  # Reset the stream position to the beginning
    blob.upload_from_file(file_stream, content_type='application/octet-stream')

    print('File transfer completed for file ID:', id)

File transfer completed for file ID: <built-in function id>


In [8]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the JSON files into Spark DataFrames
gmaps_metadata = spark.read.json('gs://bucket123321/gmap_metadata/new_data/*.json')

# Drop columns
columns_to_drop = ["hours", "MISC", "state", "price", "description"]
gmaps_metadata = gmaps_metadata.drop(*columns_to_drop)

# Remove duplicates from the DataFrame
gmaps_metadata = gmaps_metadata.dropDuplicates()

gmaps_metadata.show()

                                                                                

+--------------------+----------+--------+--------------------+--------+---------+-----------+--------------+--------------------+--------------------+
|             address|avg_rating|category|             gmap_id|latitude|longitude|       name|num_of_reviews|    relative_results|                 url|
+--------------------+----------+--------+--------------------+--------+---------+-----------+--------------+--------------------+--------------------+
|742 Evergreen Ter...|       4.9| [House]|0x88f1deddd28ff68...| 32.3783| -83.3511|Grand Place|            16|[0x88f16e41929435...|https://www.googl...|
+--------------------+----------+--------+--------------------+--------+---------+-----------+--------------+--------------------+--------------------+



In [9]:
# Define the output JSON file path
output_json_path = 'gs://bucket123321/gmap_clean_data/metadata/new_data'

# Repartition the DataFrame to a single partition
gmaps_metadata = gmaps_metadata.repartition(1)

# Write the DataFrame to a JSON file 
gmaps_metadata.write.json(output_json_path)

                                                                                

In [10]:
!bq load --autodetect --source_format=NEWLINE_DELIMITED_JSON conjunto_testing.gmaps_metadata_testing gs://bucket123321/gmap_clean_data/metadata/new_data/*.json

# Initialize the client
client = storage.Client()

# Specify the GCS bucket and JSON file path
bucket_name = "bucket123321"
json_file_path = "incremental_load/registers.json"

# Convert the file_info_json variable to JSON format
json_data = json.dumps(file_info_json)

# Upload the JSON data to GCS, replacing the existing JSON file
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(json_file_path)
blob.upload_from_string(json_data, content_type="application/json")

Waiting on bqjob_r481d05d9984ba213_00000188c2051bb2_1 ... (1s) Current status: DONE   
