# From File (including URL to download a file) to GCS

In [None]:
from gcp_ingestion import FromFileToGCS

# Usage example: download files from URLs and upload them to GCS.
bucket_name = "raw_geodata"
# JSON key from BigQuery service account, must have a Storage Admin role.
credentials_path = 'service_account.json'

# One entry per year: the URL to download and the destination name of the
# blob in GCS.
list_url_blob = [
    {
        'url': f'https://files.data.gouv.fr/geo-dvf/latest/csv/{year}/full.csv.gz',
        'destination_blob_name': f'geo_dvf_{year}.gz',
    }
    for year in range(2018, 2024)
]

processor = FromFileToGCS(bucket_name, credentials_path)
# Create the bucket in GCS if it doesn't exist, otherwise do nothing.
processor.create_bucket()

# Download the data from each URL and upload it to GCS.
for el in list_url_blob:
    processor.download_and_upload_from_URL(el['url'], el['destination_blob_name'])

# List the blobs in the bucket.
blobs = processor.list_blobs()
# Print the list of blobs; you can choose which ones to extract and upload.
print(blobs)
# Extract and upload the data from the list of blobs (GCS format).
processor.extract_and_upload_sel(blobs)


# From local file to GCS 

In [None]:
from gcp_ingestion import FromFileToGCS

# Usage example: upload a local file to GCS.
bucket_name = "raw_geodata_local_files"
# JSON key from BigQuery service account, must have a Storage Admin role.
credentials_path = 'service_account.json'

processor = FromFileToGCS(bucket_name, credentials_path)
processor.create_bucket()
# Arguments: the local file to upload, then the destination name of the blob in GCS.
processor.local_to_gcs('full.csv.gz', 'geo_dvf_2023.gz')
blobs = processor.list_blobs()
processor.extract_and_upload_sel(blobs)

# From GCS to BigQuery

In [1]:
from gcp_ingestion import FromGCStoGBQ

# Usage example: send the blobs of a GCS bucket to a BigQuery dataset.

# Destination in BigQuery.
project_id = 'blablacar-ae-case-study'
dataset_name = 'raw_data'

# Source bucket and path to the service-account JSON key.
bucket_name = 'raw_geodata'
credentials_path = 'service_account.json'

processor_bq = FromGCStoGBQ(credentials_path, project_id, dataset_name, bucket_name)
processor_bq.create_dataset()

# Show the name of every blob returned for the bucket before uploading.
blobs = processor_bq.list_blobs()
for blob in blobs:
    print(blob.name)

processor_bq.upload_to_bq(blobs)

[32mCreated dataset (or already exists) blablacar-ae-case-study.raw_data[0m
raw_geodata
raw_csv/geo_dvf_2018.csv
raw_csv/geo_dvf_2019.csv
raw_csv/geo_dvf_2020.csv
raw_csv/geo_dvf_2021.csv
raw_csv/geo_dvf_2022.csv
raw_csv/geo_dvf_2023.csv


  df = pd.read_csv(StringIO(blob_data))


[34mraw_csv/geo_dvf_2018.csv[0m


1it [01:14, 74.61s/it]


[32mraw_csv/geo_dvf_2018.csv is uploaded to blablacar-ae-case-study.raw_data.geo_dvf_2018[0m


KeyboardInterrupt: 