In [2]:
import pandas as pd
import numpy as np

from google.cloud import storage

**Data 1**

In [3]:
# Historic sample: the initial three-row load, one column per dtype under test.
data = {
    'primary_key': [1, 2, 3],
    'integer_col': [22, 33, 44],
    'float_col': [1.1, 2.2, 3.3],
    # Both columns below are plain Python floats, so pandas stores them as float64.
    'double_col': [1.111111111, 2.222222222, 3.333333333],
    'decimal_col': [0.12345, 0.23456, 0.34567],
    'string_col': ['apple', 'banana', 'orange'],
    'boolean_col': [True, False, True],
    'timestamp_col': [
        pd.Timestamp(day) for day in ('2022-01-01', '2022-01-02', '2022-01-03')
    ],
    'date_col': [
        pd.Timestamp(day).date() for day in ('2022-01-01', '2022-01-02', '2022-01-03')
    ],
    'datetime_with_timezone_col': [
        pd.Timestamp(moment, tz='UTC')
        for moment in ('2022-01-01 12:00:00', '2022-01-02 12:00:00', '2022-01-03 12:00:00')
    ],
}

# Build the frame column-wise from the mapping above.
df = pd.DataFrame(data=data)

# Preview the first rows.
df.head()


Unnamed: 0,primary_key,integer_col,float_col,double_col,decimal_col,string_col,boolean_col,timestamp_col,date_col,datetime_with_timezone_col
0,1,22,1.1,1.111111,0.12345,apple,True,2022-01-01,2022-01-01,2022-01-01 12:00:00+00:00
1,2,33,2.2,2.222222,0.23456,banana,False,2022-01-02,2022-01-02,2022-01-02 12:00:00+00:00
2,3,44,3.3,3.333333,0.34567,orange,True,2022-01-03,2022-01-03,2022-01-03 12:00:00+00:00


In [7]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   primary_key                 3 non-null      int64              
 1   integer_col                 3 non-null      int64              
 2   float_col                   3 non-null      float64            
 3   double_col                  3 non-null      float64            
 4   decimal_col                 3 non-null      float64            
 5   string_col                  3 non-null      object             
 6   boolean_col                 3 non-null      bool               
 7   timestamp_col               3 non-null      datetime64[ns]     
 8   date_col                    3 non-null      object             
 9   datetime_with_timezone_col  3 non-null      datetime64[ns, UTC]
dtypes: bool(1), datetime64[ns, UTC](1), datetime64[ns](1), float64(3),

In [13]:
# Local destination for the historic sample: containing folder plus file name.
# NOTE: the folder is an absolute, machine-specific path; adjust it to your environment.
folder_path = '/Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/'
filename = 'test_bg_df_sample_historic_data.parquet'

# Persist `df` as a Parquet file at <folder_path><filename>.
df.to_parquet(f'{folder_path}{filename}')

In [9]:
def upload_blob(bucket_name, source_file_name, destination_blob_name, if_generation_match=None):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: ID of the destination GCS bucket,
            e.g. "fs-assetsecure-testing".
        source_file_name: Path of the local file to upload.
        destination_blob_name: ID of the GCS object to write,
            e.g. "test_bg_df_sample_data.parquet".
        if_generation_match: Optional generation-match precondition that is
            forwarded to the upload to guard against race conditions.
            ``None`` (the default) sends no precondition. Pass ``0`` to
            upload only when the destination object does not exist yet, or
            the generation number of an existing object to replace exactly
            that generation; the upload request is aborted when the
            precondition does not hold.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # The precondition comes from the caller rather than from
    # `blob.generation`: `bucket.blob()` only builds a local handle, so its
    # generation has not been loaded from the server and is still None here.
    blob.upload_from_filename(source_file_name, if_generation_match=if_generation_match)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )


In [10]:
# Upload parameters for the historic sample.
destination_blob_name = "test_bg_df_sample_historic_data.parquet"  # ID of the GCS object
bucket_name = "fs-assetsecure-testing"  # ID of the GCS bucket
source_file_name = f"{folder_path}{filename}"  # path of the local file to upload

In [11]:
# Send the local file to the bucket under the configured object name.
upload_blob(
    bucket_name=bucket_name,
    source_file_name=source_file_name,
    destination_blob_name=destination_blob_name,
)

File /Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/test_bg_df_sample_data.parquet uploaded to test_bg_df_sample_historic_data.parquet.


In [12]:
# Delta sample: only the new rows, kept in their own frame so they can be
# written to a separate file.
additional_data = {
    'primary_key': [4, 5],
    'integer_col': [44, 55],
    'float_col': [4.4, 5.5],
    # Both columns below are plain Python floats.
    'double_col': [4.444444444, 5.555555555],
    'decimal_col': [0.45678, 0.56789],
    'string_col': ['grape', 'pineapple'],
    'boolean_col': [False, True],
    'timestamp_col': [pd.Timestamp(day) for day in ('2022-01-04', '2022-01-05')],
    'date_col': [pd.Timestamp(day).date() for day in ('2022-01-04', '2022-01-05')],
    'datetime_with_timezone_col': [
        pd.Timestamp(moment, tz='UTC')
        for moment in ('2022-01-04 12:00:00', '2022-01-05 12:00:00')
    ],
}

# Build the delta frame column-wise from the mapping above.
additional_df = pd.DataFrame(data=additional_data)

# Display the first rows.
additional_df.head()

Unnamed: 0,primary_key,integer_col,float_col,double_col,decimal_col,string_col,boolean_col,timestamp_col,date_col,datetime_with_timezone_col
0,4,44,4.4,4.444444,0.45678,grape,False,2022-01-04,2022-01-04,2022-01-04 12:00:00+00:00
1,5,55,5.5,5.555556,0.56789,pineapple,True,2022-01-05,2022-01-05,2022-01-05 12:00:00+00:00


 - integer_col: Contains integer values (int64 data type).
 - float_col: Contains floating-point values (float64 data type).
 - string_col: Contains string values (object data type).
 - boolean_col: Contains boolean values (bool data type).
 - timestamp_col: Contains timestamp values (datetime64[ns] data type).
 - datetime_with_timezone_col: Contains timezone-aware timestamp values (datetime64[ns, UTC] data type).

In [14]:
# Local destination for the delta sample: containing folder plus file name.
# NOTE: the folder is an absolute, machine-specific path; adjust it to your environment.
folder_path = '/Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/'
filename = 'test_bg_df_sample_delta_data.parquet'

# Persist `additional_df` as a Parquet file at <folder_path><filename>.
additional_df.to_parquet(f'{folder_path}{filename}')

In [15]:
# Upload parameters for the delta sample.
destination_blob_name = "test_bg_df_sample_delta_data.parquet"  # ID of the GCS object
bucket_name = "fs-assetsecure-testing"  # ID of the GCS bucket
source_file_name = f"{folder_path}{filename}"  # path of the local file to upload

In [16]:
# Send the local file to the bucket under the configured object name.
upload_blob(
    bucket_name=bucket_name,
    source_file_name=source_file_name,
    destination_blob_name=destination_blob_name,
)

File /Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/test_bg_df_sample_delta_data.parquet uploaded to test_bg_df_sample_delta_data.parquet.


**Data 2**

In [3]:
# Second historic sample: three rows that share `primary_key` with the first
# data set but carry their own "other_*" columns.
data = dict([
    ('primary_key', [3, 1, 2]),
    ('other_integer_col', [666, 33333, 5435]),
    ('other_float_col', [9.5, 4.6, 8.8]),
    ('other_decimal_col', [0.987, 0.278501, 0.0847936]),  # plain Python floats
    ('other_string_col', ['brocolli', 'spinach', 'carrot']),
    ('other_boolean_col', [False, True, False]),
])

# Build the frame column-wise from the mapping above.
df_2 = pd.DataFrame(data=data)

# Preview the first rows.
df_2.head()


Unnamed: 0,primary_key,other_integer_col,other_float_col,other_decimal_col,other_string_col,other_boolean_col
0,3,666,9.5,0.987,brocolli,False
1,1,33333,4.6,0.278501,spinach,True
2,2,5435,8.8,0.084794,carrot,False


In [5]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   primary_key        3 non-null      int64  
 1   other_integer_col  3 non-null      int64  
 2   other_float_col    3 non-null      float64
 3   other_decimal_col  3 non-null      float64
 4   other_string_col   3 non-null      object 
 5   other_boolean_col  3 non-null      bool   
dtypes: bool(1), float64(2), int64(2), object(1)
memory usage: 251.0+ bytes


In [7]:
# Local destination for the second historic sample: containing folder plus file name.
# NOTE: the folder is an absolute, machine-specific path; adjust it to your environment.
folder_path = '/Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/'
filename = 'test_bg_df_sample2_historic_data.parquet'

# Persist `df_2` as a Parquet file at <folder_path><filename>.
df_2.to_parquet(f'{folder_path}{filename}')

In [8]:
def upload_blob(bucket_name, source_file_name, destination_blob_name, if_generation_match=None):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: ID of the destination GCS bucket,
            e.g. "fs-assetsecure-testing".
        source_file_name: Path of the local file to upload.
        destination_blob_name: ID of the GCS object to write,
            e.g. "test_bg_df_sample_data.parquet".
        if_generation_match: Optional generation-match precondition that is
            forwarded to the upload to guard against race conditions.
            ``None`` (the default) sends no precondition. Pass ``0`` to
            upload only when the destination object does not exist yet, or
            the generation number of an existing object to replace exactly
            that generation; the upload request is aborted when the
            precondition does not hold.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # The precondition comes from the caller rather than from
    # `blob.generation`: `bucket.blob()` only builds a local handle, so its
    # generation has not been loaded from the server and is still None here.
    blob.upload_from_filename(source_file_name, if_generation_match=if_generation_match)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )


In [9]:
# Upload parameters for the second historic sample.
destination_blob_name = "test_bg_df_sample2_historic_data.parquet"  # ID of the GCS object
bucket_name = "fs-assetsecure-testing"  # ID of the GCS bucket
source_file_name = f"{folder_path}{filename}"  # path of the local file to upload

In [10]:
# Send the local file to the bucket under the configured object name.
upload_blob(
    bucket_name=bucket_name,
    source_file_name=source_file_name,
    destination_blob_name=destination_blob_name,
)

File /Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/test_bg_df_sample2_historic_data.parquet uploaded to test_bg_df_sample2_historic_data.parquet.


In [6]:
# Second delta sample: only the new rows, kept in their own frame so they can
# be written to a separate file.
additional_data = dict([
    ('primary_key', [5, 4]),
    ('other_integer_col', [43, 56]),
    ('other_float_col', [24.5, 98.6]),
    ('other_decimal_col', [0.84693, 0.3342]),  # plain Python floats
    ('other_string_col', ['aartappel', 'brusselsprout']),
    ('other_boolean_col', [False, False]),
])

# Build the delta frame column-wise from the mapping above.
additional_df2 = pd.DataFrame(data=additional_data)

# Display the first rows.
additional_df2.head()

Unnamed: 0,primary_key,other_integer_col,other_float_col,other_decimal_col,other_string_col,other_boolean_col
0,5,43,24.5,0.84693,aartappel,False
1,4,56,98.6,0.3342,brusselsprout,False


 - primary_key: Contains integer key values (int64 data type).
 - other_integer_col: Contains integer values (int64 data type).
 - other_float_col: Contains floating-point values (float64 data type).
 - other_decimal_col: Contains floating-point values (float64 data type).
 - other_string_col: Contains string values (object data type).
 - other_boolean_col: Contains boolean values (bool data type).

In [11]:
# Local destination for the second delta sample: containing folder plus file name.
# NOTE: the folder is an absolute, machine-specific path; adjust it to your environment.
folder_path = '/Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/'
filename = 'test_bg_df_sample2_delta_data.parquet'

# Persist `additional_df2` as a Parquet file at <folder_path><filename>.
additional_df2.to_parquet(f'{folder_path}{filename}')

In [12]:
# Upload parameters for the second delta sample.
destination_blob_name = "test_bg_df_sample2_delta_data.parquet"  # ID of the GCS object
bucket_name = "fs-assetsecure-testing"  # ID of the GCS bucket
source_file_name = f"{folder_path}{filename}"  # path of the local file to upload

In [13]:
# Send the local file to the bucket under the configured object name.
upload_blob(
    bucket_name=bucket_name,
    source_file_name=source_file_name,
    destination_blob_name=destination_blob_name,
)

File /Users/leonsmith/explore/fs/assetinsure/asset_secure/fs-as-disc/test_bg_df_sample2_delta_data.parquet uploaded to test_bg_df_sample2_delta_data.parquet.
