In [5]:
import boto3
import numpy as np
import pandas as pd
from io import StringIO
from time import time

In [None]:
def upload_raw_measurements(
  csv_path='./data/utd19_u.csv',
  s3_bucket='utd19-u-raw',
  region='eu-central-1',
  key_prefix='raw/measurements',
  chunksize=10000000
):
  """Upload a measurements CSV to S3 as a series of chunked CSV objects.

  The bucket is created first; if the caller already owns it, a notice is
  printed and the upload continues. The source file is then read `chunksize`
  rows at a time and every chunk is written, without the DataFrame index, to
  `<key_prefix>/chunk_<index>.csv`. Calling the function with no arguments
  uses the same file, bucket, region, object keys and chunk size as the
  original hard-coded version.

  Args:
    csv_path: Path of the CSV file to read.
    s3_bucket: Name of the destination bucket.
    region: Region sent as the bucket's LocationConstraint on creation.
    key_prefix: Key prefix under which the chunk objects are stored.
    chunksize: Number of CSV rows per uploaded chunk.

  Raises:
    Any S3 error other than `BucketAlreadyOwnedByYou`, and any error raised
    while reading `csv_path`, is not caught here and propagates to the caller.
  """
  # a single client handles both bucket creation and the uploads
  s3_client = boto3.client('s3')

  # create bucket if not exists
  # us-east-1 is the S3 default region and is rejected when passed as an
  # explicit LocationConstraint, so the configuration is omitted for it
  create_kwargs = {'Bucket': s3_bucket}
  if region and region != 'us-east-1':
    create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

  try:
    s3_client.create_bucket(**create_kwargs)
  except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket '{s3_bucket}' already exists")

  # nullable extension dtypes, so missing values do not change a column's type
  dtypes = {
    'day': pd.StringDtype(),
    'interval': pd.Int64Dtype(),
    'detid': pd.StringDtype(),
    'flow': pd.Float64Dtype(),
    'occ': pd.Float64Dtype(),
    'error': pd.Int64Dtype(),
    'city': pd.StringDtype(),
    'speed': pd.Float64Dtype()
  }

  # tolerate leading/trailing slashes (or an empty prefix) in key_prefix
  prefix = key_prefix.strip('/')

  # read the file in chunks; the context manager closes the underlying file
  # handle even when an upload fails part-way through
  with pd.read_csv(csv_path, chunksize=chunksize, dtype=dtypes) as df_iterator:
    # upload chunks to S3
    for index, df in enumerate(df_iterator):
      t_start = time()

      key = f"{prefix}/chunk_{index}.csv" if prefix else f"chunk_{index}.csv"

      # to_csv() without a target returns the CSV text directly, so no
      # intermediate buffer has to be kept alive (or closed) around the upload
      csv_text = df.to_csv(index=False)
      s3_client.put_object(
        Bucket=s3_bucket,
        Key=key,
        Body=csv_text.encode('utf-8')
      )

      t_end = time()
      print(f"Partition {index} upload to {s3_bucket} in {(t_end - t_start):.3f} seconds")
  print("File upload complete.")

In [13]:
# Run the full upload. This reads the local CSV and writes to S3, so it needs
# AWS credentials; the output recorded below shows roughly 30-110 s per partition.
upload_raw_measurements()

Bucket 'utd19-u-raw' already exists
Partition 0 upload to utd19-u-raw in 62.934 seconds
Partition 1 upload to utd19-u-raw in 77.813 seconds
Partition 2 upload to utd19-u-raw in 68.367 seconds
Partition 3 upload to utd19-u-raw in 89.765 seconds
Partition 4 upload to utd19-u-raw in 88.589 seconds
Partition 5 upload to utd19-u-raw in 81.966 seconds
Partition 6 upload to utd19-u-raw in 83.215 seconds
Partition 7 upload to utd19-u-raw in 111.796 seconds
Partition 8 upload to utd19-u-raw in 67.598 seconds
Partition 9 upload to utd19-u-raw in 68.305 seconds
Partition 10 upload to utd19-u-raw in 63.675 seconds
Partition 11 upload to utd19-u-raw in 69.845 seconds
Partition 12 upload to utd19-u-raw in 67.001 seconds
Partition 13 upload to utd19-u-raw in 28.907 seconds
File upload complete.
