In [27]:
import boto3
from botocore.exceptions import ClientError
import pandas as pd
from io import StringIO
from time import time
import logging
import os

In [28]:
def upload_raw_measurements(
  s3_bucket='utd19-u-raw',
  csv_path='./data/utd19_u.csv',
  region='eu-central-1',
  chunksize=10000000,
  key_prefix='measurements'
):
  """Upload the measurements CSV to S3 as a series of chunked CSV objects.

  The bucket is created first if the caller does not already own it. The
  local CSV is then read in chunks of ``chunksize`` rows, and every chunk is
  written to ``<key_prefix>/chunk_<index>.csv`` in the bucket. Progress and
  per-chunk timings are printed to stdout.

  Args:
    s3_bucket: Name of the target S3 bucket.
    csv_path: Path of the local CSV file to read.
    region: AWS region used both for the S3 client and for the bucket's
      location constraint.
    chunksize: Number of rows per chunk passed to ``pd.read_csv``.
    key_prefix: Key prefix ("folder") under which the chunks are stored.

  Returns:
    None.

  Raises:
    Any exception raised by boto3 or pandas other than
    ``BucketAlreadyOwnedByYou`` is not caught here and propagates.
  """
  # Pin the client to the bucket's region: outside us-east-1 the
  # LocationConstraint has to match the region of the endpoint being called,
  # so relying on the ambient default region can make create_bucket fail.
  s3_client = boto3.client('s3', region_name=region)

  # create bucket if not exists
  try:
    if region == 'us-east-1':
      # us-east-1 is the default location and must not be sent as a constraint.
      s3_client.create_bucket(Bucket=s3_bucket)
    else:
      s3_client.create_bucket(
        Bucket=s3_bucket,
        CreateBucketConfiguration={'LocationConstraint': region}
      )
  except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket '{s3_bucket}' already exists")

  # Explicit nullable dtypes for every column that is read from the CSV.
  dtypes = {
    'day': pd.StringDtype(),
    'interval': pd.Int64Dtype(),
    'detid': pd.StringDtype(),
    'flow': pd.Float64Dtype(),
    'occ': pd.Float64Dtype(),
    'error': pd.Int64Dtype(),
    'city': pd.StringDtype(),
    'speed': pd.Float64Dtype()
  }

  # read the file in chunks
  df_iterator = pd.read_csv(csv_path, chunksize=chunksize, dtype=dtypes)

  # upload chunks to S3
  for index, df in enumerate(df_iterator):
    t_start = time()

    # to_csv() without a target returns the CSV text directly, so no manual
    # buffer has to be created and closed; encode once to explicit UTF-8 bytes.
    body = df.to_csv(index=False).encode('utf-8')
    s3_client.put_object(
      Body=body,
      Bucket=s3_bucket,
      Key=f"{key_prefix}/chunk_{index}.csv"
    )

    t_end = time()
    print(f"Partition {index} upload to {s3_bucket} in {(t_end - t_start):.3f} seconds")
  print("File upload complete.")

In [None]:
# Run the chunked upload of the measurements CSV to S3 (prints per-chunk timings).
upload_raw_measurements()

Bucket 'utd19-u-raw' already exists
Partition 0 upload to utd19-u-raw in 67.190 seconds
Partition 1 upload to utd19-u-raw in 68.340 seconds
Partition 2 upload to utd19-u-raw in 73.269 seconds


In [None]:
def upload_raw_links(
  s3_bucket='utd19-u-raw',
  file='./data/links.csv',
  s3_key='links.csv',
  region='eu-central-1'
):
  """Upload the links CSV file to S3 unchanged, as a single object.

  The bucket is created first if the caller does not already own it. The
  local file is opened in binary mode and streamed to the bucket with
  ``upload_fileobj``; its contents are not parsed. The elapsed upload time
  is printed to stdout.

  Args:
    s3_bucket: Name of the target S3 bucket.
    file: Path of the local file to upload.
    s3_key: Object key under which the file is stored in the bucket.
    region: AWS region used both for the S3 client and for the bucket's
      location constraint.

  Returns:
    None.

  Raises:
    Any exception raised by ``open`` or boto3 other than
    ``BucketAlreadyOwnedByYou`` is not caught here and propagates.
  """
  # Pin the client to the bucket's region: outside us-east-1 the
  # LocationConstraint has to match the region of the endpoint being called,
  # so relying on the ambient default region can make create_bucket fail.
  s3_client = boto3.client('s3', region_name=region)

  # create bucket if not exists
  try:
    if region == 'us-east-1':
      # us-east-1 is the default location and must not be sent as a constraint.
      s3_client.create_bucket(Bucket=s3_bucket)
    else:
      s3_client.create_bucket(
        Bucket=s3_bucket,
        CreateBucketConfiguration={'LocationConstraint': region}
      )
  except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket '{s3_bucket}' already exists")

  t_start = time()

  # Stream the raw bytes; the file is uploaded as-is, without parsing.
  with open(file, 'rb') as fileObj:
    s3_client.upload_fileobj(fileObj, s3_bucket, s3_key)

  t_end = time()
  print(f"File uploaded in {(t_end - t_start):.3f} seconds")

In [None]:
# Run the upload of the links CSV file to S3 (prints the elapsed time).
upload_raw_links()

Bucket 'utd19-u-raw' already exists
None
File uploaded in 0.678 seconds
