In [1]:
import os
import s3fs
import polars as pl 
import pandas as pd
from dotenv import load_dotenv

In [4]:
load_dotenv(override=True)

aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = os.getenv("AWS_REGION")

fs = s3fs.S3FileSystem(
    key=aws_access_key, secret=aws_secret_access_key, client_kwargs={"region_name": aws_region})


In [18]:
csv_path = "dataeng-warmup/data_raw/station.csv"

# with fs.open(csv_path, "rb") as f:
#     df = pl.scan_csv(f)


    
schema = {
    'id': pl.Int64,
    'name': pl.String,
    'lat': pl.String,
    'long': pl.String,
    'dock_count': pl.Int64,
    'city': pl.String,
    'installation_date': pl.String
} 

with fs.open(csv_path, "rb") as f:
    df = pl.scan_csv(f, schema= schema)

df = df.with_columns([
    pl.col("installation_date").str.strptime(pl.Date, "%d/%m/%Y")])

df.head(2).collect()

id,name,lat,long,dock_count,city,installation_date
i64,str,str,str,i64,str,date
2,"""San Jose Diridon Caltrain Stat…","""37.329732""","""-121.90178200000001""",27,"""San Jose""",2013-06-08
3,"""San Jose Civic Center""","""37.330698""","""-121.888979""",15,"""San Jose""",2013-05-08


In [6]:
csv_path = "dataeng-warmup/data_raw/station.csv"

with fs.open(csv_path, "rb") as f:
    df = pl.read_csv(f)

    
df.head(2)

id,name,lat,long,dock_count,city,installation_date
i64,str,f64,f64,i64,str,str
2,"""San Jose Diridon Caltrain Stat…",37.329732,-121.901782,27,"""San Jose""","""8/6/2013"""
3,"""San Jose Civic Center""",37.330698,-121.888979,15,"""San Jose""","""8/5/2013"""


In [7]:
csv_path = "dataeng-warmup/data_raw/station.csv"

with fs.open(csv_path, "rb") as f:
    df = pd.read_csv(f)

df.head(2)

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013


station

In [22]:
insper_username = "antoniovfa"
bucket_name = "dataeng-warmup"
parquet_path = f"{bucket_name}/data_processed/{insper_username}/station.parquet"

schema = {
    'id': pl.Int64,
    'name': pl.String,
    'lat': pl.String,
    'long': pl.String,
    'dock_count': pl.Int64,
    'city': pl.String,
    'installation_date': pl.String
} 

csv_path = "dataeng-warmup/data_raw/station.csv"

with fs.open(csv_path, mode= "rb") as f:
    df = (pl.scan_csv(f, schema=schema)
          .with_columns([
              pl.col("installation_date").str.strptime(pl.Date, "%m/%d/%Y")
          ]))


with fs.open(parquet_path, mode="wb") as f:
    df.collect().write_parquet(f)
