# Schema Evolution


In [65]:
from datetime import date
import awswrangler as wr
import pandas as pd

In [3]:
import os

import boto3

# Pick the AWS profile from the environment when one is set, so the notebook
# is not tied to a single machine's credentials file. When AWS_PROFILE is
# unset or empty, fall back to the profile this notebook was written with.
aws_profile = os.environ.get("AWS_PROFILE") or "quannguyen"
boto3.setup_default_session(profile_name=aws_profile)

## Create a Dataset

In [32]:
# Root S3 location of the Parquet dataset; the write and read cells in this
# notebook all pass this same path.
s3_path = "s3://upskills-landing-zone/testdb/my_table"

In [67]:
# Initial rows, written record by record: three columns, one row per
# business date.
df = pd.DataFrame(
    [
        {"id": 1, "value": "foo", "business_date": date(2020, 1, 1)},
        {"id": 2, "value": "boo", "business_date": date(2020, 1, 2)},
    ]
)

# First load of the dataset: write `df` under `s3_path` in overwrite mode.
wr.s3.to_parquet(
    # What to write and where.
    df=df,
    path=s3_path,
    # Dataset layout: one partition per business_date value.
    dataset=True,
    partition_cols=["business_date"],
    mode="overwrite",
    # Catalog entry the dataset is written under.
    database="testdb",
    table="my_table",
)

# Read the dataset straight back to display what was just written.
wr.s3.read_parquet(path=s3_path, dataset=True)

Unnamed: 0,id,value,business_date
0,1,foo,2020-01-01
1,2,boo,2020-01-02


## Adding new columns
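- When you add new columns to a Glue table, the Parquet files previously written for the table remain unchanged.
- Parquet is a columnar storage format that allows for schema evolution: new columns can be added to the schema without rewriting or otherwise affecting the existing data.
- Because the older files do not contain the new column, rows read from them have no value for it; in the read-back below, `flag` is empty for ids 1 and 2.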

In [68]:
# Second batch, written record by record. Compared with the first batch it
# carries an extra `flag` column, and one row has no `value`.
df = pd.DataFrame(
    [
        {"id": 3, "value": "bar", "business_date": date(2020, 1, 3), "flag": True},
        {"id": 4, "value": None, "business_date": date(2020, 1, 4), "flag": False},
    ]
)

# Append the second batch (which has the extra `flag` column) to the same
# dataset. The call is the last expression in the cell, so the dict it
# returns is displayed as the cell output.
wr.s3.to_parquet(
    # What to write and where.
    df=df,
    path=s3_path,
    # Dataset layout: append new partitions, keyed by business_date.
    dataset=True,
    partition_cols=["business_date"],
    mode="append",
    # Catalog entry the dataset is written under.
    database="testdb",
    table="my_table",
    # Optional. NOTE(review): presumably keeps the previous catalog table
    # version when the schema changes; confirm against the awswrangler docs.
    catalog_versioning=True,
)

{'paths': ['s3://upskills-landing-zone/testdb/my_table/business_date=2020-01-03/29b0947fa8624ccb9beb8b9294d5f092.snappy.parquet',
  's3://upskills-landing-zone/testdb/my_table/business_date=2020-01-04/29b0947fa8624ccb9beb8b9294d5f092.snappy.parquet'],
 'partitions_values': {'s3://upskills-landing-zone/testdb/my_table/business_date=2020-01-03/': ['2020-01-03'],
  's3://upskills-landing-zone/testdb/my_table/business_date=2020-01-04/': ['2020-01-04']}}

In [69]:
# Read the whole dataset back now that its files were written with two
# different column sets (with and without `flag`), then preview the first rows.
df = wr.s3.read_parquet(
    path=s3_path,
    dataset=True,
    validate_schema=False,
)
df.head(5)

Unnamed: 0,id,value,business_date,flag
0,1,foo,2020-01-01,
1,2,boo,2020-01-02,
2,3,bar,2020-01-03,True
3,4,,2020-01-04,False


In [63]:
# Cleanup step, currently disabled: uncomment the line below to call
# wr.s3.delete_objects on s3_path.
#  wr.s3.delete_objects(s3_path)