In [18]:
from datetime import date, datetime, time
from pathlib import Path

import pandas as pd
import pyarrow.parquet as pq
from dateutil.relativedelta import relativedelta

In [2]:
# Reference date used to build the sample data below.
today = date.today()
tomorrow = today + relativedelta(days=1)

# The 'value' column mixes None with datetime.date objects, so pandas stores
# it with the generic object dtype (see the info() summary printed below).
sample_data = {
    'name': ['yesterday', 'today', 'tomorrow'],
    'value': [None, today, tomorrow],
}
df = pd.DataFrame(sample_data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   value   2 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes


In [3]:
# Scratch location for the round-trip file; kept as a plain string because the
# reading cells pass it straight to pandas / pyarrow.
parquet_path = '../.files_temp/parquet_example.parquet'

# Create the scratch directory up front: the write below fails when the
# folder is missing, which breaks a fresh checkout or Restart & Run All.
Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(parquet_path, engine="pyarrow", index=False)

<h1>Utility functions</h1>

In [4]:
def check_cell_datatype(df):
    """Print the frame summary, then the Python type held in each 'value' cell.

    Calls ``df.info()`` first and afterwards walks the rows, printing one line
    per row in the form ``<name> - <value> - <type of value>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'name' and 'value' columns.
    """
    df.info()

    for _, record in df.iterrows():
        name = record['name']
        value = record['value']
        print(f'{name} - {value} - {type(value)}')


def loc_on_the_date_column(df, ref_date=None):
    """Print the rows of ``df`` selected by four filters on the 'value' column.

    Each selection is preceded by a blank line and printed in this order:

    1. rows whose 'value' is null,
    2. rows whose 'value' equals ``ref_date``,
    3. rows whose 'value' is on or after ``ref_date``,
    4. rows whose 'value' is null or on or before ``ref_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'value' column that can be compared against a
        ``datetime.date``.
    ref_date : datetime.date, optional
        Date to compare against. Defaults to ``date.today()`` at call time,
        so the function no longer depends on a notebook-level ``today``
        variable having been defined by an earlier cell.

    Returns
    -------
    None
        The selections are printed, not returned.
    """
    if ref_date is None:
        ref_date = date.today()

    values = df['value']

    # Each mask is built right before it is printed, so a comparison that
    # fails still leaves the earlier selections visible in the output.
    print()
    print(df.loc[values.isnull(), :])
    print()
    print(df.loc[values == ref_date, :])
    print()
    print(df.loc[values >= ref_date, :])
    print()
    print(df.loc[values.isnull() | (values <= ref_date), :])

<h1>Directly read parquet by pandas</h1>
<h4>1. pd.read_parquet() successfully recovered the datetime.date type</h4>
<h4>2. convert_dtypes() converts each column to the best nullable dtype; columns holding datetime.date values remain "object" dtype</h4>

In [5]:
# Load the file with pandas and let convert_dtypes() choose the dtypes in one chain.
df = pd.read_parquet(parquet_path).convert_dtypes()

# Show the per-cell Python types, then run the date filters on the result.
check_cell_datatype(df)
loc_on_the_date_column(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      string
 1   value   2 non-null      object
dtypes: object(1), string(1)
memory usage: 180.0+ bytes
yesterday - None - <class 'NoneType'>
today - 2025-10-29 - <class 'datetime.date'>
tomorrow - 2025-10-30 - <class 'datetime.date'>

        name value
0  yesterday  None

    name       value
1  today  2025-10-29

       name       value
1     today  2025-10-29
2  tomorrow  2025-10-30

        name       value
0  yesterday        None
1      today  2025-10-29


<h1>Read through awswrangler</h1>

<h4>awswrangler does not allow reading from a local file</h4>
<h4>So we use the pyarrow.parquet library directly, which awswrangler itself leverages</h4>

In [20]:
# Read the same file through pyarrow.parquet, then convert the Arrow table to pandas.
table = pq.read_table(parquet_path)
df = table.to_pandas().convert_dtypes()

# Same checks as for the pandas reader, so the two outputs can be compared.
check_cell_datatype(df)
loc_on_the_date_column(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      string
 1   value   2 non-null      object
dtypes: object(1), string(1)
memory usage: 180.0+ bytes
yesterday - None - <class 'NoneType'>
today - 2025-10-29 - <class 'datetime.date'>
tomorrow - 2025-10-30 - <class 'datetime.date'>

        name value
0  yesterday  None

    name       value
1  today  2025-10-29

       name       value
1     today  2025-10-29
2  tomorrow  2025-10-30

        name       value
0  yesterday        None
1      today  2025-10-29


<h1>Inspect parquet file schema</h1>

In [24]:
# Read only the schema of the Parquet file; as the last expression in the cell
# it is displayed as the cell output, showing how each column was stored.
pq.read_schema(parquet_path)

name: string
value: date32[day]
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 290

<h1>Conclusion: pandas handles the datetime.date type properly, both when reading it back from Parquet and when filtering on it</h1>