# core data utilities

> Core utilities for data processing:
> datetime parsing, time-zone handling, validity checks, and filtering.

In [None]:
#| default_exp data.utils

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#| export
from datetime import datetime
import pandas as pd
import pytz

In [None]:
#| export
# Turn off pandas' chained-assignment check (the SettingWithCopyWarning).
# NOTE(review): this cell is exported, so the option is changed for every
# process that imports the generated module — confirm that this is intended.
pd.options.mode.chained_assignment = None

In [None]:
#| export
def validate_datetime(date_string, format_string):
    """
    Validates if a string is a valid datetime according to the given format.

    Args:
        date_string: The value to check. Anything that is not a `str`
            (e.g. `None` or a NaN coming from a DataFrame) is reported as
            invalid instead of raising.
        format_string: A `datetime.strptime` format such as "%Y-%m-%d".

    Returns:
        True if `date_string` parses with `format_string`, otherwise False.
    """
    # strptime raises TypeError (not ValueError) for non-string input, which
    # would escape the handler below; a validator should simply answer False.
    if not isinstance(date_string, str):
        return False

    try:
        datetime.strptime(date_string, format_string)
    except ValueError:
        return False
    return True
    
def validate_datetime_in_iso_format(date_text):
    """
    Validates if a value is a datetime string accepted by `datetime.fromisoformat`.

    The exact set of accepted spellings is whatever the running Python's
    `datetime.fromisoformat` supports, so results for less common ISO 8601
    forms can differ between Python versions.

    Args:
        date_text: The value to check. Anything that is not a `str`
            (e.g. `None` or a NaN coming from a DataFrame) is reported as
            invalid instead of raising.

    Returns:
        True if `date_text` parses, otherwise False.
    """
    # fromisoformat raises TypeError (not ValueError) for non-string input,
    # which would escape the handler below.
    if not isinstance(date_text, str):
        return False

    try:
        datetime.fromisoformat(date_text)
    except ValueError:
        return False
    return True


In [None]:
# Example usage: sample strings for exercising the ISO-format validator.
date_strings = [
    "2023-12-25 24:00:00",  # hour field is 24
    "2023-12-25 12:60:00.12",  # minute field is 60
    "2023-12-25 12:10:00",
    "2023-12-25 23:00:00",
    "2011-11-04",  # date only
    "20111104",  # date only, no separators
    "2011-11-04T00:05:23",
    "2011-11-04T00:05:23.283185",  # with fractional seconds
    "20111104T000523",  # no separators
    "20111104T000523.283185",
    "2011-11-04T00:05:23Z",  # with a "Z" suffix
    "2011-11-04T00:05:23.283185+08:00",  # with an explicit UTC offset
    "2011-11-04T00:05:23+08:00",
    ]
# NOTE(review): format_string is not used by the loop in this cell, which
# calls validate_datetime_in_iso_format; presumably kept for validate_datetime.
format_string = "%Y-%m-%d"

# Report, one line per sample, whether the string passes the ISO-format check.
for s in date_strings:
    passes_iso_check = validate_datetime_in_iso_format(s)
    print(f"{s} is Valid datetime string" if passes_iso_check else f"{s} is Invalid datetime string")

In [None]:
# One-column frame of the sample strings, plus a boolean 'validity' flag.
df_datetime_str = pd.DataFrame({'datetime': date_strings})
# Map the validator over the 'datetime' column by name: this does not depend
# on column position and avoids building a Series object for every row.
df_datetime_str['validity'] = df_datetime_str['datetime'].map(validate_datetime_in_iso_format)
df_datetime_str


In [None]:
# Keep only the rows whose 'datetime' string passes the ISO-format check.
# The mask is computed from the column by name rather than by position.
iso_valid_mask = df_datetime_str['datetime'].map(validate_datetime_in_iso_format)
df_datetime = df_datetime_str[iso_valid_mask]
df_datetime


In [None]:
# Keep only the rows whose 'datetime' string fails the ISO-format check
# (the complement of the valid-row selection).
iso_invalid_mask = ~df_datetime_str['datetime'].map(validate_datetime_in_iso_format)
df_datetime_invalid = df_datetime_str[iso_invalid_mask]
df_datetime_invalid


In [None]:
# Display the 'datetime' column of the rows that passed validation.
df_datetime['datetime']

In [None]:
# Parse each validated string into a `datetime`, row by row, and label the
# resulting Series 'datetime'.
df_datetime1 = df_datetime.apply(
    lambda row: datetime.fromisoformat(row['datetime']),
    axis=1,
).rename('datetime')
df_datetime1


In [None]:
# Label-based slice: the rows whose index label lies between 0 and 8, both
# ends included (labels dropped by the validity filter are simply absent).
df_datetime1.loc[0:8]

In [None]:
# Convert the rows labelled 0..8 to pandas datetimes.
# NOTE(review): presumably this label range is chosen to leave out the
# samples that carry a UTC offset — confirm against the sample list.
rows_0_to_8 = df_datetime1.loc[0:8]
df_no_tz = pd.to_datetime(rows_0_to_8)
df_no_tz

In [None]:
#| export
def get_timezone_abbreviation(timezone_name):
    """
    Return the time zone abbreviation in effect right now for `timezone_name`.

    The name is resolved with `pytz.timezone`, the current time in that zone
    is taken, and it is formatted with the `%Z` directive. Because the
    current time is used, the result reflects the moment of the call.
    """
    current_time = datetime.now(tz=pytz.timezone(timezone_name))
    return current_time.strftime("%Z")

def validate_timezone_in_iana(timezone_name):
    """
    Check whether `timezone_name` is one of the time zone names known to pytz.

    The comparison is an exact, case-sensitive match against pytz's list of
    all time zone names.

    Args:
        timezone_name: The candidate name. Non-string values are reported as
            invalid.

    Returns:
        True if the name is in pytz's time zone list, otherwise False.
    """
    # Use the set form of the list for a hash lookup instead of a linear scan
    # on every call; the str guard keeps unhashable inputs returning False
    # rather than raising.
    return isinstance(timezone_name, str) and timezone_name in pytz.all_timezones_set


In [None]:

# Sample time zone names for exercising the IANA-name validator.
time_zone_strings = [
    'Asia/Shanghai',
    'Asia/Mumbai',
    'America/New_York',
    'Europe/London',
    'Eastern Standard Time',
    'US/Eastern',
    'America/Los_Angeles',
    'Asia/Kolkata',
    'Europe/London',
    'Asia/Hong_Kong',
    'Asia/Tokyo',
]
# One-column frame of the sample names, plus a boolean 'validity' flag.
df_timezone_str = pd.DataFrame({'timezone': time_zone_strings})

# Map the validator over the 'timezone' column by name: this does not depend
# on column position and avoids building a Series object for every row.
df_timezone_str['validity'] = df_timezone_str['timezone'].map(validate_timezone_in_iana)
df_timezone_str


In [None]:

# Keep only the rows whose name passes the IANA check.
# .copy() makes the result an independent frame: later cells add columns to
# it, and that should not be an assignment into a slice of df_timezone_str.
iana_valid_mask = df_timezone_str['timezone'].map(validate_timezone_in_iana)
df_timezone = df_timezone_str[iana_valid_mask].copy()
df_timezone

In [None]:
# Extract the invalid data: rows whose name fails the IANA check.
iana_invalid_mask = ~df_timezone_str['timezone'].map(validate_timezone_in_iana)
df_timezone_invalid = df_timezone_str[iana_invalid_mask]
df_timezone_invalid

In [None]:
# Abbreviation currently in effect for each valid zone, as a Series named
# 'abbr' that shares df_timezone's row labels. Mapping over the column avoids
# building a Series object for every row.
df_abbr = df_timezone['timezone'].map(get_timezone_abbreviation).rename('abbr')
df_abbr

In [None]:
# Display the frame of valid time zones before the derived columns are attached.
df_timezone

In [None]:
# Attach the abbreviations as an 'abbr' column; the assignment aligns df_abbr
# with df_timezone on their row labels. Then show the result.
df_timezone.loc[:,'abbr'] = df_abbr
df_timezone


In [None]:

# UTC offset currently in effect for each valid zone, formatted with `%z`,
# as a Series named 'utc_offset'. Each string is produced directly from the
# zone name, so no intermediate Series of datetimes in different time zones
# has to be built.
df_utc_offset = (
    df_timezone['timezone']
    .map(lambda zone_name: datetime.now(pytz.timezone(zone_name)).strftime('%z'))
    .rename('utc_offset')
)
df_utc_offset


In [None]:

# Attach the offsets as a 'utc_offset' column (aligned on row labels), then
# show the enriched frame.
df_timezone.loc[:,'utc_offset'] = df_utc_offset
df_timezone


In [None]:
# Print each valid zone with its current local time, abbreviation and UTC offset.
for tz in df_timezone.loc[:,'timezone']:
    # Take the current time once per zone so that all three printed fields
    # describe the same instant (and the zone is only resolved once).
    now_in_tz = datetime.now(pytz.timezone(tz))
    print(f"{tz}: {now_in_tz} - {now_in_tz.strftime('%Z')}, {now_in_tz.strftime('%z')}")

In [None]:
# Pair each naive timestamp with a zone name by position (zip stops at the
# shorter of the two), interpret the timestamp as local time in that zone,
# and print it together with its UTC equivalent.
for dt, tz in zip(df_no_tz.loc[0:8], df_timezone.loc[:, 'timezone']):
    # Localize once and reuse the result for both printed fields.
    localized = dt.tz_localize(tz)
    print(f"{dt}: {localized}, as {localized.astimezone('utc')}")

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()