# core data utilities

> Core utilities for data processing:
> datetime parsing, time zone handling, validation, and filtering.

In [70]:
#| default_exp data.utils

In [71]:
#| hide
from nbdev.showdoc import *

In [72]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [73]:
#| export
from datetime import datetime
import pandas as pd
import pytz

In [74]:
#| export
# Silence pandas' chained-assignment (SettingWithCopy) warning. This cell is exported,
# so the option is changed process-wide as soon as the generated module is imported.
pd.options.mode.chained_assignment = None

In [75]:
#| export
def validate_datetime(date_string, format_string):
    """
    Validates if a string is a valid datetime according to the given format.

    Args:
        date_string: Candidate value, normally a ``str``.
        format_string: A ``datetime.strptime`` format such as ``"%Y-%m-%d"``.

    Returns:
        True when ``date_string`` parses with ``format_string``. False when it
        does not parse, and also when either argument is not a string at all
        (e.g. ``None`` or a float NaN coming from a DataFrame column).
    """
    try:
        datetime.strptime(date_string, format_string)
    except (TypeError, ValueError):
        # ValueError: text does not match the format.
        # TypeError: strptime was handed a non-string value.
        return False
    return True
    
def validate_datetime_in_iso_format(date_text):
    """
    Validates if a value is a datetime string accepted by ``datetime.fromisoformat``.

    The set of accepted spellings is whatever the running Python's
    ``datetime.fromisoformat`` supports, so it can differ between Python versions.

    Args:
        date_text: Candidate value, normally a ``str``.

    Returns:
        True when ``date_text`` parses. False when it does not parse, and also
        when it is not a string at all (e.g. ``None`` or a float NaN coming from
        a DataFrame column).
    """
    try:
        datetime.fromisoformat(date_text)
    except (TypeError, ValueError):
        # ValueError: malformed or out-of-range text.
        # TypeError: fromisoformat only accepts str.
        return False
    return True


In [76]:
# Example usage: run the ISO validator over a mix of candidate strings.
date_strings = [
    "2023-12-25 24:00:00",
    "2023-12-25 12:60:00.12",
    "2023-12-25 12:10:00",
    "2023-12-25 23:00:00",
    "2011-11-04",
    "20111104",
    "2011-11-04T00:05:23",
    "2011-11-04T00:05:23.283185",
    "20111104T000523",
    "20111104T000523.283185",
    "2011-11-04T00:05:23Z",
    "2011-11-04T00:05:23.283185+08:00",
    "2011-11-04T00:05:23+08:00",
]

for s in date_strings:
    # Only the verdict word differs between the two messages.
    verdict = "Valid" if validate_datetime_in_iso_format(s) else "Invalid"
    print(f"{s} is {verdict} datetime string")

2023-12-25 24:00:00 is Invalid datetime string
2023-12-25 12:60:00.12 is Invalid datetime string
2023-12-25 12:10:00 is Valid datetime string
2023-12-25 23:00:00 is Valid datetime string
2011-11-04 is Valid datetime string
20111104 is Valid datetime string
2011-11-04T00:05:23 is Valid datetime string
2011-11-04T00:05:23.283185 is Valid datetime string
20111104T000523 is Valid datetime string
20111104T000523.283185 is Valid datetime string
2011-11-04T00:05:23Z is Valid datetime string
2011-11-04T00:05:23.283185+08:00 is Valid datetime string
2011-11-04T00:05:23+08:00 is Valid datetime string


In [77]:
# One row per candidate string, flagged with whether it parses as an ISO datetime.
df_datetime_str = pd.DataFrame({'datetime': date_strings})
# Map over the named column instead of a row-wise apply that reads position 0.
df_datetime_str['validity'] = df_datetime_str['datetime'].map(validate_datetime_in_iso_format)
df_datetime_str


Unnamed: 0,datetime,validity
0,2023-12-25 24:00:00,False
1,2023-12-25 12:60:00.12,False
2,2023-12-25 12:10:00,True
3,2023-12-25 23:00:00,True
4,2011-11-04,True
5,20111104,True
6,2011-11-04T00:05:23,True
7,2011-11-04T00:05:23.283185,True
8,20111104T000523,True
9,20111104T000523.283185,True


In [78]:
# Keep the rows already flagged valid; the 'validity' column holds the validator's
# result, so there is no need to re-run it row by row.
df_datetime = df_datetime_str[df_datetime_str['validity']]
df_datetime


Unnamed: 0,datetime,validity
2,2023-12-25 12:10:00,True
3,2023-12-25 23:00:00,True
4,2011-11-04,True
5,20111104,True
6,2011-11-04T00:05:23,True
7,2011-11-04T00:05:23.283185,True
8,20111104T000523,True
9,20111104T000523.283185,True
10,2011-11-04T00:05:23Z,True
11,2011-11-04T00:05:23.283185+08:00,True


In [79]:
# Complement of the valid subset: rows whose 'validity' flag is False.
df_datetime_invalid = df_datetime_str[~df_datetime_str['validity']]
df_datetime_invalid


Unnamed: 0,datetime,validity
0,2023-12-25 24:00:00,False
1,2023-12-25 12:60:00.12,False


In [80]:
# The valid strings as a Series; the original row labels are kept.
df_datetime['datetime']

2                  2023-12-25 12:10:00
3                  2023-12-25 23:00:00
4                           2011-11-04
5                             20111104
6                  2011-11-04T00:05:23
7           2011-11-04T00:05:23.283185
8                      20111104T000523
9               20111104T000523.283185
10                2011-11-04T00:05:23Z
11    2011-11-04T00:05:23.283185+08:00
12           2011-11-04T00:05:23+08:00
Name: datetime, dtype: object

In [81]:
# Parse every valid string into a datetime object. The result is displayed as an
# object-dtype Series in which naive and offset-aware values sit side by side.
parsed_rows = df_datetime.apply(lambda row: datetime.fromisoformat(row['datetime']), axis=1)
df_datetime1 = parsed_rows.rename('datetime')
df_datetime1


2                  2023-12-25 12:10:00
3                  2023-12-25 23:00:00
4                  2011-11-04 00:00:00
5                  2011-11-04 00:00:00
6                  2011-11-04 00:05:23
7           2011-11-04 00:05:23.283185
8                  2011-11-04 00:05:23
9           2011-11-04 00:05:23.283185
10           2011-11-04 00:05:23+00:00
11    2011-11-04 00:05:23.283185+08:00
12           2011-11-04 00:05:23+08:00
Name: datetime, dtype: object

In [82]:
# Label-based slice: keeps the rows whose index label is at most 8 (the labels
# start at 2 here). All of the kept values are naive, i.e. carry no UTC offset.
df_datetime1.loc[0:8]

2           2023-12-25 12:10:00
3           2023-12-25 23:00:00
4           2011-11-04 00:00:00
5           2011-11-04 00:00:00
6           2011-11-04 00:05:23
7    2011-11-04 00:05:23.283185
8           2011-11-04 00:05:23
Name: datetime, dtype: object

In [83]:
# Convert the naive subset (index labels up to 8) into a datetime64 Series.
naive_subset = df_datetime1.loc[0:8]
df_no_tz = pd.to_datetime(naive_subset)
df_no_tz

2   2023-12-25 12:10:00.000000
3   2023-12-25 23:00:00.000000
4   2011-11-04 00:00:00.000000
5   2011-11-04 00:00:00.000000
6   2011-11-04 00:05:23.000000
7   2011-11-04 00:05:23.283185
8   2011-11-04 00:05:23.000000
Name: datetime, dtype: datetime64[ns]

In [84]:
#| export
def get_timezone_abbreviation(timezone_name):
    """
    Returns the timezone abbreviation (``%Z``) in effect at the current moment.

    The zone is resolved with ``pytz.timezone``, so a name pytz does not know
    raises whatever that call raises. Because the current time is used, the
    abbreviation for a given zone depends on when the function is called.
    """
    current_time = datetime.now(pytz.timezone(timezone_name))
    return current_time.strftime("%Z")

def validate_timezone_in_iana(timezone_name):
    """
    Validates if a value is one of the timezone names known to pytz.

    The comparison is exact (case-sensitive).

    Args:
        timezone_name: Candidate zone name, normally a ``str``.

    Returns:
        True when ``timezone_name`` is a string listed by pytz; False otherwise,
        including for non-string values.
    """
    # Only strings can be zone names; this also keeps unhashable values
    # (e.g. a list) away from the set lookup below.
    if not isinstance(timezone_name, str):
        return False
    # Set form of pytz.all_timezones: same members, constant-time membership.
    return timezone_name in pytz.all_timezones_set


In [85]:

# Candidate timezone labels: a mix of names pytz knows and names it does not.
time_zone_strings = [
    'Eastern Standard Time',
    'Eastern Daylight Time',
    'US/Eastern',
    'US/Daylight',
    'Asia/Shanghai',
    'Asia/Mumbai',
    'America/New_York',
    'Europe/London',
    'America/Los_Angeles',
    'Asia/Kolkata',
    'Europe/London',
    'Asia/Hong_Kong',
    'Asia/Tokyo',
]
df_timezone_str = pd.DataFrame({'timezone': time_zone_strings})

# Flag each label with whether pytz lists it.
df_timezone_str['validity'] = df_timezone_str['timezone'].map(validate_timezone_in_iana)
df_timezone_str


Unnamed: 0,timezone,validity
0,Eastern Standard Time,False
1,Eastern Daylight Time,False
2,US/Eastern,True
3,US/Daylight,False
4,Asia/Shanghai,True
5,Asia/Mumbai,False
6,America/New_York,True
7,Europe/London,True
8,America/Los_Angeles,True
9,Asia/Kolkata,True


In [104]:
#| export
# Replacement names for timezone labels that pytz does not list.
timezone_fixing_map = {
    'Eastern Standard Time': 'US/Eastern',
    'Eastern Daylight Time': 'US/Eastern',
    'US/Daylight': 'US/Eastern',
    'Asia/Mumbai': 'Asia/Calcutta',
}

In [105]:
#| export
def fix_timezone(timezone_name):
    """
    Translates a known non-standard timezone label into its replacement name.

    Labels without an entry in ``timezone_fixing_map`` are returned unchanged.
    """
    try:
        return timezone_fixing_map[timezone_name]
    except KeyError:
        # No correction registered for this label: pass it through as-is.
        return timezone_name

In [87]:
# Show each raw label, its corrected name, and that zone's current abbreviation.
for tz in time_zone_strings:
    fixed_name = fix_timezone(tz)
    print(f"{tz} -> {fixed_name} -> {get_timezone_abbreviation(fixed_name)}")


Eastern Standard Time -> US/Eastern -> EDT
Eastern Daylight Time -> US/Eastern -> EDT
US/Eastern -> US/Eastern -> EDT
US/Daylight -> US/Eastern -> EDT
Asia/Shanghai -> Asia/Shanghai -> CST
Asia/Mumbai -> Asia/Calcutta -> IST
America/New_York -> America/New_York -> EDT
Europe/London -> Europe/London -> BST
America/Los_Angeles -> America/Los_Angeles -> PDT
Asia/Kolkata -> Asia/Kolkata -> IST
Europe/London -> Europe/London -> BST
Asia/Hong_Kong -> Asia/Hong_Kong -> HKT
Asia/Tokyo -> Asia/Tokyo -> JST


In [88]:

# Labels pytz does not list: rows whose 'validity' flag is False. The flag
# already holds the validator's result, so it is not recomputed row by row.
df_timezone_unidentified = df_timezone_str[~df_timezone_str['validity']]
df_timezone_unidentified

Unnamed: 0,timezone,validity
0,Eastern Standard Time,False
1,Eastern Daylight Time,False
3,US/Daylight,False
5,Asia/Mumbai,False


In [89]:

# Preview the replacement name chosen for each unrecognised label.
df_timezone_unidentified.apply(lambda x: fix_timezone(x['timezone']), axis=1)


0       US/Eastern
1       US/Eastern
3       US/Eastern
5    Asia/Calcutta
dtype: object

In [90]:
# Corrected name for every label; labels without a correction pass through unchanged.
df_timezone_str['timezone_fixed'] = df_timezone_str['timezone'].map(fix_timezone)
df_timezone_str


Unnamed: 0,timezone,validity,timezone_fixed
0,Eastern Standard Time,False,US/Eastern
1,Eastern Daylight Time,False,US/Eastern
2,US/Eastern,True,US/Eastern
3,US/Daylight,False,US/Eastern
4,Asia/Shanghai,True,Asia/Shanghai
5,Asia/Mumbai,False,Asia/Calcutta
6,America/New_York,True,America/New_York
7,Europe/London,True,Europe/London
8,America/Los_Angeles,True,America/Los_Angeles
9,Asia/Kolkata,True,Asia/Kolkata


In [91]:

# Rows whose corrected name is listed by pytz. Later cells add columns to this
# frame, so take an explicit copy rather than relying on the filtered result.
fixed_is_valid = df_timezone_str['timezone_fixed'].map(validate_timezone_in_iana)
df_timezone = df_timezone_str[fixed_is_valid].copy()
df_timezone

Unnamed: 0,timezone,validity,timezone_fixed
0,Eastern Standard Time,False,US/Eastern
1,Eastern Daylight Time,False,US/Eastern
2,US/Eastern,True,US/Eastern
3,US/Daylight,False,US/Eastern
4,Asia/Shanghai,True,Asia/Shanghai
5,Asia/Mumbai,False,Asia/Calcutta
6,America/New_York,True,America/New_York
7,Europe/London,True,Europe/London
8,America/Los_Angeles,True,America/Los_Angeles
9,Asia/Kolkata,True,Asia/Kolkata


In [92]:
# Extract the rows whose original label is not listed by pytz, using the
# 'validity' flag instead of re-validating column position 0 on every row.
df_timezone_invalid = df_timezone_str[~df_timezone_str['validity']]
df_timezone_invalid

Unnamed: 0,timezone,validity,timezone_fixed
0,Eastern Standard Time,False,US/Eastern
1,Eastern Daylight Time,False,US/Eastern
3,US/Daylight,False,US/Eastern
5,Asia/Mumbai,False,Asia/Calcutta


In [93]:
# Current abbreviation of every corrected zone name, as a Series named 'abbr'.
df_abbr = df_timezone['timezone_fixed'].map(get_timezone_abbreviation).rename('abbr')
df_abbr

0     EDT
1     EDT
2     EDT
3     EDT
4     CST
5     IST
6     EDT
7     BST
8     PDT
9     IST
10    BST
11    HKT
12    JST
Name: abbr, dtype: object

In [94]:
# Show the frame as it stands before the derived columns are attached.
df_timezone

Unnamed: 0,timezone,validity,timezone_fixed
0,Eastern Standard Time,False,US/Eastern
1,Eastern Daylight Time,False,US/Eastern
2,US/Eastern,True,US/Eastern
3,US/Daylight,False,US/Eastern
4,Asia/Shanghai,True,Asia/Shanghai
5,Asia/Mumbai,False,Asia/Calcutta
6,America/New_York,True,America/New_York
7,Europe/London,True,Europe/London
8,America/Los_Angeles,True,America/Los_Angeles
9,Asia/Kolkata,True,Asia/Kolkata


In [95]:
# Attach the abbreviations as a new 'abbr' column; rows are matched by index label.
df_timezone.loc[:,'abbr'] = df_abbr
df_timezone


Unnamed: 0,timezone,validity,timezone_fixed,abbr
0,Eastern Standard Time,False,US/Eastern,EDT
1,Eastern Daylight Time,False,US/Eastern,EDT
2,US/Eastern,True,US/Eastern,EDT
3,US/Daylight,False,US/Eastern,EDT
4,Asia/Shanghai,True,Asia/Shanghai,CST
5,Asia/Mumbai,False,Asia/Calcutta,IST
6,America/New_York,True,America/New_York,EDT
7,Europe/London,True,Europe/London,BST
8,America/Los_Angeles,True,America/Los_Angeles,PDT
9,Asia/Kolkata,True,Asia/Kolkata,IST


In [96]:

# Current UTC offset ('%z', e.g. '+0800') of every corrected zone name.
def _current_offset(zone_name):
    return datetime.now(pytz.timezone(zone_name)).strftime('%z')

df_utc_offset = df_timezone['timezone_fixed'].map(_current_offset).rename('utc_offset')
df_utc_offset


0     -0400
1     -0400
2     -0400
3     -0400
4     +0800
5     +0530
6     -0400
7     +0100
8     -0700
9     +0530
10    +0100
11    +0800
12    +0900
Name: utc_offset, dtype: object

In [97]:

# Attach the offsets as a new 'utc_offset' column; rows are matched by index label.
df_timezone.loc[:,'utc_offset'] = df_utc_offset
df_timezone


Unnamed: 0,timezone,validity,timezone_fixed,abbr,utc_offset
0,Eastern Standard Time,False,US/Eastern,EDT,-400
1,Eastern Daylight Time,False,US/Eastern,EDT,-400
2,US/Eastern,True,US/Eastern,EDT,-400
3,US/Daylight,False,US/Eastern,EDT,-400
4,Asia/Shanghai,True,Asia/Shanghai,CST,800
5,Asia/Mumbai,False,Asia/Calcutta,IST,530
6,America/New_York,True,America/New_York,EDT,-400
7,Europe/London,True,Europe/London,BST,100
8,America/Los_Angeles,True,America/Los_Angeles,PDT,-700
9,Asia/Kolkata,True,Asia/Kolkata,IST,530


In [98]:
# Print the current local time, abbreviation and UTC offset for each zone.
for tz in df_timezone.loc[:,'timezone_fixed']:
    # Take one timestamp per zone so all three printed fields describe the same instant.
    now = datetime.now(pytz.timezone(tz))
    print(f"{tz}: {now} - {now.strftime('%Z')}, {now.strftime('%z')}")

US/Eastern: 2024-09-18 01:27:45.676836-04:00 - EDT, -0400
US/Eastern: 2024-09-18 01:27:45.676916-04:00 - EDT, -0400
US/Eastern: 2024-09-18 01:27:45.676956-04:00 - EDT, -0400
US/Eastern: 2024-09-18 01:27:45.677000-04:00 - EDT, -0400
Asia/Shanghai: 2024-09-18 13:27:45.677040+08:00 - CST, +0800
Asia/Calcutta: 2024-09-18 10:57:45.677080+05:30 - IST, +0530
America/New_York: 2024-09-18 01:27:45.677121-04:00 - EDT, -0400
Europe/London: 2024-09-18 06:27:45.677151+01:00 - BST, +0100
America/Los_Angeles: 2024-09-17 22:27:45.677175-07:00 - PDT, -0700
Asia/Kolkata: 2024-09-18 10:57:45.677201+05:30 - IST, +0530
Europe/London: 2024-09-18 06:27:45.677221+01:00 - BST, +0100
Asia/Hong_Kong: 2024-09-18 13:27:45.677240+08:00 - HKT, +0800
Asia/Tokyo: 2024-09-18 14:27:45.677262+09:00 - JST, +0900


In [99]:
# Pair naive timestamps with zone names by position (zip stops at the shorter
# input), localize each one, and show the same instant in UTC.
for dt, tz in zip(df_no_tz, df_timezone.loc[:, 'timezone_fixed']):
    localized = dt.tz_localize(tz)
    print(f"{dt}: {localized}, as {localized.astimezone('utc')}")

2023-12-25 12:10:00: 2023-12-25 12:10:00-05:00, as 2023-12-25 17:10:00+00:00
2023-12-25 23:00:00: 2023-12-25 23:00:00-05:00, as 2023-12-26 04:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00-04:00, as 2011-11-04 04:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00-04:00, as 2011-11-04 04:00:00+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23+08:00, as 2011-11-03 16:05:23+00:00
2011-11-04 00:05:23.283185: 2011-11-04 00:05:23.283185+05:30, as 2011-11-03 18:35:23.283185+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23-04:00, as 2011-11-04 04:05:23+00:00


In [100]:
from zoneinfo import ZoneInfo

In [101]:
# Compare three ways of attaching a zone to a naive timestamp: pytz localize(),
# replace(tzinfo=...), and pandas tz_localize(); then show the instant in UTC.
# NOTE(review): the values iterated here appear to be pandas Timestamps, for which
# the printed results agree. pytz's documentation warns that passing a pytz zone as
# tzinfo to a plain datetime.datetime is unreliable - prefer localize() there.
for dt, tz in zip(df_no_tz, df_timezone.loc[:, 'timezone_fixed']):
    tz_zone = pytz.timezone(tz)
    via_pandas = dt.tz_localize(tz)
    print(f"{dt}: {tz_zone.localize(dt)}, or {dt.replace(tzinfo=tz_zone)}, or {via_pandas}, as {via_pandas.astimezone('utc')}")

2023-12-25 12:10:00: 2023-12-25 12:10:00-05:00, or 2023-12-25 12:10:00-05:00, or 2023-12-25 12:10:00-05:00, as 2023-12-25 17:10:00+00:00
2023-12-25 23:00:00: 2023-12-25 23:00:00-05:00, or 2023-12-25 23:00:00-05:00, or 2023-12-25 23:00:00-05:00, as 2023-12-26 04:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00-04:00, or 2011-11-04 00:00:00-04:00, or 2011-11-04 00:00:00-04:00, as 2011-11-04 04:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00-04:00, or 2011-11-04 00:00:00-04:00, or 2011-11-04 00:00:00-04:00, as 2011-11-04 04:00:00+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23+08:00, or 2011-11-04 00:05:23+08:00, or 2011-11-04 00:05:23+08:00, as 2011-11-03 16:05:23+00:00
2011-11-04 00:05:23.283185: 2011-11-04 00:05:23.283185+05:30, or 2011-11-04 00:05:23.283185+05:30, or 2011-11-04 00:05:23.283185+05:30, as 2011-11-03 18:35:23.283185+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23-04:00, or 2011-11-04 00:05:23-04:00, or 2011-11-04 00:05:23-04:00, as 2011-11-04 04:05:23+00:00


In [106]:
#| hide
# Run nbdev's export step, which writes the `#| export` cells out as Python module code.
import nbdev; nbdev.nbdev_export()