# core data utilities

> Core utilities for data processing:
> datetime parsing, time-zone handling, validity checks, and filtering.

In [2]:
#| default_exp data.utils

In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [5]:
#| export
from datetime import datetime
import pandas as pd
import pytz

In [6]:
#| export
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this cell is exported, so importing the generated module also flips
# the option for the importing process — confirm that is intended.
pd.options.mode.chained_assignment = None

In [7]:
#| export
def validate_datetime(date_string, format_string):
    """
    Check whether `date_string` parses as a datetime under `format_string`.

    Parameters
    ----------
    date_string : str
        Candidate text, e.g. ``"2023-12-25"``.
    format_string : str
        A `datetime.strptime` format, e.g. ``"%Y-%m-%d"``.

    Returns
    -------
    bool
        True when `datetime.strptime` accepts the pair. False when the text does
        not match the format, or when an argument is not a string (for example
        ``None`` or NaN taken from a DataFrame column).
    """
    try:
        datetime.strptime(date_string, format_string)
    except (ValueError, TypeError):
        # ValueError: text does not match the format.
        # TypeError: a non-string was passed; a validator should answer False, not crash.
        return False
    return True
    
def validate_datetime_in_iso_format(date_text):
    """
    Check whether `date_text` is accepted by `datetime.fromisoformat`.

    The accepted spellings are whatever the running interpreter's
    `datetime.fromisoformat` supports. Python 3.11 widened it to most ISO 8601
    forms, so compact inputs such as ``"20111104"`` can validate differently
    on older versions.

    Parameters
    ----------
    date_text : str
        Candidate text, e.g. ``"2011-11-04T00:05:23+08:00"``.

    Returns
    -------
    bool
        True when the text parses. False when it does not, or when the input
        is not a string (for example ``None`` or NaN from a DataFrame column).
    """
    try:
        datetime.fromisoformat(date_text)
    except (ValueError, TypeError):
        # ValueError: not a valid ISO datetime. TypeError: input was not a string.
        return False
    return True


In [28]:
# Example usage: run the ISO validator over a mix of acceptable and broken inputs.
date_strings = [
    # out-of-range hour / minute
    "2023-12-25 24:00:00",
    "2023-12-25 12:60:00.12",
    # date and time separated by a space
    "2023-12-25 12:10:00",
    "2023-12-25 23:00:00",
    # date only, extended and compact spellings
    "2011-11-04",
    "20111104",
    # "T"-separated, with and without fractional seconds
    "2011-11-04T00:05:23",
    "2011-11-04T00:05:23.283185",
    "20111104T000523",
    "20111104T000523.283185",
    # with an explicit UTC designator or offset
    "2011-11-04T00:05:23Z",
    "2011-11-04T00:05:23.283185+08:00",
    "2011-11-04T00:05:23+08:00",
]
# Not used by the loop below, which exercises the ISO validator only.
format_string = "%Y-%m-%d"

for s in date_strings:
    print(
        f"{s} is Valid datetime string"
        if validate_datetime_in_iso_format(s)
        else f"{s} is Invalid datetime string"
    )

2023-12-25 24:00:00 is Invalid datetime string
2023-12-25 12:60:00.12 is Invalid datetime string
2023-12-25 12:10:00 is Valid datetime string
2023-12-25 23:00:00 is Valid datetime string
2011-11-04 is Valid datetime string
20111104 is Valid datetime string
2011-11-04T00:05:23 is Valid datetime string
2011-11-04T00:05:23.283185 is Valid datetime string
20111104T000523 is Valid datetime string
20111104T000523.283185 is Valid datetime string
2011-11-04T00:05:23Z is Valid datetime string
2011-11-04T00:05:23.283185+08:00 is Valid datetime string
2011-11-04T00:05:23+08:00 is Valid datetime string


In [30]:
# One row per candidate string, with the validator's verdict next to it.
df_datetime_str = pd.DataFrame({'datetime': date_strings})
# Map over the column by name; a row-wise apply reading `iloc[0]` silently depends
# on 'datetime' staying the first column.
df_datetime_str['validity'] = df_datetime_str['datetime'].map(validate_datetime_in_iso_format)
df_datetime_str


Unnamed: 0,datetime,validity
0,2023-12-25 24:00:00,False
1,2023-12-25 12:60:00.12,False
2,2023-12-25 12:10:00,True
3,2023-12-25 23:00:00,True
4,2011-11-04,True
5,20111104,True
6,2011-11-04T00:05:23,True
7,2011-11-04T00:05:23.283185,True
8,20111104T000523,True
9,20111104T000523.283185,True


In [31]:
# Keep only the rows whose string passes ISO validation; original row labels are preserved.
# The 'datetime' column is addressed by name because the frame now has two columns and a
# positional `iloc[0]` would break if their order ever changed.
df_datetime = df_datetime_str[df_datetime_str['datetime'].map(validate_datetime_in_iso_format)]
df_datetime


Unnamed: 0,datetime,validity
2,2023-12-25 12:10:00,True
3,2023-12-25 23:00:00,True
4,2011-11-04,True
5,20111104,True
6,2011-11-04T00:05:23,True
7,2011-11-04T00:05:23.283185,True
8,20111104T000523,True
9,20111104T000523.283185,True
10,2011-11-04T00:05:23Z,True
11,2011-11-04T00:05:23.283185+08:00,True


In [32]:
# Complement of the valid set: rows whose string fails ISO validation.
# The column is selected by name rather than by position within each row.
df_datetime_invalid = df_datetime_str[
    df_datetime_str['datetime'].map(lambda text: not validate_datetime_in_iso_format(text))
]
df_datetime_invalid


Unnamed: 0,datetime,validity
0,2023-12-25 24:00:00,False
1,2023-12-25 12:60:00.12,False


In [11]:
# The validated values are still plain strings (dtype object) at this point.
df_datetime['datetime']

1                  2023-12-25 12:10:00
2                  2023-12-25 23:00:00
3                           2011-11-04
4                             20111104
5                  2011-11-04T00:05:23
6           2011-11-04T00:05:23.283185
7                      20111104T000523
8               20111104T000523.283185
9                 2011-11-04T00:05:23Z
10    2011-11-04T00:05:23.283185+08:00
11           2011-11-04T00:05:23+08:00
Name: datetime, dtype: object

In [12]:
# Parse each validated string into a `datetime`. Naive and tz-aware values end up
# side by side, so the resulting Series has dtype object.
df_datetime1 = df_datetime.apply(
    lambda row: datetime.fromisoformat(row['datetime']),
    axis=1,
).rename('datetime')
df_datetime1


1                  2023-12-25 12:10:00
2                  2023-12-25 23:00:00
3                  2011-11-04 00:00:00
4                  2011-11-04 00:00:00
5                  2011-11-04 00:05:23
6           2011-11-04 00:05:23.283185
7                  2011-11-04 00:05:23
8           2011-11-04 00:05:23.283185
9            2011-11-04 00:05:23+00:00
10    2011-11-04 00:05:23.283185+08:00
11           2011-11-04 00:05:23+08:00
Name: datetime, dtype: object

In [13]:
# Label-based, end-inclusive slice: rows labelled 0 through 8, which are the tz-naive values in this run.
df_datetime1.loc[0:8]

1           2023-12-25 12:10:00
2           2023-12-25 23:00:00
3           2011-11-04 00:00:00
4           2011-11-04 00:00:00
5           2011-11-04 00:05:23
6    2011-11-04 00:05:23.283185
7           2011-11-04 00:05:23
8    2011-11-04 00:05:23.283185
Name: datetime, dtype: object

In [14]:
# Select the tz-naive values by inspecting each one rather than trusting a hard-coded
# label range (`.loc[0:8]`), which picks the wrong rows as soon as `date_strings` changes.
# Tz-aware values are left out so the result is a plain datetime64[ns] Series.
df_no_tz = pd.to_datetime(df_datetime1[df_datetime1.map(lambda dt: dt.tzinfo is None)])
df_no_tz

1   2023-12-25 12:10:00.000000
2   2023-12-25 23:00:00.000000
3   2011-11-04 00:00:00.000000
4   2011-11-04 00:00:00.000000
5   2011-11-04 00:05:23.000000
6   2011-11-04 00:05:23.283185
7   2011-11-04 00:05:23.000000
8   2011-11-04 00:05:23.283185
Name: datetime, dtype: datetime64[ns]

In [15]:
#| export
def get_timezone_abbreviation(timezone_name, when=None):
    """
    Return the abbreviation (e.g. ``"CST"``, ``"EDT"``) a time zone uses at a given moment.

    Parameters
    ----------
    timezone_name : str
        Zone name understood by `pytz.timezone`, e.g. ``"America/New_York"``.
    when : datetime, optional
        Moment to evaluate. A naive value is read as wall-clock time in the zone;
        an aware value is converted into the zone. Defaults to the current time,
        in which case the answer changes with daylight saving (EDT vs. EST).

    Returns
    -------
    str
        The ``%Z`` abbreviation in effect at that moment.

    Raises
    ------
    pytz.UnknownTimeZoneError
        If `timezone_name` is not a zone pytz knows.
    """
    timezone = pytz.timezone(timezone_name)
    if when is None:
        moment = datetime.now(timezone)
    elif when.tzinfo is None:
        # pytz zones must be attached with localize(); replace(tzinfo=...) would
        # pick the zone's first historical offset instead of the one in effect.
        moment = timezone.localize(when)
    else:
        moment = when.astimezone(timezone)
    return moment.strftime("%Z")

def validate_timezone_in_iana(timezone_name):
    """
    Check whether `timezone_name` is one of the zone names shipped with pytz.

    Parameters
    ----------
    timezone_name : str
        Candidate zone name, e.g. ``"Asia/Tokyo"``. The comparison is an exact
        string match.

    Returns
    -------
    bool
        True when the name is in pytz's zone list; False otherwise, including
        for non-string input.
    """
    # Hash lookup instead of scanning the full list on every call (this runs once
    # per DataFrame row). The isinstance guard keeps unhashable inputs, which the
    # list scan tolerated, answering False instead of raising.
    return isinstance(timezone_name, str) and timezone_name in pytz.all_timezones_set


In [16]:

# Candidate zone names; two of them are deliberately not IANA identifiers.
time_zone_strings = [
    'Asia/Shanghai',
    'Asia/Mumbai',            # not an IANA identifier
    'America/New_York',
    'Europe/London',
    'Eastern Standard Time',  # not an IANA identifier
    'US/Eastern',
    'America/Los_Angeles',
    'Asia/Kolkata',
    'Europe/London',
    'Asia/Hong_Kong',
    'Asia/Tokyo',
]
df_timezone_str = pd.DataFrame({'timezone': time_zone_strings})

# Map over the column by name; a row-wise apply reading `iloc[0]` silently depends
# on 'timezone' staying the first column.
df_timezone_str['validity'] = df_timezone_str['timezone'].map(validate_timezone_in_iana)
df_timezone_str


Unnamed: 0,timezone,validity
0,Asia/Shanghai,True
1,Asia/Mumbai,False
2,America/New_York,True
3,Europe/London,True
4,Eastern Standard Time,False
5,US/Eastern,True
6,America/Los_Angeles,True
7,Asia/Kolkata,True
8,Europe/London,True
9,Asia/Hong_Kong,True


In [17]:

# Valid zones only, with original row labels preserved. `.copy()` makes this an
# independent frame, so the later cells that add the 'abbr' and 'utc_offset' columns
# write to it directly rather than to a filtered slice of `df_timezone_str`.
df_timezone = df_timezone_str[df_timezone_str['timezone'].map(validate_timezone_in_iana)].copy()
df_timezone

Unnamed: 0,timezone,validity
0,Asia/Shanghai,True
2,America/New_York,True
3,Europe/London,True
5,US/Eastern,True
6,America/Los_Angeles,True
7,Asia/Kolkata,True
8,Europe/London,True
9,Asia/Hong_Kong,True
10,Asia/Tokyo,True


In [26]:
# Extract the rows whose zone name failed validation.
# The column is selected by name rather than by position within each row.
df_timezone_invalid = df_timezone_str[
    df_timezone_str['timezone'].map(lambda name: not validate_timezone_in_iana(name))
]
df_timezone_invalid

Unnamed: 0,timezone,validity
1,Asia/Mumbai,False
4,Eastern Standard Time,False


In [18]:
# Current abbreviation for each validated zone, indexed like `df_timezone`.
df_abbr = df_timezone.apply(
    lambda row: get_timezone_abbreviation(row['timezone']),
    axis=1,
).rename('abbr')
df_abbr

0     CST
2     EDT
3     BST
5     EDT
6     PDT
7     IST
8     BST
9     HKT
10    JST
Name: abbr, dtype: object

In [19]:
# Validated zones before any derived columns are attached.
df_timezone

Unnamed: 0,timezone,validity
0,Asia/Shanghai,True
2,America/New_York,True
3,Europe/London,True
5,US/Eastern,True
6,America/Los_Angeles,True
7,Asia/Kolkata,True
8,Europe/London,True
9,Asia/Hong_Kong,True
10,Asia/Tokyo,True


In [20]:
# Attach the abbreviations held in `df_abbr` as a new column; the assignment
# aligns the Series to the frame on their shared row labels.
df_timezone.loc[:,'abbr'] = df_abbr
df_timezone


Unnamed: 0,timezone,validity,abbr
0,Asia/Shanghai,True,CST
2,America/New_York,True,EDT
3,Europe/London,True,BST
5,US/Eastern,True,EDT
6,America/Los_Angeles,True,PDT
7,Asia/Kolkata,True,IST
8,Europe/London,True,BST
9,Asia/Hong_Kong,True,HKT
10,Asia/Tokyo,True,JST


In [21]:

# Current UTC offset of each validated zone as "+HHMM"/"-HHMM" text, indexed like `df_timezone`.
df_utc_offset = df_timezone.apply(
    lambda row: datetime.now(pytz.timezone(row['timezone'])).strftime('%z'),
    axis=1,
).rename('utc_offset')
df_utc_offset


0     +0800
2     -0400
3     +0100
5     -0400
6     -0700
7     +0530
8     +0100
9     +0800
10    +0900
Name: utc_offset, dtype: object

In [22]:

# Attach the offset strings held in `df_utc_offset` as a new column, aligned on the row labels.
df_timezone.loc[:,'utc_offset'] = df_utc_offset
df_timezone


Unnamed: 0,timezone,validity,abbr,utc_offset
0,Asia/Shanghai,True,CST,800
2,America/New_York,True,EDT,-400
3,Europe/London,True,BST,100
5,US/Eastern,True,EDT,-400
6,America/Los_Angeles,True,PDT,-700
7,Asia/Kolkata,True,IST,530
8,Europe/London,True,BST,100
9,Asia/Hong_Kong,True,HKT,800
10,Asia/Tokyo,True,JST,900


In [23]:
# Read the clock once per zone so the timestamp, abbreviation and offset printed on a
# line all describe the same instant, instead of calling datetime.now() three times.
for tz in df_timezone.loc[:,'timezone']:
    now = datetime.now(pytz.timezone(tz))
    print(f"{tz}: {now} - {now.strftime('%Z')}, {now.strftime('%z')}")

Asia/Shanghai: 2024-09-17 14:19:14.405560+08:00 - CST, +0800
America/New_York: 2024-09-17 02:19:14.405642-04:00 - EDT, -0400
Europe/London: 2024-09-17 07:19:14.405672+01:00 - BST, +0100
US/Eastern: 2024-09-17 02:19:14.405695-04:00 - EDT, -0400
America/Los_Angeles: 2024-09-16 23:19:14.405716-07:00 - PDT, -0700
Asia/Kolkata: 2024-09-17 11:49:14.405737+05:30 - IST, +0530
Europe/London: 2024-09-17 07:19:14.405756+01:00 - BST, +0100
Asia/Hong_Kong: 2024-09-17 14:19:14.405773+08:00 - HKT, +0800
Asia/Tokyo: 2024-09-17 15:19:14.405793+09:00 - JST, +0900


In [24]:
# Pair the i-th naive timestamp with the i-th validated zone (zip stops at the shorter
# input), attach the zone, and show the same instant in UTC. The localized value is
# computed once and reused for both fields.
for dt,tz in zip(df_no_tz.loc[0:8], df_timezone.loc[:,'timezone']):
    localized = dt.tz_localize(tz)
    print(f"{dt}: {localized}, as {localized.astimezone('utc')}")

2023-12-25 12:10:00: 2023-12-25 12:10:00+08:00, as 2023-12-25 04:10:00+00:00
2023-12-25 23:00:00: 2023-12-25 23:00:00-05:00, as 2023-12-26 04:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00+00:00, as 2011-11-04 00:00:00+00:00
2011-11-04 00:00:00: 2011-11-04 00:00:00-04:00, as 2011-11-04 04:00:00+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23-07:00, as 2011-11-04 07:05:23+00:00
2011-11-04 00:05:23.283185: 2011-11-04 00:05:23.283185+05:30, as 2011-11-03 18:35:23.283185+00:00
2011-11-04 00:05:23: 2011-11-04 00:05:23+00:00, as 2011-11-04 00:05:23+00:00
2011-11-04 00:05:23.283185: 2011-11-04 00:05:23.283185+08:00, as 2011-11-03 16:05:23.283185+00:00


In [25]:
#| hide
# Write the cells marked `#| export` out to the module named by `#| default_exp`.
import nbdev; nbdev.nbdev_export()