## Dates & Times
### Programming 06 slides

#### Working with dates and times

In [1]:
import pandas as pd

# Toy frame: one row per event, with both date columns still held as text
df = pd.DataFrame(
    data=[
        ("2023-01-01", "2023-01-10"),
        ("2023-03-15", "2023-03-20"),
    ],
    columns=["start", "end"],
)

# Print the frame, then a separator, then the per-column dtype summary
print(df)
print("============================================================")
df.info()


        start         end
0  2023-01-01  2023-01-10
1  2023-03-15  2023-03-20
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   start   2 non-null      object
 1   end     2 non-null      object
dtypes: object(2)
memory usage: 164.0+ bytes


In [22]:
# Parse each text column into datetimes, one column at a time
for column in ("start", "end"):
    df[column] = pd.to_datetime(df[column])

# Print the frame, a separator, and the dtype summary after the conversion
print(df)
print("============================================================")
df.info()

       start        end
0 2023-01-01 2023-01-10
1 2023-03-15 2023-03-20
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   start   2 non-null      datetime64[ns]
 1   end     2 non-null      datetime64[ns]
dtypes: datetime64[ns](2)
memory usage: 164.0 bytes


In [23]:
# Subtract the start column from the end column and store it as `duration`
df = df.assign(duration=lambda frame: frame["end"] - frame["start"])
print(df)


       start        end duration
0 2023-01-01 2023-01-10   9 days
1 2023-03-15 2023-03-20   5 days


In [24]:
# Shift every start forward by one week plus three extra hours
df = df.assign(
    start_plus_week_extra=lambda frame: frame["start"] + pd.Timedelta(days=7, hours=3)
)
# Optional follow-up: uncomment to also store end minus the shifted start
#df = df.assign(duration_week_extra=df["end"] - df["start_plus_week_extra"])
df


Unnamed: 0,start,end,duration,start_plus_week_extra
0,2023-01-01,2023-01-10,9 days,2023-01-08 03:00:00
1,2023-03-15,2023-03-20,5 days,2023-03-22 03:00:00


## Date-times as a Data Type in pandas

Single timestamp: 

1. It is precise to nanoseconds and supports operations like adding/subtracting timedeltas.

2. Example use: storing a single event’s exact date and time.

In [None]:
# Single Timestamp, aka a single point in time
ts = pd.Timestamp("2023-01-01 12:30:00")
# Optional demo: uncomment to move the timestamp back by 2 days and 4 hours
#ts = ts - pd.Timedelta(days=2, hours=4)
print("Timestamp:", ts)

Timestamp: 2023-01-01 12:30:00


pd.Timedelta represents a duration (the difference between two times).

1. You can add or subtract it from a Timestamp to shift time forward/backward.

2. Example: adding 3 days and 5 hours to a deadline.

In [None]:
# A Timedelta is a length of time, here three days and five hours
delta = pd.Timedelta(hours=5, days=3)

# Optional demo: uncomment to shorten the duration by 2 hours
#delta = delta - pd.Timedelta(hours=2)
print("Timedelta:", delta)

Timedelta: 3 days 05:00:00


pd.to_datetime() converts text into datetime objects.

In [27]:
# Parse a date-only string into a pandas datetime value
date = pd.to_datetime(arg="2023-01-01")
print("Datetime:", date)

Datetime: 2023-01-01 00:00:00


## Time Periods in pandas

In [28]:
# Timedelta
# Fixed point in time to start from
ts = pd.Timestamp("2023-01-01 12:00:00")

# Duration of 1 hour
delta = pd.Timedelta(hours=1)

# Adding the duration to the timestamp moves it forward by that amount
print("Timestamp + 1h:", ts + delta)


Timestamp + 1h: 2023-01-01 13:00:00


In [None]:
# Interval
# Define the start explicitly so this cell does not depend on whichever
# earlier cell happened to assign `ts` last
ts = pd.Timestamp("2023-01-01 12:00:00")

# Duration of 5 hours
delta = pd.Timedelta(hours=5)

# Interval from 12:00 to 17:00; closed="both" includes both endpoints
interval = pd.Interval(left=ts, right=ts + delta, closed="both")
print("Interval:", interval)


Interval: [2023-01-01 12:00:00, 2023-01-01 17:00:00]


In [45]:
# Membership tests: `in` reports whether a point in time lies inside the interval
print(ts in interval)
print((ts + delta) in interval)
print((ts - delta) in interval)

True
True
False


All of this is also possible on a dataset. Here we load flighttimes.csv, which contains data on individual flights; at first the time columns are stored as strings.

In [30]:
# Load the flight data from CSV and display the resulting frame
flights = pd.read_csv(filepath_or_buffer="flighttimes.csv")
flights

Unnamed: 0,dep_time,sched_dep_time,arr_time,sched_arr_time,carrier,flight,tailnum,origin,dest,distance,origin_lat,origin_long,dest_lat,dest_long
0,2013-01-01 05:17,2013-01-01 05:15,2013-01-01 08:30,2013-01-01 08:19,UA,1545,N14228,EWR,IAH,1400,40.692497,-74.168661,29.980472,-95.339722
1,2013-01-01 05:33,2013-01-01 05:29,2013-01-01 08:50,2013-01-01 08:30,UA,1714,N24211,LGA,IAH,1416,40.777243,-73.872609,29.980472,-95.339722
2,2013-01-01 05:42,2013-01-01 05:40,2013-01-01 09:23,2013-01-01 08:50,AA,1141,N619AA,JFK,MIA,1089,40.639751,-73.778926,25.793250,-80.290556
3,2013-01-01 05:44,2013-01-01 05:45,2013-01-01 10:04,2013-01-01 10:22,B6,725,N804JB,JFK,BQN,1576,40.639751,-73.778926,18.494861,-67.129444
4,2013-01-01 05:54,2013-01-01 06:00,2013-01-01 08:12,2013-01-01 08:37,DL,461,N668DN,LGA,ATL,762,40.777243,-73.872609,33.640444,-84.426944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,,2013-09-30 14:55,,2013-09-30 16:34,9E,3393,,JFK,DCA,213,40.639751,-73.778926,38.852083,-77.037722
336772,,2013-09-30 22:00,,2013-09-30 23:12,9E,3525,,LGA,SYR,198,40.777243,-73.872609,43.111187,-76.106311
336773,,2013-09-30 12:10,,2013-09-30 13:30,MQ,3461,N535MQ,LGA,BNA,764,40.777243,-73.872609,36.124477,-86.678182
336774,,2013-09-30 11:59,,2013-09-30 13:44,MQ,3572,N511MQ,LGA,CLE,419,40.777243,-73.872609,41.410894,-81.849397


In [31]:
# Column overview: dtype and non-null count for every column of `flights`
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   dep_time        328521 non-null  object 
 1   sched_dep_time  336776 non-null  object 
 2   arr_time        328063 non-null  object 
 3   sched_arr_time  336776 non-null  object 
 4   carrier         336776 non-null  object 
 5   flight          336776 non-null  int64  
 6   tailnum         334264 non-null  object 
 7   origin          336776 non-null  object 
 8   dest            336776 non-null  object 
 9   distance        336776 non-null  int64  
 10  origin_lat      336776 non-null  float64
 11  origin_long     336776 non-null  float64
 12  dest_lat        336776 non-null  float64
 13  dest_long       336776 non-null  float64
dtypes: float64(4), int64(2), object(8)
memory usage: 36.0+ MB


In [47]:
import pandas as pd

# Reload the raw data and build the demo columns in a single chain:
#   dep_time      -> parsed from text into datetimes
#   delay         -> constant 90 minutes (pretend all flights were delayed 90 minutes)
#   new_departure -> departure time shifted by the delay
# Only those three columns are kept.
flights = (
    pd.read_csv("flighttimes.csv")
    .assign(
        dep_time=lambda frame: pd.to_datetime(frame["dep_time"]),
        delay=pd.Timedelta(minutes=90),
        new_departure=lambda frame: frame["dep_time"] + frame["delay"],
    )
    .loc[:, ["dep_time", "delay", "new_departure"]]
)
print(flights.head(10))

             dep_time           delay       new_departure
0 2013-01-01 05:17:00 0 days 01:30:00 2013-01-01 06:47:00
1 2013-01-01 05:33:00 0 days 01:30:00 2013-01-01 07:03:00
2 2013-01-01 05:42:00 0 days 01:30:00 2013-01-01 07:12:00
3 2013-01-01 05:44:00 0 days 01:30:00 2013-01-01 07:14:00
4 2013-01-01 05:54:00 0 days 01:30:00 2013-01-01 07:24:00
5 2013-01-01 05:54:00 0 days 01:30:00 2013-01-01 07:24:00
6 2013-01-01 05:55:00 0 days 01:30:00 2013-01-01 07:25:00
7 2013-01-01 05:57:00 0 days 01:30:00 2013-01-01 07:27:00
8 2013-01-01 05:57:00 0 days 01:30:00 2013-01-01 07:27:00
9 2013-01-01 05:58:00 0 days 01:30:00 2013-01-01 07:28:00


### Date and Time Formats in pandas

1. pd.to_datetime() can automatically parse many common formats

2. If automatic parsing fails, you can specify a format string, for example: pd.to_datetime("2023-01-15", format="%Y-%m-%d")

In [33]:
# Each call pairs a text layout with the matching format string.
# Only the value of the last expression is displayed by the notebook.
pd.to_datetime("2023-01-15", format="%Y-%m-%d")  # year-month-day -> 2023-01-15 00:00:00

pd.to_datetime("15/01/2023", format="%d/%m/%Y")  # day/month/year -> 2023-01-15 00:00:00

pd.to_datetime("01-15-23 14:30", format="%m-%d-%y %H:%M")  # two-digit year plus time -> 2023-01-15 14:30:00

# And a whole lot more...

Timestamp('2023-01-15 14:30:00')

## `<Series>.dt.total_seconds()`

We can convert timedelta values (the difference between two datetimes) to seconds with `.dt.total_seconds()`, and to minutes by dividing that result by 60.

In [34]:
# Scheduled vs actual meeting times, processed in one chain:
#   1. parse both text columns into datetimes
#   2. delay in seconds = actual - scheduled (negative when the meeting started early)
#   3. delay in minutes = seconds / 60
df = (
    pd.DataFrame({
        "scheduled": ["2023-05-01 09:00:00", "2023-05-01 14:00:00"],
        "actual":    ["2023-05-01 09:10:00", "2023-05-01 13:50:00"]
    })
    .assign(
        scheduled=lambda meetings: pd.to_datetime(meetings["scheduled"]),
        actual=lambda meetings: pd.to_datetime(meetings["actual"]),
        delay_secs=lambda meetings: (meetings["actual"] - meetings["scheduled"]).dt.total_seconds(),
        delay_min=lambda meetings: meetings["delay_secs"] / 60,
    )
)

print(df)

            scheduled              actual  delay_secs  delay_min
0 2023-05-01 09:00:00 2023-05-01 09:10:00       600.0       10.0
1 2023-05-01 14:00:00 2023-05-01 13:50:00      -600.0      -10.0
