# Time Series

In [1]:
# Time series data is an important form of structured data in many different fields, such as finance, economics, ecology, neuroscience, and physics. Anything that is observed or measured at many points in time forms a time series. Many time series are fixed frequency, which is to say that data points occur at regular intervals according to some rule, such as every 15 seconds, every 5 minutes, or once per month. Time series can also be irregular, without a fixed unit of time or offset between units. How you mark and refer to time series data depends on the application, and you may have one of the following:
# Timestamps, specific instants in time 
# Fixed periods, such as the month January 2007 or the full year 2010
# Intervals of time, indicated by a start and end timestamp. Periods can be thought of as special cases of intervals
# Experiment or elapsed time; each timestamp is a measure of time relative to a particular start time (e.g., the diameter of a cookie baking each second since being placed in the oven)

# Date and Time Data Types and Tools

In [2]:
#  The python standard library includes data types for date and time data, as well as calendar-related functionality.
# The datetime, time, and calendar modules are the main places to start. The datetime.datetime type or simply datetime is widely used:
from datetime import datetime
now = datetime.now()
print(now)

2024-01-30 16:36:36.265225


In [3]:
now.year, now.month, now.day 

(2024, 1, 30)

In [4]:
# datetime stores both the date and the time down to the microsecond.
# datetime.timedelta represents the temporal difference between two datetime objects.
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, hour=8, minute=15)  # 926 days, 15:45:00

In [5]:
delta.days

926

In [6]:
delta.seconds

56700

In [7]:
# A timedelta (or a multiple of one) can be added to or subtracted from a
# datetime object to yield a new, shifted datetime object.
from datetime import timedelta

start = datetime(year=2011, month=1, day=7)

In [12]:
from datetime import datetime 
start + timedelta(12) # 2011-01-19 00:00:00 

datetime.datetime(2011, 1, 19, 0, 0)

In [13]:
# Types in datetime module 
# date stores the calendar date(year, month, day) using the Gregorian calendar 
# time stores the time as hours, minutes, seconds, and microseconds
# datetime stores both date and time
# timedelta represents the difference between two datetime values as days, seconds, and microseconds

# Converting between string and datetime 

datetime objects and pandas Timestamp objects can be formatted as strings using `str` or the `strftime` method, passing a format specification:

In [14]:
# str() renders a datetime in its default "YYYY-MM-DD HH:MM:SS.ffffff" text form.
# (The stray `from turtle import stamp` auto-import was dropped: it loaded the
# turtle/tkinter GUI stack for a name that was overwritten on the very next line.)
stamp = datetime.now()  # current local time, e.g. 2019-11-05 15:54:00.000000
str(stamp)  # e.g. '2019-11-05 15:54:00.000000'

'2024-01-30 16:48:01.991575'

In [15]:
stamp.strftime('%Y-%m-%d') # '2019-11-05' 

'2024-01-30'

In [16]:
# See the format-code table later in this notebook for a complete list of the format codes. These same format codes can be used to convert strings to dates using datetime.strptime:
value = '2011-01-03'

In [17]:
datetime.strptime(value, '%Y-%m-%d') # datetime.datetime(2011, 1, 3, 0, 0) 

datetime.datetime(2011, 1, 3, 0, 0)

In [18]:
datestrs= ['7/6/2011', '8/6/2011']

In [19]:
datestrs 

['7/6/2011', '8/6/2011']

In [20]:
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs] # [datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)] 


[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [21]:
# datetime.strptime is the best way to parse a date with a known format. However, it can be a bit annoying to have to 
# write a format spec each time, especially for common date formats. 
# In this case, you can use the parser.parse method in the third-party dateutil package (this is installed automatically when you install pandas): 
from dateutil.parser import parse 
parse('2011-01-03') # datetime.datetime(2011, 1, 3, 0, 0) 

datetime.datetime(2011, 1, 3, 0, 0)

In [22]:
# dateutil is capable of parsing most human-intelligible date representations: 
parse('Jan 31, 1997 10:45 PM') # datetime.datetime(1997, 1, 31, 22, 45) 

datetime.datetime(1997, 1, 31, 22, 45)

In [23]:
# In international locales, the day appearing before the month is very common, so you can pass dayfirst=True to indicate this:
parse('6/12/2011', dayfirst=True) # datetime.datetime(2011, 12, 6, 0, 0) 

datetime.datetime(2011, 12, 6, 0, 0)

In [24]:
# pandas mostly deals with whole arrays of dates, used either as an axis index or as a DataFrame column.
# Its to_datetime method understands many date representations; standard formats such as ISO 8601 parse very quickly.
import pandas as pd

datestrs = [
    '2011-07-06 12:00:00',
    '2011-08-06 00:00:00',
]

In [25]:
pd.to_datetime(datestrs) # DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06'], dtype='datetime64[ns]', freq=None) 

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [26]:
# It also handles values that should be considered missing (None, empty string, etc.): 
idx = pd.to_datetime(datestrs + [None]) 

In [27]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [28]:
# Datetime format specification (ISO C89 compatible) 
# %Y 4-digit year
# %y 2-digit year
# %m 2-digit month [01,12]
# %d 2-digit day [01,31]
# %H Hour (24-hour clock)[00,23]
# %I Hour (12-hour clock)[01,12]
# %M 2-digit minute [00,59]
# %S Second [00,61] (seconds 60,61 account for leap seconds)
# %w Weekday as integer [0(Sunday),6]
# %U Week number of the year [00,53]; Sunday is considered the first day of the week, and days before the first Sunday of the year are “week 0”
# %W Week number of the year [00,53]; Monday is considered the first day of the week, and days before the first Monday of the year are “week 0”
# %z UTC time zone offset as +HHMM or -HHMM; empty if time zone naive
# %F Shortcut for %Y-%m-%d (e.g., 2012-04-18)
# %D Shortcut for %m/%d/%y (e.g., 04/18/12)

In [29]:
# datetime objects also have a number of locale-specific formatting options for systems in other countries or languages.
# For example, in German, the day appears before the month:
# 2011-03-12 04:00:00 PM -> 12.03.2011 16:00:00
# In pandas, dayfirst=True is a parsing hint for ambiguous day/month strings; it is not a locale setting.
# datestrs[0] is already in ISO year-month-day form, so the hint changes nothing here and the result stays July 6:
pd.to_datetime(datestrs[0], dayfirst=True) # Timestamp('2011-07-06 12:00:00')

Timestamp('2011-07-06 12:00:00')

In [30]:
# Locale - specific date formatting 
# %a Weekday as locale’s abbreviated name. Sun, Mon, ..., Sat (en_US); So, Mo, ..., Sa (de_DE)
# %A Weekday as locale’s full name. Sunday, Monday, ..., Saturday (en_US); Sonntag, Montag, ..., Samstag (de_DE)
# %b Month as locale’s abbreviated name. Jan, Feb, ..., Dec (en_US); Jan, Feb, ..., Dez (de_DE)
# %B Month as locale’s full name. January, February, ..., December (en_US); Januar, Februar, ..., Dezember (de_DE)
# %c Locale’s appropriate date and time representation. Tue Aug 16 21:30:00 1988 (en_US); Di 16 Aug 21:30:00 1988 (de_DE)
# %p Locale’s equivalent of either AM or PM. AM, PM (en_US); am, pm (de_DE)
# %x Locale’s appropriate date representation. 08/16/88 (None); 08/16/1988 (en_US); 16.08.1988 (de_DE)
# %X Locale’s appropriate time representation. 21:30:00 (en_US); 21:30:00 (de_DE)

# Time Series Basics 
The most basic kind of time series object in pandas is a Series indexed by timestamps, which is often represented external to pandas as Python strings or datetime objects:

In [2]:
from datetime import datetime

# Six irregularly spaced days in January 2011, used as the index of the example series.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]


In [3]:
from pandas import Series, DataFrame
import numpy as np

# One standard-normal draw per entry of `dates`; no seed is set, so the values differ on every run.
ts = Series(data=np.random.randn(6), index=dates)

In [4]:
ts

2011-01-02   -0.191806
2011-01-05    1.348229
2011-01-07    0.228457
2011-01-08    0.941104
2011-01-10    2.017281
2011-01-12    0.614095
dtype: float64

In [5]:
# Under the hood, these datetime objects have been put in a DatetimeIndex.
# ts itself is still an ordinary pandas Series; no separate TimeSeries type is involved.
type(ts) # pandas.core.series.Series 

pandas.core.series.Series

In [6]:
ts.index 

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [7]:
# Like other Series, arithmetic operations between differently indexed time series automatically align on the dates.
# ts[::2] keeps every second element of ts, so the dates it skips come out as NaN in the sum,
# while the dates present on both sides hold twice the original value.
ts + ts[::2]

2011-01-02   -0.383613
2011-01-05         NaN
2011-01-07    0.456914
2011-01-08         NaN
2011-01-10    4.034562
2011-01-12         NaN
dtype: float64

In [8]:
# pandas stores timestamps using NumPy's datetime64 data type at the nanosecond resolution: 
ts.index.dtype # dtype('<M8[ns]') 

dtype('<M8[ns]')

In [9]:
# Scalar values from a DatetimeIndex are pandas Timestamp objects: 
stamp = ts.index[0] 

In [10]:
stamp 

Timestamp('2011-01-02 00:00:00')

In [11]:
# A Timestamp can be substituted anywhere you would use a datetime object. Additionally, it can store frequency 
# information (if any) and understands how to do time zone conversions and other kinds of manipulations. 

# Indexing, Selection, Subsetting

In [12]:
# A Series indexed by timestamps behaves like any other pandas Series when you index and select data by label:
stamp = ts.index[2] # 2011-01-07 00:00:00 

In [13]:
ts[stamp] 

0.22845706703857016

In [14]:
# As a convenience, you can also pass a string that is interpretable as a date: 
ts['1/10/2011'] 

2.0172809964902885

In [15]:
ts['20110110']

2.0172809964902885

In [17]:
# With a longer series, passing only a year, or a year and month, selects the matching slice of data.
import pandas as pd

longer_ts = Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)

In [18]:
longer_ts 

2000-01-01   -0.594228
2000-01-02    0.433961
2000-01-03    1.041535
2000-01-04   -0.699783
2000-01-05    0.638649
                ...   
2002-09-22   -0.231598
2002-09-23   -0.961379
2002-09-24    1.503900
2002-09-25    0.589409
2002-09-26   -1.130632
Freq: D, Length: 1000, dtype: float64

In [19]:
longer_ts['2001'] 

2001-01-01    0.852516
2001-01-02    1.398564
2001-01-03   -1.095709
2001-01-04   -0.902604
2001-01-05    0.683059
                ...   
2001-12-27   -0.866160
2001-12-28   -0.595024
2001-12-29    0.237672
2001-12-30    0.085976
2001-12-31    0.750141
Freq: D, Length: 365, dtype: float64

In [20]:
longer_ts['2001-05']

2001-05-01   -0.608742
2001-05-02   -0.318897
2001-05-03    0.097457
2001-05-04    0.828169
2001-05-05    0.680986
2001-05-06    0.123383
2001-05-07   -1.711345
2001-05-08   -1.253020
2001-05-09   -0.599664
2001-05-10    0.698987
2001-05-11   -0.163755
2001-05-12    1.368672
2001-05-13   -0.028034
2001-05-14    0.243480
2001-05-15   -0.376212
2001-05-16    0.890416
2001-05-17    0.293890
2001-05-18    0.772936
2001-05-19    0.135351
2001-05-20   -0.528528
2001-05-21   -0.283064
2001-05-22   -0.702420
2001-05-23   -0.188870
2001-05-24   -1.020628
2001-05-25   -0.870698
2001-05-26   -1.342409
2001-05-27    1.240621
2001-05-28    0.052077
2001-05-29    0.661220
2001-05-30    0.984947
2001-05-31    0.689535
Freq: D, dtype: float64

In [21]:
# Slicing with dates works just like with a regular Series: 
ts[datetime(2011,1,7):]

2011-01-07    0.228457
2011-01-08    0.941104
2011-01-10    2.017281
2011-01-12    0.614095
dtype: float64

In [22]:
# Because most time series data is ordered chronologically, you can slice with timestamps not contained in a time series to perform a range query: 
ts 

2011-01-02   -0.191806
2011-01-05    1.348229
2011-01-07    0.228457
2011-01-08    0.941104
2011-01-10    2.017281
2011-01-12    0.614095
dtype: float64

In [23]:
ts['1/6/2011':'1/11/2011']  

2011-01-07    0.228457
2011-01-08    0.941104
2011-01-10    2.017281
dtype: float64

In [24]:
# As before you can pass either a string date, datetime, or timestamp. Remember that slicing in this manner produces views on the source time series just like slicing NumPy arrays. 
# There is an equivalent instance method truncate which slices a Series between two dates: 

ts.truncate(after='1/9/2011') 

2011-01-02   -0.191806
2011-01-05    1.348229
2011-01-07    0.228457
2011-01-08    0.941104
dtype: float64

In [25]:
# Everything shown above for Series also applies to a DataFrame, which is indexed on its rows.
# Build 100 weekly timestamps anchored on Wednesdays to use as the row index.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')

In [26]:
long_df = DataFrame(np.random.randn(100,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])  

In [27]:
long_df.loc['5-2001'] 

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.028066,-2.563557,-0.591534,0.273903
2001-05-09,-0.934964,-0.973781,-0.289601,1.061917
2001-05-16,-0.796096,1.027706,0.92575,-2.077534
2001-05-23,1.042699,-0.311563,0.592067,-1.467479
2001-05-30,-0.150522,0.754206,-1.315557,-0.144438


# Time Series with Duplicate Indices 

In [28]:
# In some applications there may be multiple observations falling on the same timestamp.
# Here the date 1/2/2000 is deliberately repeated three times to build such an index:
dates = pd.DatetimeIndex(
    ['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000']
)

In [29]:
dup_ts = Series(np.arange(5), index=dates)  
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [30]:
# We can tell that the index is not unique by checking its is_unique property: 
dup_ts.index.is_unique # False 

False

In [31]:
# Indexing into this time series will either produce scalar values or slices depending on whether a timestamp is duplicated:
dup_ts['1/3/2000'] # 4 

4

In [32]:
dup_ts['1/2/2000'] # 1/2/2000    1 1/2/2000    2 1/2/2000    3 dtype: int64

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [33]:
# Suppose you wanted to group the data by timestamp and apply a group function like sum. 
# One way to do this is to use the groupby and pass level=0 (the only level of indexing): 
grouped = dup_ts.groupby(level=0) 

In [34]:
grouped.mean()

2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.0
dtype: float64

In [35]:
grouped.count() 

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

# Date Ranges, Frequencies, and Shifting

In [36]:
# Generic time series in pandas are assumed to be irregular; that is, they have no fixed frequency. For many applications
# this is sufficient. However, it's often desirable to work relative to a fixed frequency, such as daily, monthly, or every 15 minutes,
# even if that means introducing missing values into a time series. Fortunately, pandas has a full suite of standard time series frequencies and tools for resampling, inferring frequencies,
# and generating fixed frequency date ranges.
# For example, converting the example time series to a fixed daily frequency starts with a call to resample (the call returns a Resampler object, which still needs a method such as .asfreq() to produce the converted Series):

ts 

2011-01-02   -0.191806
2011-01-05    1.348229
2011-01-07    0.228457
2011-01-08    0.941104
2011-01-10    2.017281
2011-01-12    0.614095
dtype: float64

In [37]:
# NOTE: resample('D') only builds a lazy Resampler object (its repr is what gets displayed);
# it does not return the daily series by itself. Chain a method such as .asfreq() or an
# aggregation (for example .mean()) to obtain an actual fixed-frequency Series.
ts. resample('D') 

<pandas.core.resample.DatetimeIndexResampler object at 0x12e80a590>

# Generating Date Ranges 
While I used it previously without explanation, you may have guessed that pandas.date_range is responsible for generating a DatetimeIndex with an indicated length according to a particular frequency: 


In [38]:
index = pd.date_range('4/1/2012', '6/1/2012') 

In [39]:
index 

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [42]:
# By default, date_range generates daily timestamps. If you pass only a start or end date, you must pass a number of periods to generate.
pd.date_range(start='4/1/2021', periods=25) 

DatetimeIndex(['2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
               '2021-04-05', '2021-04-06', '2021-04-07', '2021-04-08',
               '2021-04-09', '2021-04-10', '2021-04-11', '2021-04-12',
               '2021-04-13', '2021-04-14', '2021-04-15', '2021-04-16',
               '2021-04-17', '2021-04-18', '2021-04-19', '2021-04-20',
               '2021-04-21', '2021-04-22', '2021-04-23', '2021-04-24',
               '2021-04-25'],
              dtype='datetime64[ns]', freq='D')

In [43]:
# The start and end dates define strict boundaries for the generated date index. For example, if you wanted a date index
# containing the last business day of each month, you would pass the business month-end frequency, and only dates
# falling on or inside the date interval will be included.
# The offset object is used instead of a string alias because the alias was renamed from "BM" to "BME" in pandas 2.2:
# "BM" is deprecated on newer versions and "BME" is rejected by older ones, while BMonthEnd() works on both.
pd.date_range('1/1/2000', '12/1/2000', freq=pd.offsets.BMonthEnd())

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [44]:
# date_range by default preserves the time(if any) of the start or end timestamp: 
pd.date_range('5/1/2021  12:56:31',  periods=5) 

DatetimeIndex(['2021-05-01 12:56:31', '2021-05-02 12:56:31',
               '2021-05-03 12:56:31', '2021-05-04 12:56:31',
               '2021-05-05 12:56:31'],
              dtype='datetime64[ns]', freq='D')