In [31]:
import pandas as pd
# Alias the standard-library datetime module as dt: shorter to type and not confused with the datetime.datetime class
import datetime as dt

# Review of Python's datetime Module

In [32]:
# The datetime module contains both a date class and a datetime class.
# dt.date(...) builds a date object (calendar day only, no time of day),
# which is why the result is shown as datetime.date(...).
dt.date(year = 2016, month =  4, day = 12)

datetime.date(2016, 4, 12)

In [33]:
# Keyword names are optional: the arguments can be passed positionally as year, month, day
someday = dt.date(2016, 4, 12)

In [34]:
# Attributes available
someday.year

2016

In [35]:
someday.month

4

In [36]:
someday.day

12

In [37]:
# No time provided, will default to midnight
dt.datetime(2016, 4, 12)

datetime.datetime(2016, 4, 12, 0, 0)

In [38]:
# The time parts can be supplied by keyword (hour, minute, second) after the positional date parts
dt.datetime(2016, 4, 12, hour = 8, minute = 13, second = 57)

datetime.datetime(2016, 4, 12, 8, 13, 57)

In [39]:
# More readable when passed through str method
str(dt.datetime(2016, 4, 12, hour = 8, minute = 13, second = 57))

'2016-04-12 08:13:57'

In [40]:
str(someday)

'2016-04-12'

In [41]:
sometime = dt.datetime(2016, 4, 12, 8, 13, 57)

In [42]:
# datetime objects expose the date attributes plus the time parts.
# Only the value of the last expression in a cell is displayed, so just the minute (13) is shown.
sometime.year
sometime.month
sometime.day
sometime.minute

13

# The pandas Timestamp Object

In [43]:
# pd.Timestamp accepts many different date string formats as well as datetime objects.
# Ambiguity arises when both the day and the month are 12 or less,
# because the string alone does not say which number is the month.
pd.Timestamp("2015-03-31")

Timestamp('2015-03-31 00:00:00')

In [44]:
pd.Timestamp("2015/3/31")

Timestamp('2015-03-31 00:00:00')

In [45]:
pd.Timestamp("2013, 11, 04")

Timestamp('2013-11-04 00:00:00')

In [46]:
pd.Timestamp("2021-03-08 08:35:15")

Timestamp('2021-03-08 08:35:15')

In [47]:
pd.Timestamp("2021-03-08 8:35:15 PM")

Timestamp('2021-03-08 20:35:15')

In [48]:
pd.Timestamp(dt.date(2015, 1 ,1))

Timestamp('2015-01-01 00:00:00')

In [49]:
# A pd.Timestamp offers a lot more functionality than a plain Python datetime
pd.Timestamp(dt.datetime(2015, 1 ,1, 21, 35, 22))

Timestamp('2015-01-01 21:35:22')

# The pandas DateTimeIndex Object

In [50]:
# Strings will be converted to datetime and placed in pandas object
dates = ["2016-01-02", "2016-04-12", "2012-09-13"]
pd.DatetimeIndex(dates)

DatetimeIndex(['2016-01-02', '2016-04-12', '2012-09-13'], dtype='datetime64[ns]', freq=None)

In [51]:
# Build the index from datetime.date objects this time: 2 Jan, 12 Apr and 13 Sep 2016
dates = [dt.date(2016, month, day) for month, day in ((1, 2), (4, 12), (9, 13))]
dtIndex = pd.DatetimeIndex(dates)

In [52]:
# Use the DatetimeIndex created above as the index for the values of a Series
values = [100, 200, 300]
pd.Series(data = values, index = dtIndex)

2016-01-02    100
2016-04-12    200
2016-09-13    300
dtype: int64

# The pd.to_datetime() Method

In [53]:
# pd.to_datetime is very flexible: it accepts a single string, a date, a datetime,
# or a list of differently formatted strings.
# Only the last call's result (a DatetimeIndex) is displayed below.
# NOTE(review): pandas 2.0+ infers a single format from the first element, so the
# mixed-format list may need format="mixed" there — confirm against the installed version.
pd.to_datetime("2001-04-19")
pd.to_datetime(dt.date(2015, 1, 1))
pd.to_datetime(dt.datetime(2015, 1, 1, 14, 35, 20))
pd.to_datetime(['2015-01-03', "2014/02/08", "2016", "July 4th, 1996"])

DatetimeIndex(['2015-01-03', '2014-02-08', '2016-01-01', '1996-07-04'], dtype='datetime64[ns]', freq=None)

In [54]:
# Without conversion the values stay plain strings (the Series has dtype object)
pd.Series(['2015-01-03', "2014/02/08", "2016", "July 4th, 1996"])

0        2015-01-03
1        2014/02/08
2              2016
3    July 4th, 1996
dtype: object

In [55]:
times = pd.Series(['2015-01-03', "2014/02/08", "2016", "July 4th, 1996"])
times

0        2015-01-03
1        2014/02/08
2              2016
3    July 4th, 1996
dtype: object

In [56]:
# Can convert series of strings into series of datetimes
pd.to_datetime(times)

0   2015-01-03
1   2014-02-08
2   2016-01-01
3   1996-07-04
dtype: datetime64[ns]

In [57]:
dates = pd.Series(["July 4th, 1996", "10/04/1991", "Hello", "2015-02-31"])
dates

0    July 4th, 1996
1        10/04/1991
2             Hello
3        2015-02-31
dtype: object

In [58]:
# The Series cannot be converted as-is: "Hello" is not a date, so with the default
# errors="raise" the conversion fails with a ValueError.
# The error is caught and printed so the failure is still demonstrated without
# stopping the notebook on Restart Kernel -> Run All.
try:
    pd.to_datetime(dates)
except ValueError as err:
    print(type(err).__name__ + ":", err)

ValueError: ('Unknown string format:', 'Hello')

In [61]:
# errors="coerce" replaces every value that cannot be parsed with NaT (Not a Time) instead of raising
pd.to_datetime(dates, errors = "coerce")

0   1996-07-04
1   1991-10-04
2          NaT
3          NaT
dtype: datetime64[ns]

In [62]:
# Unix timestamps: pass unit="s" so the integers are interpreted as seconds
pd.to_datetime([1349720105, 1349806505, 1349892905], unit = "s")

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05',
               '2012-10-10 18:15:05'],
              dtype='datetime64[ns]', freq=None)

# Create Range of Dates with the pd.date_range() Method, Part 1

In [63]:
# Supply any two of start, end and periods; freq is the interval between dates ("D" = one day)
pd.date_range(start = "2016-01-01", end = "2016-01-10", freq = "D")

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10'],
              dtype='datetime64[ns]', freq='D')

In [64]:
# freq can change to e.g. "2D" (every second day) or "B" (business days); "1D" is the same as "D"
pd.date_range(start = "2016-01-01", end = "2016-01-10", freq = "1D")

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10'],
              dtype='datetime64[ns]', freq='D')

In [65]:
# See that weekends are eliminated
pd.date_range(start = "2016-01-01", end = "2016-01-10", freq = "B")

DatetimeIndex(['2016-01-01', '2016-01-04', '2016-01-05', '2016-01-06',
               '2016-01-07', '2016-01-08'],
              dtype='datetime64[ns]', freq='B')

In [66]:
# "W" for days only falling on Sunday
pd.date_range(start = "2016-01-01", end = "2016-01-10", freq = "W")

DatetimeIndex(['2016-01-03', '2016-01-10'], dtype='datetime64[ns]', freq='W-SUN')

In [67]:
# The anchor day of the week can be changed with e.g. "W-FRI"
# "H" for hourly, optionally prefixed with a multiple, e.g. "6H"
# "M" gives month-ends, "MS" gives month-starts
# "A" gives year-ends
# NOTE(review): newer pandas releases rename some of these aliases (e.g. "ME", "h", "YE") — confirm against the installed version
pd.date_range(start = "2016-01-01", end = "2016-05-10", freq = "M")

DatetimeIndex(['2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30'], dtype='datetime64[ns]', freq='M')

In [68]:
# date_range returns a DatetimeIndex (not a plain list); its elements are Timestamp objects
times = pd.date_range(start = "2016-01-01", end = "2016-01-10", freq = "D")
type(times)

pandas.core.indexes.datetimes.DatetimeIndex

In [69]:
times[0]

Timestamp('2016-01-01 00:00:00', freq='D')

# Create Range of Dates with the pd.date_range() Method, Part 2

In [70]:
# periods is the number of timestamps to generate
pd.date_range(start = "2012-09-09", periods = 25, freq = "D")

DatetimeIndex(['2012-09-09', '2012-09-10', '2012-09-11', '2012-09-12',
               '2012-09-13', '2012-09-14', '2012-09-15', '2012-09-16',
               '2012-09-17', '2012-09-18', '2012-09-19', '2012-09-20',
               '2012-09-21', '2012-09-22', '2012-09-23', '2012-09-24',
               '2012-09-25', '2012-09-26', '2012-09-27', '2012-09-28',
               '2012-09-29', '2012-09-30', '2012-10-01', '2012-10-02',
               '2012-10-03'],
              dtype='datetime64[ns]', freq='D')

In [71]:
# 25 days with one day intervals have been created
len(pd.date_range(start = "2012-09-09", periods = 25, freq = "D"))

25

In [72]:
# 25 business days with gaps of 50 business days
pd.date_range(start = "2012-09-09", periods = 25, freq = "50B")

DatetimeIndex(['2012-09-10', '2012-11-19', '2013-01-28', '2013-04-08',
               '2013-06-17', '2013-08-26', '2013-11-04', '2014-01-13',
               '2014-03-24', '2014-06-02', '2014-08-11', '2014-10-20',
               '2014-12-29', '2015-03-09', '2015-05-18', '2015-07-27',
               '2015-10-05', '2015-12-14', '2016-02-22', '2016-05-02',
               '2016-07-11', '2016-09-19', '2016-11-28', '2017-02-06',
               '2017-04-17'],
              dtype='datetime64[ns]', freq='50B')

In [73]:
# 50 dates spaced one week apart ("W"); weeks are anchored on Sunday by default (W-SUN)
# but the anchor can be changed to any day, e.g. "W-TUE"
pd.date_range(start = "2012-09-09", periods = 50, freq = "W")

DatetimeIndex(['2012-09-09', '2012-09-16', '2012-09-23', '2012-09-30',
               '2012-10-07', '2012-10-14', '2012-10-21', '2012-10-28',
               '2012-11-04', '2012-11-11', '2012-11-18', '2012-11-25',
               '2012-12-02', '2012-12-09', '2012-12-16', '2012-12-23',
               '2012-12-30', '2013-01-06', '2013-01-13', '2013-01-20',
               '2013-01-27', '2013-02-03', '2013-02-10', '2013-02-17',
               '2013-02-24', '2013-03-03', '2013-03-10', '2013-03-17',
               '2013-03-24', '2013-03-31', '2013-04-07', '2013-04-14',
               '2013-04-21', '2013-04-28', '2013-05-05', '2013-05-12',
               '2013-05-19', '2013-05-26', '2013-06-02', '2013-06-09',
               '2013-06-16', '2013-06-23', '2013-06-30', '2013-07-07',
               '2013-07-14', '2013-07-21', '2013-07-28', '2013-08-04',
               '2013-08-11', '2013-08-18'],
              dtype='datetime64[ns]', freq='W-SUN')

In [74]:
# When specifying hours a list of datetimes will be displayed
pd.date_range(start = "2012-09-09", periods = 10, freq = "6H")

DatetimeIndex(['2012-09-09 00:00:00', '2012-09-09 06:00:00',
               '2012-09-09 12:00:00', '2012-09-09 18:00:00',
               '2012-09-10 00:00:00', '2012-09-10 06:00:00',
               '2012-09-10 12:00:00', '2012-09-10 18:00:00',
               '2012-09-11 00:00:00', '2012-09-11 06:00:00'],
              dtype='datetime64[ns]', freq='6H')

# Create Range of Dates with the pd.date_range() Method, Part 3

In [75]:
# freq set to "D" by default
# Will move backwards from the end date, and produce an ordered list starting with earliest date
pd.date_range(end = "1999-12-31", periods = 20, freq = "D")

DatetimeIndex(['1999-12-12', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-18', '1999-12-19',
               '1999-12-20', '1999-12-21', '1999-12-22', '1999-12-23',
               '1999-12-24', '1999-12-25', '1999-12-26', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='D')

In [76]:
# All Saturdays, only includes the end date if it is a Saturday
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

pd.date_range(end = "1999-12-31", periods = 20, freq = "W-SAT")

DatetimeIndex(['1999-08-14', '1999-08-21', '1999-08-28', '1999-09-04',
               '1999-09-11', '1999-09-18', '1999-09-25', '1999-10-02',
               '1999-10-09', '1999-10-16', '1999-10-23', '1999-10-30',
               '1999-11-06', '1999-11-13', '1999-11-20', '1999-11-27',
               '1999-12-04', '1999-12-11', '1999-12-18', '1999-12-25'],
              dtype='datetime64[ns]', freq='W-SAT')

# The .dt Accessor

In [77]:
# Similar to .str prefix
pd.date_range(start = "2010-01-01", end = "2010-12-31", freq = "24D")

DatetimeIndex(['2010-01-01', '2010-01-25', '2010-02-18', '2010-03-14',
               '2010-04-07', '2010-05-01', '2010-05-25', '2010-06-18',
               '2010-07-12', '2010-08-05', '2010-08-29', '2010-09-22',
               '2010-10-16', '2010-11-09', '2010-12-03', '2010-12-27'],
              dtype='datetime64[ns]', freq='24D')

In [78]:
# Every 24th day of 2010, wrapped in a Series so that the .dt accessor can be used on it
bunch_of_dates = pd.date_range("2010-01-01", "2010-12-31", freq="24D")
s = pd.Series(bunch_of_dates)
s.head(3)

0   2010-01-01
1   2010-01-25
2   2010-02-18
dtype: datetime64[ns]

In [79]:
# The .dt accessor is the prefix that exposes datetime attributes and methods
# for every value of a datetime Series (the same idea as .str for strings).
s.dt.day
# Name of the weekday for each date. weekday_name was deprecated in pandas 0.23
# and removed in 1.0; day_name() is the supported replacement.
s.dt.day_name().head(10)

0       Friday
1       Monday
2     Thursday
3       Sunday
4    Wednesday
5     Saturday
6      Tuesday
7       Friday
8       Monday
9     Thursday
dtype: object

In [80]:
# Returns boolean if date is at the start of a quarter
mask = s.dt.is_quarter_start
s[mask]

0   2010-01-01
dtype: datetime64[ns]

In [81]:
# Many more boolean properties like this are available. This one yields an empty Series because no date is a month-end.
mask = s.dt.is_month_end
s[mask]

Series([], dtype: datetime64[ns])

# Import Financial Data Set with pandas_datareader Library

In [82]:
import pandas as pd
import datetime as dt

#Import only the data submodule, so its functions can be called as data.<name>
from pandas_datareader import data 

In [83]:
company = "MSFT" # Use specific company stock symbol, this is microsoft
start = "2010-01-01"
end = "2017-12-31"

stocks = data.DataReader(name = company, data_source = 'yahoo', start = start, end = end)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.382378


In [84]:
# Various attributes are available; only the last expression (axes) is displayed below
stocks.values
stocks.columns
stocks.index # the row labels form a DatetimeIndex
stocks.axes

[DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2017-12-15', '2017-12-18', '2017-12-19', '2017-12-20',
                '2017-12-21', '2017-12-22', '2017-12-26', '2017-12-27',
                '2017-12-28', '2017-12-29'],
               dtype='datetime64[ns]', name='Date', length=2013, freq=None),
 Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')]

# Selecting from a DataFrame with a DateTime Index

In [85]:
stocks = data.DataReader(name = company, data_source = 'yahoo', start = start, end = end)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.382378


In [86]:
# Extract using an index label
stocks.loc["2014-03-04"]

High         3.848000e+01
Low          3.807000e+01
Open         3.820000e+01
Close        3.841000e+01
Volume       2.680240e+07
Adj Close    3.404357e+01
Name: 2014-03-04 00:00:00, dtype: float64

In [87]:
# Extract using an index position
stocks.iloc[300]

High         2.576000e+01
Low          2.535000e+01
Open         2.549000e+01
Close        2.569000e+01
Volume       5.447340e+07
Adj Close    2.090376e+01
Name: 2011-03-14 00:00:00, dtype: float64

In [88]:
stocks.loc["2013-10-01" : "2013-10-07"]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-10-01,33.610001,33.299999,33.349998,33.580002,36718700.0,29.318779
2013-10-02,34.029999,33.290001,33.360001,33.919998,46946800.0,29.615629
2013-10-03,34.0,33.419998,33.880001,33.860001,38703800.0,29.563251
2013-10-04,33.990002,33.619999,33.689999,33.880001,33008100.0,29.580702
2013-10-07,33.709999,33.200001,33.599998,33.299999,35069300.0,29.074312


In [89]:
# Pulling non-sequential dates: one date per year, stepping a full year at a time
birthdays = pd.date_range(
    start="1986-06-14",
    end="2019-06-22",
    freq=pd.DateOffset(years=1),
)

In [90]:
# Checking if my birthdays are in the stocks date time index
mask = stocks.index.isin(birthdays)

In [91]:
stocks.loc[mask]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-14,25.959999,25.469999,25.860001,25.5,50972400.0,20.392487
2011-06-14,24.450001,24.190001,24.299999,24.219999,42894500.0,19.836807
2012-06-14,29.459999,28.879999,29.33,29.34,39458900.0,24.685299
2013-06-14,34.689999,34.25,34.549999,34.400002,53192600.0,29.824564
2016-06-14,50.099998,49.57,49.900002,49.830002,42577100.0,46.92115
2017-06-14,71.099998,69.43,70.910004,70.269997,25510700.0,67.823059


# Timestamp Object Attributes

In [92]:
stocks = data.DataReader(name = company, data_source = 'yahoo', start = start, end = end)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.382378


In [94]:
someday = stocks.index[500]

In [95]:
# Add a brand new column built from an attribute of the index; insert() places it at position 0.
# day_name() replaces weekday_name, which was deprecated in pandas 0.23 and removed in 1.0.
# The guard keeps the cell re-runnable: insert() raises if the column already exists.
if "Day of Week" not in stocks.columns:
    stocks.insert(0, "Day of Week", stocks.index.day_name())

In [96]:
stocks.head()

Unnamed: 0_level_0,Day of Week,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,Monday,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,Tuesday,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,Wednesday,31.08,30.52,30.879999,30.77,58182400.0,24.382378
2010-01-07,Thursday,30.700001,30.190001,30.629999,30.450001,50559700.0,24.128809
2010-01-08,Friday,30.879999,30.24,30.280001,30.66,51197400.0,24.295214


In [97]:
# Boolean column telling whether each index date is the first day of a month.
# Guarded so the cell can be re-run: insert() raises if the column already exists.
if "Is Start of Month" not in stocks.columns:
    stocks.insert(1, "Is Start of Month", stocks.index.is_month_start)
stocks.head(40)

Unnamed: 0_level_0,Day of Week,Is Start of Month,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,Monday,False,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,Tuesday,False,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,Wednesday,False,31.08,30.52,30.879999,30.77,58182400.0,24.382378
2010-01-07,Thursday,False,30.700001,30.190001,30.629999,30.450001,50559700.0,24.128809
2010-01-08,Friday,False,30.879999,30.24,30.280001,30.66,51197400.0,24.295214
2010-01-11,Monday,False,30.76,30.120001,30.709999,30.27,68754700.0,23.986179
2010-01-12,Tuesday,False,30.4,29.91,30.15,30.07,65912100.0,23.827694
2010-01-13,Wednesday,False,30.52,30.01,30.26,30.35,51863500.0,24.04957
2010-01-14,Thursday,False,31.1,30.26,30.309999,30.959999,63228100.0,24.532942
2010-01-15,Friday,False,31.24,30.709999,31.08,30.860001,79913200.0,24.453693


In [101]:
# Filter df by dates that are start of month
stocks[stocks["Is Start of Month"]].head(3)

Unnamed: 0_level_0,Day of Week,Is Start of Month,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-01,Monday,True,28.48,27.92,28.389999,28.41,85931100.0,22.512297
2010-03-01,Monday,True,29.049999,28.530001,28.77,29.02,43805400.0,23.103207
2010-04-01,Thursday,True,29.540001,28.620001,29.35,29.16,74768100.0,23.214651


# The .truncate() Method

In [102]:
stocks = data.DataReader(name = company, data_source = 'yahoo', start = start, end = end)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.525019
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.532942
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.382378


In [103]:
# Extract the rows of the df that fall between the two dates passed to before and after
stocks.truncate(before = "2011-02-05", after = "2011-02-28")
# For a custom data set, make sure the datetime index is sorted first; truncate relies on a sorted index.

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-02-07,28.34,27.790001,27.799999,28.200001,68980900.0,22.811296
2011-02-08,28.34,28.049999,28.1,28.280001,34904200.0,22.876007
2011-02-09,28.26,27.91,28.190001,27.969999,52905100.0,22.62525
2011-02-10,27.940001,27.290001,27.93,27.5,76672400.0,22.245062
2011-02-11,27.809999,27.07,27.76,27.25,83939700.0,22.042831
2011-02-14,27.27,26.950001,27.209999,27.23,56766200.0,22.026653
2011-02-15,27.33,26.950001,27.040001,26.959999,44116500.0,21.937143
2011-02-16,27.07,26.6,27.049999,27.02,70817900.0,21.985968
2011-02-17,27.370001,26.91,26.969999,27.209999,57207300.0,22.140572
2011-02-18,27.209999,26.99,27.129999,27.059999,68667800.0,22.018515


# pd.DateOffset Objects

In [105]:
# Fetch everything from the start date up to the current datetime; more dynamic than hard-coding an end date
# (the result therefore depends on when the cell is run)
stocks = data.DataReader(name = "GOOG", data_source = "yahoo",
               start = dt.date(2000, 1, 1), end = dt.datetime.now())
stocks.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,51.693783,47.669952,49.676899,49.845802,44994500.0,49.845802
2004-08-20,54.187561,49.925285,50.178635,53.80505,23005800.0,53.80505
2004-08-23,56.373344,54.172661,55.017166,54.346527,18393200.0,54.346527
2004-08-24,55.439419,51.450363,55.260582,52.096165,15361800.0,52.096165
2004-08-25,53.651051,51.604362,52.140873,52.657513,9257400.0,52.657513


In [106]:
stocks.index

DatetimeIndex(['2004-08-19', '2004-08-20', '2004-08-23', '2004-08-24',
               '2004-08-25', '2004-08-26', '2004-08-27', '2004-08-30',
               '2004-08-31', '2004-09-01',
               ...
               '2019-06-11', '2019-06-12', '2019-06-13', '2019-06-14',
               '2019-06-17', '2019-06-18', '2019-06-19', '2019-06-20',
               '2019-06-21', '2019-06-24'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

In [107]:
# pd.DateOffset represents an amount of time (a duration) that can be added to dates
# Adding 5 days to every entry of the index returns a new DatetimeIndex; years, hours, months etc. can be used too
stocks.index + pd.DateOffset(days = 5)

# You can also subtract and combine several arguments, e.g. stocks.index - pd.DateOffset(months = 5, years = 2, days = 5)

DatetimeIndex(['2004-08-24', '2004-08-25', '2004-08-28', '2004-08-29',
               '2004-08-30', '2004-08-31', '2004-09-01', '2004-09-04',
               '2004-09-05', '2004-09-06',
               ...
               '2019-06-16', '2019-06-17', '2019-06-18', '2019-06-19',
               '2019-06-22', '2019-06-23', '2019-06-24', '2019-06-25',
               '2019-06-26', '2019-06-29'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

# More Fun with pd.DateOffset Objects

In [112]:
import pandas as pd
import datetime as dt
from pandas_datareader import data 
from pandas.tseries.offsets import * # Import everything 

In [109]:
stocks = data.DataReader(name = "GOOG", data_source = "yahoo",
               start = dt.date(2000, 1, 1), end = dt.datetime.now())
stocks.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,51.693783,47.669952,49.676899,49.845802,44994500.0,49.845802
2004-08-20,54.187561,49.925285,50.178635,53.80505,23005800.0,53.80505
2004-08-23,56.373344,54.172661,55.017166,54.346527,18393200.0,54.346527
2004-08-24,55.439419,51.450363,55.260582,52.096165,15361800.0,52.096165
2004-08-25,53.651051,51.604362,52.140873,52.657513,9257400.0,52.657513


In [111]:
# Offsets such as MonthEnd move each date to a specific calendar point (month end, quarter start, etc.)
# Adding ("+") MonthEnd moves every date forward to the next month end
# Subtracting ("-") moves it back to the previous month end
stocks.index + pd.tseries.offsets.MonthEnd()

DatetimeIndex(['2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-09-30', '2004-09-30',
               ...
               '2019-06-30', '2019-06-30', '2019-06-30', '2019-06-30',
               '2019-06-30', '2019-06-30', '2019-06-30', '2019-06-30',
               '2019-06-30', '2019-06-30'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

In [116]:
# Many offset classes are available in the pd.tseries.offsets module.
# Importing them from pd.tseries.offsets at the beginning allows the short names used here.
# Only the last expression (the BQuarterBegin subtraction) is displayed below.
stocks.index + MonthEnd()
stocks.index + BMonthEnd()
stocks.index - BQuarterBegin()

DatetimeIndex(['2004-06-01', '2004-06-01', '2004-06-01', '2004-06-01',
               '2004-06-01', '2004-06-01', '2004-06-01', '2004-06-01',
               '2004-06-01', '2004-06-01',
               ...
               '2019-06-03', '2019-06-03', '2019-06-03', '2019-06-03',
               '2019-06-03', '2019-06-03', '2019-06-03', '2019-06-03',
               '2019-06-03', '2019-06-03'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

In [117]:
stocks.index + YearEnd()

DatetimeIndex(['2004-12-31', '2004-12-31', '2004-12-31', '2004-12-31',
               '2004-12-31', '2004-12-31', '2004-12-31', '2004-12-31',
               '2004-12-31', '2004-12-31',
               ...
               '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
               '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
               '2019-12-31', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

In [118]:
stocks.index - YearEnd()

DatetimeIndex(['2003-12-31', '2003-12-31', '2003-12-31', '2003-12-31',
               '2003-12-31', '2003-12-31', '2003-12-31', '2003-12-31',
               '2003-12-31', '2003-12-31',
               ...
               '2018-12-31', '2018-12-31', '2018-12-31', '2018-12-31',
               '2018-12-31', '2018-12-31', '2018-12-31', '2018-12-31',
               '2018-12-31', '2018-12-31'],
              dtype='datetime64[ns]', name='Date', length=3737, freq=None)

# The Timedelta Object

In [129]:
# A Timedelta represents a duration (a span of time), not a point that can be plotted on a calendar
# One way to create a Timedelta object is to subtract two Timestamps from each other
timeA = pd.Timestamp("2018-03-31 04:16:49 PM")
timeB = pd.Timestamp("2018-03-20 04:11:41 PM")

In [130]:
timeA - timeB

Timedelta('11 days 00:05:08')

In [131]:
type(timeA - timeB)

pandas._libs.tslibs.timedeltas.Timedelta

In [132]:
type(timeA)

pandas._libs.tslibs.timestamps.Timestamp

In [133]:
# Will get negative values if timestamps are switched around
timeB - timeA

Timedelta('-12 days +23:54:52')

In [135]:
# Timedelta can also be built with its constructor, using keyword arguments like those of DateOffset() except years
# Only the second call's result is displayed below
pd.Timedelta(days = 3)
pd.Timedelta(days = 3, minutes = 48, hours = 8, weeks = 9)

Timedelta('66 days 08:48:00')

In [136]:
# Can also create objects with strings, very flexible, although weeks will not work
pd.Timedelta("3 days 6 hours 48 minutes 22 seconds")

Timedelta('3 days 06:48:22')

# Timedeltas in a Dataset

In [140]:
shipping = pd.read_csv("ecommerce.csv", index_col = "ID", parse_dates = ["order_date", "delivery_date"])

In [141]:
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1998-05-24,1999-02-05
2,1992-04-22,1998-03-06
4,1991-02-10,1992-08-26


In [145]:
# How long between the dates? Subtract one datetime column from the other to get a timedelta Series.
# Add it to the dataframe by setting it as a new column
shipping["Delivery Time"] = shipping["delivery_date"] - shipping["order_date"]
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1998-05-24,1999-02-05,257 days
2,1992-04-22,1998-03-06,2144 days
4,1991-02-10,1992-08-26,563 days


In [146]:
# Combining a datetime column with a timedelta column, i.e. adding Delivery Time to delivery_date
# This produces a datetime Series,
# which is then added as a new column
shipping["Twice As Long"] = shipping["delivery_date"] + shipping["Delivery Time"]
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice As Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1998-05-24,1999-02-05,257 days,1999-10-20
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12


In [147]:
shipping.dtypes

order_date        datetime64[ns]
delivery_date     datetime64[ns]
Delivery Time    timedelta64[ns]
Twice As Long     datetime64[ns]
dtype: object

In [149]:
# Keep only the orders whose delivery took longer than 365 days (the threshold can be adjusted)
mask = shipping["Delivery Time"].gt(pd.Timedelta("365 days"))
shipping[mask].head()

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice As Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18
9,1990-01-25,1994-10-02,1711 days,1999-06-09


In [151]:
# Shortest and longest delivery times; only the last expression (the max) is displayed below
shipping["Delivery Time"].min()
shipping["Delivery Time"].max()

Timedelta('3583 days 00:00:00')