In [2]:
import pandas as pd
import datetime as dt

Starting with a review of the Python date and time functions

# Section 10; Part 129
Review of Python's `datetime` module

In [5]:
# date - Stores Year/Month/Day
# datetime - Stores Year/Month/Day Hour/Minute/Seconds/Microseconds

# Create a specific date
day = dt.date(2016, 12, 7)
day

datetime.date(2016, 12, 7)

In [6]:
# Attributes of a date object.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(day.year)
print(day.month)
print(day.day)

2016
12
7


In [8]:
# Create a specific datetime object

# Notice it defaults to midnight when a time is not provided
dt.datetime(2016, 12, 7)

datetime.datetime(2016, 12, 7, 0, 0)

In [9]:
# Full datetime object
dt.datetime(2016, 12, 7, 8, 17, 32)

datetime.datetime(2016, 12, 7, 8, 17, 32)

In [14]:
# Afternoon times require 24 hour time

# This is 5:17:32 in the afternoon
afternoon = dt.datetime(2016, 12, 7, 17, 17, 32)
afternoon

datetime.datetime(2016, 12, 7, 17, 17, 32)

In [12]:
# More readable format
str(day)

'2016-12-07'

In [15]:
str(afternoon)

'2016-12-07 17:17:32'

In [17]:
# Time-related attributes of a datetime object.
# print() with a single argument runs identically on Python 2 and Python 3.
print(afternoon.hour)
print(afternoon.minute)
print(afternoon.second)
print(afternoon.microsecond)

17
17
32
0


# Section 10; Part 130
The pandas Timestamp Object

 - pandas equivalent to Python's `datetime` object
 - Will default to midnight if a time portion isn't provided

In [18]:
# Creating a timestamp
#   - Will accept strings or Python datetime objects
pd.Timestamp('2015-03-31')

Timestamp('2015-03-31 00:00:00')

In [19]:
pd.Timestamp('2018/12/31')

Timestamp('2018-12-31 00:00:00')

In [20]:
# Caution: the recorded output below is 2013-11-01, not 2013-11-11.
# A comma-separated string is not read as year,month,day; prefer an
# unambiguous format such as '2013-11-11'.
pd.Timestamp('2013,11,11')

Timestamp('2013-11-01 00:00:00')

In [21]:
pd.Timestamp('1/1/2015')

Timestamp('2015-01-01 00:00:00')

In [23]:
# DD/MM/YYYY
#  pandas can figure this out because 19 cannot be a month
pd.Timestamp('19/12/2015')

Timestamp('2015-12-19 00:00:00')

In [25]:
# Ambiguous when the day is 12 or less, though
# Should this be March 4th or April 3rd?
# pandas reads it month-first, so it becomes March 4th
pd.Timestamp('3/4/2016')

Timestamp('2016-03-04 00:00:00')

In [26]:
# With times
pd.Timestamp('2016-12-25 8:30:01')

Timestamp('2016-12-25 08:30:01')

In [27]:
# And afternoons require 24 hours again
pd.Timestamp('2016-12-25 17:31:42')

Timestamp('2016-12-25 17:31:42')

In [28]:
# Can take date/datetime objects
pd.Timestamp(dt.date(2016,12,7))

Timestamp('2016-12-07 00:00:00')

In [29]:
pd.Timestamp(dt.datetime(2016,12,7,8,35,12))

Timestamp('2016-12-07 08:35:12')

# Section 10; Part 131
The pandas `DatetimeIndex` object

 - Storage container for storing `Timestamp` objects

In [31]:
# Converts items in list to Timestamps then stores Timestamps in new object
# Notice that the slashes have been converted to dashes too
dates = ['2016-01-02', '2016/04/12', '2009-09-07']
pd.DatetimeIndex(dates)

DatetimeIndex(['2016-01-02', '2016-04-12', '2009-09-07'], dtype='datetime64[ns]', freq=None)

In [36]:
# Can convert multiple formats at the same time
# Since a time is provided, all other items default to midnight
dates = [dt.date(2016,1,1), dt.datetime(1994,6,13,9,15), '2009-09-07']
timeindex = pd.DatetimeIndex(dates)
timeindex

DatetimeIndex(['2016-01-01 00:00:00', '1994-06-13 09:15:00',
               '2009-09-07 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [38]:
# Create series
values = [100, 200, 300]
s = pd.Series(values, index = timeindex)
s

2016-01-01 00:00:00    100
1994-06-13 09:15:00    200
2009-09-07 00:00:00    300
dtype: int64

# Section 10; Part 132
The `pd.to_datetime()` method

 - Called directly on pandas library
 - Convert existing object to a pandas time related object
 - Most common usage is converting existing pandas series to timestamp objects

In [39]:
pd.to_datetime('2001-04-19')

Timestamp('2001-04-19 00:00:00')

In [40]:
pd.to_datetime(dt.date(2005, 8, 6))

Timestamp('2005-08-06 00:00:00')

In [41]:
pd.to_datetime(dt.datetime(2007, 5, 26, 15, 30))

Timestamp('2007-05-26 15:30:00')

In [43]:
# List of strings
#  Notice the string with just year is converted to Jan 1
pd.to_datetime(['2016-12-25', '2014/02/08', '2014', 'July 4th, 1996'])

DatetimeIndex(['2016-12-25', '2014-02-08', '2014-01-01', '1996-07-04'], dtype='datetime64[ns]', freq=None)

In [45]:
# Create new series - This will be a series of strings
times = pd.Series(['2016-12-25', '2014/02/08', '2014', 'July 4th, 1996'])
times

0        2016-12-25
1        2014/02/08
2              2014
3    July 4th, 1996
dtype: object

In [46]:
pd.to_datetime(times)

0   2016-12-25
1   2014-02-08
2   2014-01-01
3   1996-07-04
dtype: datetime64[ns]

In [48]:
baddates = pd.Series(['July 4th, 1996', '10/04/1991', 'Hello', '2015-02-31'])
baddates

0    July 4th, 1996
1        10/04/1991
2             Hello
3        2015-02-31
dtype: object

In [49]:
# Can't convert everything
pd.to_datetime(baddates)

ValueError: Unknown string format

In [50]:
# Can handle these errors though
#  errors - Default is 'raise' (raise an exception on the first bad value);
#           alternative is 'coerce' (convert what can be parsed and replace everything else with NaT - Not a Time)
pd.to_datetime(baddates, errors='coerce')

0   1996-07-04
1   1991-10-04
2          NaT
3          NaT
dtype: datetime64[ns]

In [52]:
# Convert Unix time
#  pass unit parameter with a unit of 's' for seconds
pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit='s')

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05',
               '2012-10-10 18:15:05', '2012-10-11 18:15:05',
               '2012-10-12 18:15:05'],
              dtype='datetime64[ns]', freq=None)

# Section 10; Part 133
Create range of dates with the `pd.date_range()` method (Part I)

 - Using `start` and `end` parameters

In [54]:
# Needs two of the first three parameters to function
#   start - Start date
#   end - End date
#   periods - Number of results we want (number of timestamps we want)
#   freq - Frequency interval (default is 'D' for days)
times = pd.date_range(start = '2016-01-01', end = '2016-01-10', freq= 'D')
times

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10'],
              dtype='datetime64[ns]', freq='D')

In [55]:
# Can access individual elements because it's an Index object
times[0]

Timestamp('2016-01-01 00:00:00', freq='D')

In [56]:
# Change frequency to be 2 days
pd.date_range(start = '2016-01-01', end = '2016-01-10', freq= '2D')

DatetimeIndex(['2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07',
               '2016-01-09'],
              dtype='datetime64[ns]', freq='2D')

In [57]:
# business days (only excludes weekends, not holidays)
pd.date_range(start = '2016-01-01', end = '2016-01-10', freq= 'B')

DatetimeIndex(['2016-01-01', '2016-01-04', '2016-01-05', '2016-01-06',
               '2016-01-07', '2016-01-08'],
              dtype='datetime64[ns]', freq='B')

In [60]:
# Frequency = W (week)
#  'W' is shorthand for 'W-SUN': one timestamp per week, each falling on a Sunday
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq= 'W')

DatetimeIndex(['2016-01-03', '2016-01-10'], dtype='datetime64[ns]', freq='W-SUN')

In [61]:
# Frequency = weeks, but specify week starts on a different day
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq= 'W-FRI')

DatetimeIndex(['2016-01-01', '2016-01-08', '2016-01-15'], dtype='datetime64[ns]', freq='W-FRI')

In [62]:
# Hourly frequency
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq= 'H')

DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
               '2016-01-01 02:00:00', '2016-01-01 03:00:00',
               '2016-01-01 04:00:00', '2016-01-01 05:00:00',
               '2016-01-01 06:00:00', '2016-01-01 07:00:00',
               '2016-01-01 08:00:00', '2016-01-01 09:00:00',
               ...
               '2016-01-14 15:00:00', '2016-01-14 16:00:00',
               '2016-01-14 17:00:00', '2016-01-14 18:00:00',
               '2016-01-14 19:00:00', '2016-01-14 20:00:00',
               '2016-01-14 21:00:00', '2016-01-14 22:00:00',
               '2016-01-14 23:00:00', '2016-01-15 00:00:00'],
              dtype='datetime64[ns]', length=337, freq='H')

In [63]:
# Six hourly frequency
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq= '6H')

DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 06:00:00',
               '2016-01-01 12:00:00', '2016-01-01 18:00:00',
               '2016-01-02 00:00:00', '2016-01-02 06:00:00',
               '2016-01-02 12:00:00', '2016-01-02 18:00:00',
               '2016-01-03 00:00:00', '2016-01-03 06:00:00',
               '2016-01-03 12:00:00', '2016-01-03 18:00:00',
               '2016-01-04 00:00:00', '2016-01-04 06:00:00',
               '2016-01-04 12:00:00', '2016-01-04 18:00:00',
               '2016-01-05 00:00:00', '2016-01-05 06:00:00',
               '2016-01-05 12:00:00', '2016-01-05 18:00:00',
               '2016-01-06 00:00:00', '2016-01-06 06:00:00',
               '2016-01-06 12:00:00', '2016-01-06 18:00:00',
               '2016-01-07 00:00:00', '2016-01-07 06:00:00',
               '2016-01-07 12:00:00', '2016-01-07 18:00:00',
               '2016-01-08 00:00:00', '2016-01-08 06:00:00',
               '2016-01-08 12:00:00', '2016-01-08 18:00:00',
               '2016-01-

In [64]:
# Monthly (month end)
#    This is empty because our range doesn't cross a month end
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq= 'M')

DatetimeIndex([], dtype='datetime64[ns]', freq='M')

In [66]:
# This gives the month end for every month of 2016
pd.date_range(start = '2016-01-01', end = '2016-12-31', freq= 'M')

DatetimeIndex(['2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30',
               '2016-05-31', '2016-06-30', '2016-07-31', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30', '2016-12-31'],
              dtype='datetime64[ns]', freq='M')

In [67]:
# Start of each month covered in series
pd.date_range(start = '2016-01-01', end = '2016-12-15', freq= 'MS')

DatetimeIndex(['2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01',
               '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01',
               '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'],
              dtype='datetime64[ns]', freq='MS')

In [68]:
# Yearly (year end)
pd.date_range(start = '2016-01-01', end = '2050-01-01', freq= 'A')

DatetimeIndex(['2016-12-31', '2017-12-31', '2018-12-31', '2019-12-31',
               '2020-12-31', '2021-12-31', '2022-12-31', '2023-12-31',
               '2024-12-31', '2025-12-31', '2026-12-31', '2027-12-31',
               '2028-12-31', '2029-12-31', '2030-12-31', '2031-12-31',
               '2032-12-31', '2033-12-31', '2034-12-31', '2035-12-31',
               '2036-12-31', '2037-12-31', '2038-12-31', '2039-12-31',
               '2040-12-31', '2041-12-31', '2042-12-31', '2043-12-31',
               '2044-12-31', '2045-12-31', '2046-12-31', '2047-12-31',
               '2048-12-31', '2049-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

# Section 10; Part 134
Create range of dates with `pd.date_range()` (Part II)

 - Using `start` and `periods` parameters

In [69]:
# Generate 25 days starting from Sept 12, 2012
pd.date_range(start = '2012-09-12', periods = 25, freq= 'D')

DatetimeIndex(['2012-09-12', '2012-09-13', '2012-09-14', '2012-09-15',
               '2012-09-16', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-22', '2012-09-23',
               '2012-09-24', '2012-09-25', '2012-09-26', '2012-09-27',
               '2012-09-28', '2012-09-29', '2012-09-30', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-06'],
              dtype='datetime64[ns]', freq='D')

`freq` can be modified with the `start`/`periods` combination, just as in the previous lesson

In [71]:
# 50 business days (again, only weekends aren't counted, pandas doesn't know holidays by default)
pd.date_range(start = '2012-09-09', periods = 50, freq = 'B')

DatetimeIndex(['2012-09-10', '2012-09-11', '2012-09-12', '2012-09-13',
               '2012-09-14', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-24', '2012-09-25',
               '2012-09-26', '2012-09-27', '2012-09-28', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-08', '2012-10-09', '2012-10-10', '2012-10-11',
               '2012-10-12', '2012-10-15', '2012-10-16', '2012-10-17',
               '2012-10-18', '2012-10-19', '2012-10-22', '2012-10-23',
               '2012-10-24', '2012-10-25', '2012-10-26', '2012-10-29',
               '2012-10-30', '2012-10-31', '2012-11-01', '2012-11-02',
               '2012-11-05', '2012-11-06', '2012-11-07', '2012-11-08',
               '2012-11-09', '2012-11-12', '2012-11-13', '2012-11-14',
               '2012-11-15', '2012-11-16'],
              dtype='datetime64[ns]', freq='B')

In [72]:
# 50 weeks (starting on Sunday)
pd.date_range(start = '2012-09-09', periods = 50, freq = 'W')

DatetimeIndex(['2012-09-09', '2012-09-16', '2012-09-23', '2012-09-30',
               '2012-10-07', '2012-10-14', '2012-10-21', '2012-10-28',
               '2012-11-04', '2012-11-11', '2012-11-18', '2012-11-25',
               '2012-12-02', '2012-12-09', '2012-12-16', '2012-12-23',
               '2012-12-30', '2013-01-06', '2013-01-13', '2013-01-20',
               '2013-01-27', '2013-02-03', '2013-02-10', '2013-02-17',
               '2013-02-24', '2013-03-03', '2013-03-10', '2013-03-17',
               '2013-03-24', '2013-03-31', '2013-04-07', '2013-04-14',
               '2013-04-21', '2013-04-28', '2013-05-05', '2013-05-12',
               '2013-05-19', '2013-05-26', '2013-06-02', '2013-06-09',
               '2013-06-16', '2013-06-23', '2013-06-30', '2013-07-07',
               '2013-07-14', '2013-07-21', '2013-07-28', '2013-08-04',
               '2013-08-11', '2013-08-18'],
              dtype='datetime64[ns]', freq='W-SUN')

In [73]:
# 50 weeks (starting on Tuesday)
pd.date_range(start = '2012-09-09', periods = 50, freq = 'W-TUE')

DatetimeIndex(['2012-09-11', '2012-09-18', '2012-09-25', '2012-10-02',
               '2012-10-09', '2012-10-16', '2012-10-23', '2012-10-30',
               '2012-11-06', '2012-11-13', '2012-11-20', '2012-11-27',
               '2012-12-04', '2012-12-11', '2012-12-18', '2012-12-25',
               '2013-01-01', '2013-01-08', '2013-01-15', '2013-01-22',
               '2013-01-29', '2013-02-05', '2013-02-12', '2013-02-19',
               '2013-02-26', '2013-03-05', '2013-03-12', '2013-03-19',
               '2013-03-26', '2013-04-02', '2013-04-09', '2013-04-16',
               '2013-04-23', '2013-04-30', '2013-05-07', '2013-05-14',
               '2013-05-21', '2013-05-28', '2013-06-04', '2013-06-11',
               '2013-06-18', '2013-06-25', '2013-07-02', '2013-07-09',
               '2013-07-16', '2013-07-23', '2013-07-30', '2013-08-06',
               '2013-08-13', '2013-08-20'],
              dtype='datetime64[ns]', freq='W-TUE')

In [74]:
# Month start (notice it doesn't show Sept 2012, because we started mid-month)
pd.date_range(start = '2012-09-09', periods = 50, freq = 'MS')

DatetimeIndex(['2012-10-01', '2012-11-01', '2012-12-01', '2013-01-01',
               '2013-02-01', '2013-03-01', '2013-04-01', '2013-05-01',
               '2013-06-01', '2013-07-01', '2013-08-01', '2013-09-01',
               '2013-10-01', '2013-11-01', '2013-12-01', '2014-01-01',
               '2014-02-01', '2014-03-01', '2014-04-01', '2014-05-01',
               '2014-06-01', '2014-07-01', '2014-08-01', '2014-09-01',
               '2014-10-01', '2014-11-01', '2014-12-01', '2015-01-01',
               '2015-02-01', '2015-03-01', '2015-04-01', '2015-05-01',
               '2015-06-01', '2015-07-01', '2015-08-01', '2015-09-01',
               '2015-10-01', '2015-11-01', '2015-12-01', '2016-01-01',
               '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01',
               '2016-06-01', '2016-07-01', '2016-08-01', '2016-09-01',
               '2016-10-01', '2016-11-01'],
              dtype='datetime64[ns]', freq='MS')

# Section 10; Part 135
Create range of dates with `pd.date_range()` (Part III)

 - Using `end` and `periods` parameters
 - This combination works *backwards* compared to the `start`/`periods` combination: it counts back from `end`

In [75]:
pd.date_range(end = '1999-12-31', periods = 20, freq='D')

DatetimeIndex(['1999-12-12', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-18', '1999-12-19',
               '1999-12-20', '1999-12-21', '1999-12-22', '1999-12-23',
               '1999-12-24', '1999-12-25', '1999-12-26', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='D')

Other frequencies work identically to the previous sections

# Section 10; Part 136
The `.dt` accessor

 - Similar to the `.str` accessor: datetime attributes and methods must be accessed through `.dt` to apply them to every value in a Series

In [77]:
bunchofdates = pd.date_range(start = '2000-01-01', end = '2010-12-31', freq = '24D')
bunchofdates

DatetimeIndex(['2000-01-01', '2000-01-25', '2000-02-18', '2000-03-13',
               '2000-04-06', '2000-04-30', '2000-05-24', '2000-06-17',
               '2000-07-11', '2000-08-04',
               ...
               '2010-05-20', '2010-06-13', '2010-07-07', '2010-07-31',
               '2010-08-24', '2010-09-17', '2010-10-11', '2010-11-04',
               '2010-11-28', '2010-12-22'],
              dtype='datetime64[ns]', length=168, freq='24D')

In [79]:
# Create a series with the dates
s = pd.Series(bunchofdates)
s

0     2000-01-01
1     2000-01-25
2     2000-02-18
3     2000-03-13
4     2000-04-06
5     2000-04-30
6     2000-05-24
7     2000-06-17
8     2000-07-11
9     2000-08-04
10    2000-08-28
11    2000-09-21
12    2000-10-15
13    2000-11-08
14    2000-12-02
15    2000-12-26
16    2001-01-19
17    2001-02-12
18    2001-03-08
19    2001-04-01
20    2001-04-25
21    2001-05-19
22    2001-06-12
23    2001-07-06
24    2001-07-30
25    2001-08-23
26    2001-09-16
27    2001-10-10
28    2001-11-03
29    2001-11-27
         ...    
138   2009-01-25
139   2009-02-18
140   2009-03-14
141   2009-04-07
142   2009-05-01
143   2009-05-25
144   2009-06-18
145   2009-07-12
146   2009-08-05
147   2009-08-29
148   2009-09-22
149   2009-10-16
150   2009-11-09
151   2009-12-03
152   2009-12-27
153   2010-01-20
154   2010-02-13
155   2010-03-09
156   2010-04-02
157   2010-04-26
158   2010-05-20
159   2010-06-13
160   2010-07-07
161   2010-07-31
162   2010-08-24
163   2010-09-17
164   2010-10-11
165   2010-11-

In [80]:
# Extract the day from each value in the series
s.dt.day

0       1
1      25
2      18
3      13
4       6
5      30
6      24
7      17
8      11
9       4
10     28
11     21
12     15
13      8
14      2
15     26
16     19
17     12
18      8
19      1
20     25
21     19
22     12
23      6
24     30
25     23
26     16
27     10
28      3
29     27
       ..
138    25
139    18
140    14
141     7
142     1
143    25
144    18
145    12
146     5
147    29
148    22
149    16
150     9
151     3
152    27
153    20
154    13
155     9
156     2
157    26
158    20
159    13
160     7
161    31
162    24
163    17
164    11
165     4
166    28
167    22
dtype: int64

In [81]:
# Day-of-week name for each value in the series.
# `.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
# `.dt.day_name()` is its replacement (requires pandas >= 0.23).
s.dt.day_name()

0       Saturday
1        Tuesday
2         Friday
3         Monday
4       Thursday
5         Sunday
6      Wednesday
7       Saturday
8        Tuesday
9         Friday
10        Monday
11      Thursday
12        Sunday
13     Wednesday
14      Saturday
15       Tuesday
16        Friday
17        Monday
18      Thursday
19        Sunday
20     Wednesday
21      Saturday
22       Tuesday
23        Friday
24        Monday
25      Thursday
26        Sunday
27     Wednesday
28      Saturday
29       Tuesday
         ...    
138       Sunday
139    Wednesday
140     Saturday
141      Tuesday
142       Friday
143       Monday
144     Thursday
145       Sunday
146    Wednesday
147     Saturday
148      Tuesday
149       Friday
150       Monday
151     Thursday
152       Sunday
153    Wednesday
154     Saturday
155      Tuesday
156       Friday
157       Monday
158     Thursday
159       Sunday
160    Wednesday
161     Saturday
162      Tuesday
163       Friday
164       Monday
165     Thursd

In [82]:
# Which dates fall on the start of a quarter (Jan, April, July, October 1st)
s.dt.is_quarter_start

0       True
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
138    False
139    False
140    False
141    False
142    False
143    False
144    False
145    False
146    False
147    False
148    False
149    False
150    False
151    False
152    False
153    False
154    False
155    False
156    False
157    False
158    False
159    False
160    False
161    False
162    False
163    False
164    False
165    False
166    False
167    False
dtype: bool

In [83]:
# Dates, using the above boolean series
s[s.dt.is_quarter_start]

0     2000-01-01
19    2001-04-01
38    2002-07-01
137   2009-01-01
dtype: datetime64[ns]

# Section 10; Part 137
Install `pandas-datareader`

 - Query an online source to build a data set
 - This needs to be done in a terminal window, not this Notebook
 
 - `conda install pandas-datareader`  OR
 - `pip install pandas-datareader`

# Section 10; Part 138
Import Financial data set with pandas_datareader library

*This section requires that the library is installed*

In [85]:
import pandas as pd
import datetime as dt
from pandas_datareader import data

In [88]:
#  name - Company we are pulling data for (Stock symbol)
#  datasource - Where to pull from 
#  start / end - Dates to pull information from
company = 'MSFT'
start = '2010-01-01'
end = '2017-12-31'

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.62,31.1,30.59,30.95,38414185
2010-01-05,30.85,31.1,30.64,30.96,49758862
2010-01-06,30.88,31.08,30.52,30.77,58182332
2010-01-07,30.63,30.7,30.19,30.45,50564285
2010-01-08,30.28,30.88,30.24,30.66,51201289


In [89]:
stocks.values

array([[  3.06200000e+01,   3.11000000e+01,   3.05900000e+01,
          3.09500000e+01,   3.84141850e+07],
       [  3.08500000e+01,   3.11000000e+01,   3.06400000e+01,
          3.09600000e+01,   4.97588620e+07],
       [  3.08800000e+01,   3.10800000e+01,   3.05200000e+01,
          3.07700000e+01,   5.81823320e+07],
       ..., 
       [  5.90800000e+01,   5.94700000e+01,   5.88000000e+01,
          5.92500000e+01,   2.55156650e+07],
       [  5.97000000e+01,   6.05800000e+01,   5.95600000e+01,
          6.02200000e+01,   2.35526580e+07],
       [  6.04300000e+01,   6.04600000e+01,   5.98000000e+01,
          5.99500000e+01,   1.99070350e+07]])

In [91]:
stocks.columns

Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object')

In [92]:
stocks.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2016-11-22', '2016-11-23', '2016-11-25', '2016-11-28',
               '2016-11-29', '2016-11-30', '2016-12-01', '2016-12-02',
               '2016-12-05', '2016-12-06'],
              dtype='datetime64[ns]', name=u'Date', length=1744, freq=None)

In [93]:
stocks.axes

[DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2016-11-22', '2016-11-23', '2016-11-25', '2016-11-28',
                '2016-11-29', '2016-11-30', '2016-12-01', '2016-12-02',
                '2016-12-05', '2016-12-06'],
               dtype='datetime64[ns]', name=u'Date', length=1744, freq=None),
 Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object')]

# Section 10; Part 139
Selecting rows from a DataFrame with a `DatetimeIndex`

In [94]:
company = 'MSFT'
start = '2010-01-01'
end = '2017-12-31'

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.62,31.1,30.59,30.95,38414185
2010-01-05,30.85,31.1,30.64,30.96,49758862
2010-01-06,30.88,31.08,30.52,30.77,58182332
2010-01-07,30.63,30.7,30.19,30.45,50564285
2010-01-08,30.28,30.88,30.24,30.66,51201289


In [95]:
# .loc and .iloc work as expected
stocks.loc["2010-01-05"]

Open            30.85
High            31.10
Low             30.64
Close           30.96
Volume    49758862.00
Name: 2010-01-05 00:00:00, dtype: float64

In [96]:
# Remember, .iloc goes on the numeric index value, even if there isn't one explicitly set
stocks.iloc[1]   # Pulls 2010-01-05

Open            30.85
High            31.10
Low             30.64
Close           30.96
Volume    49758862.00
Name: 2010-01-05 00:00:00, dtype: float64

In [97]:
# Label-based lookup of a single date.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same label-based selection here.
stocks.loc['2010-01-05']

Open            30.85
High            31.10
Low             30.64
Close           30.96
Volume    49758862.00
Name: 2010-01-05 00:00:00, dtype: float64

In [98]:
# Raises KeyError when the label is not in the index
# (the data shown by stocks.head() starts at 2010-01-04).
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
stocks.loc['2010-01-01']

KeyError: '2010-01-01'

In [100]:
# Get sequential dates from Oct 1, 2013 to Oct 7, 2013.
# Label slices with `.loc` include both endpoints; `.loc` replaces the
# deprecated `.ix` indexer (removed in pandas 1.0).
stocks.loc["2013-10-01":"2013-10-07"]

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-10-01,33.35,33.61,33.3,33.58,36718733
2013-10-02,33.36,34.03,33.29,33.92,46946813
2013-10-03,33.88,34.0,33.42,33.86,38707452
2013-10-04,33.69,33.99,33.62,33.88,33008107
2013-10-07,33.6,33.71,33.2,33.3,35069279


In [102]:
# Pull stock prices for every instance of a particular day across years
#    Pull April 12 every year
#  DateOffset will be explained in a later lesson
# This date range will pull all April 12ths since 2000 to 2017
aprils = pd.date_range(start = '2000-04-12', end = '2017-12-31', freq = pd.DateOffset(years = 1))
aprils

DatetimeIndex(['2000-04-12', '2001-04-12', '2002-04-12', '2003-04-12',
               '2004-04-12', '2005-04-12', '2006-04-12', '2007-04-12',
               '2008-04-12', '2009-04-12', '2010-04-12', '2011-04-12',
               '2012-04-12', '2013-04-12', '2014-04-12', '2015-04-12',
               '2016-04-12', '2017-04-12'],
              dtype='datetime64[ns]', freq='<DateOffset: kwds={'years': 1}>')

In [104]:
mask = stocks.index.isin(aprils)
mask

array([False, False, False, ..., False, False, False], dtype=bool)

In [106]:
stocks[mask]

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-04-12,30.25,30.49,30.2,30.32,37074279
2011-04-12,25.83,25.85,25.55,25.64,36936975
2012-04-12,30.48,31.04,30.42,30.98,38306010
2013-04-12,28.85,29.02,28.66,28.79,62888012
2016-04-12,54.37,54.78,53.76,54.65,24574169


# Section 10; Part 140
`Timestamp` object attributes

In [108]:
company = 'MSFT'
start = '2010-01-01'
end = '2017-12-31'

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.62,31.1,30.59,30.95,38414185
2010-01-05,30.85,31.1,30.64,30.96,49758862
2010-01-06,30.88,31.08,30.52,30.77,58182332
2010-01-07,30.63,30.7,30.19,30.45,50564285
2010-01-08,30.28,30.88,30.24,30.66,51201289


In [109]:
someday = stocks.index[500]
someday

Timestamp('2011-12-28 00:00:00')

In [110]:
# Attributes
#  day - day of month
#  month - month of year
#  year - year
#  day_name() - Day of week (replaces the `weekday_name` attribute, which was
#               deprecated in pandas 0.23 and removed in 1.0)
#  is_month_end - True/False if end of month
#  is_month_start - True/False if start of month
# print() with a single argument works on both Python 2 and Python 3.
print(someday.day)
print(someday.month)
print(someday.year)
print(someday.day_name())
print(someday.is_month_end)
print(someday.is_month_start)

28
12
2011
Wednesday
False
False


In [111]:
# Add a new column to the dataframe that shows the Day of the week and put it as the first column.
# `DatetimeIndex.day_name()` replaces the `weekday_name` attribute, which was
# deprecated in pandas 0.23 and removed in 1.0.
# NOTE: insert() mutates `stocks` in place, so re-running this cell raises
# ValueError because the "Day of Week" column already exists.
stocks.insert(0, column = "Day of Week", value = stocks.index.day_name())

In [112]:
stocks.head()

Unnamed: 0_level_0,Day of Week,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,Monday,30.62,31.1,30.59,30.95,38414185
2010-01-05,Tuesday,30.85,31.1,30.64,30.96,49758862
2010-01-06,Wednesday,30.88,31.08,30.52,30.77,58182332
2010-01-07,Thursday,30.63,30.7,30.19,30.45,50564285
2010-01-08,Friday,30.28,30.88,30.24,30.66,51201289


In [113]:
# Add column indicating whether start of month
stocks.insert(1, column = "Month Start", value = stocks.index.is_month_start)

In [114]:
stocks

Unnamed: 0_level_0,Day of Week,Month Start,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,Monday,False,30.62,31.10,30.59,30.95,38414185
2010-01-05,Tuesday,False,30.85,31.10,30.64,30.96,49758862
2010-01-06,Wednesday,False,30.88,31.08,30.52,30.77,58182332
2010-01-07,Thursday,False,30.63,30.70,30.19,30.45,50564285
2010-01-08,Friday,False,30.28,30.88,30.24,30.66,51201289
2010-01-11,Monday,False,30.71,30.76,30.12,30.27,68754648
2010-01-12,Tuesday,False,30.15,30.40,29.91,30.07,65913228
2010-01-13,Wednesday,False,30.26,30.52,30.01,30.35,51863463
2010-01-14,Thursday,False,30.31,31.10,30.26,30.96,63244767
2010-01-15,Friday,False,31.08,31.24,30.71,30.86,79915648


In [115]:
# Find the stock price at the start of each month (and stock market was open...if closed it's not in the dataframe)
stocks[stocks["Month Start"]]

Unnamed: 0_level_0,Day of Week,Month Start,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-02-01,Monday,True,28.39,28.48,27.92,28.41,85931099
2010-03-01,Monday,True,28.77,29.05,28.53,29.02,43805302
2010-06-01,Tuesday,True,25.53,26.31,25.52,25.89,76155453
2010-07-01,Thursday,True,23.09,23.32,22.73,23.16,92239399
2010-09-01,Wednesday,True,23.67,23.95,23.54,23.9,65235852
2010-10-01,Friday,True,24.77,24.82,24.3,24.38,62672276
2010-11-01,Monday,True,26.88,27.22,26.7,26.95,61916183
2010-12-01,Wednesday,True,25.57,26.25,25.56,26.04,74123490
2011-02-01,Tuesday,True,27.8,28.06,27.61,27.99,62810661
2011-03-01,Tuesday,True,26.6,26.78,26.15,26.16,60054986


# Section 10; Part 141
The `.truncate()` method

 - Used for slicing pandas objects with a `DatetimeIndex`
 - The `DatetimeIndex` **must** be sorted, or unexpected results will be returned

In [116]:
company = 'MSFT'
start = '2010-01-01'
end = '2017-12-31'

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.62,31.1,30.59,30.95,38414185
2010-01-05,30.85,31.1,30.64,30.96,49758862
2010-01-06,30.88,31.08,30.52,30.77,58182332
2010-01-07,30.63,30.7,30.19,30.45,50564285
2010-01-08,30.28,30.88,30.24,30.66,51201289


In [119]:
#  before - Truncate dates before this
#  after - Truncate dates after this

# Return rows between Feb 5, 2011 and Feb 28, 2011
stocks.truncate(before='2011-02-05', after='2011-02-28')

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-02-07,27.8,28.34,27.79,28.2,68980871
2011-02-08,28.1,28.34,28.05,28.28,34910467
2011-02-09,28.19,28.26,27.91,27.97,52905018
2011-02-10,27.93,27.94,27.29,27.5,76672349
2011-02-11,27.76,27.81,27.07,27.25,83939643
2011-02-14,27.2,27.27,26.95,27.23,56766112
2011-02-15,27.04,27.33,26.95,26.96,44120592
2011-02-16,27.05,27.07,26.6,27.02,70817867
2011-02-17,26.97,27.37,26.91,27.21,57211558
2011-02-18,27.13,27.21,26.99,27.06,68672855


# Section 10; Part 142
`pd.DateOffset` objects

 - A way to shift existing dates and times by adding or subtracting a length of time

In [122]:
# Fetch daily GOOG price data from 2000-01-01 up to the moment the cell is run, then preview it.
# Because `end` is the current time, the data (and every output below) changes on each re-run.
# The earliest row returned is 2004-08-19 even though `start` is 2000-01-01.
# NOTE(review): the 'google' data source may no longer be available in current
# pandas-datareader releases - confirm before re-running.
company = 'GOOG'
start = dt.date(2000,1,1)
end = dt.datetime.now()

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,49.96,51.98,47.93,50.12,
2004-08-20,50.69,54.49,50.2,54.1,
2004-08-23,55.32,56.68,54.47,54.65,
2004-08-24,55.56,55.74,51.73,52.38,
2004-08-25,52.43,53.95,51.89,52.95,


In [124]:
# The DataFrame's index is a DatetimeIndex with one entry per row of price data.
# It is the object the DateOffset examples operate on.
stocks.index

DatetimeIndex(['2004-08-19', '2004-08-20', '2004-08-23', '2004-08-24',
               '2004-08-25', '2004-08-26', '2004-08-27', '2004-08-30',
               '2004-08-31', '2004-09-01',
               ...
               '2016-11-22', '2016-11-23', '2016-11-25', '2016-11-28',
               '2016-11-29', '2016-11-30', '2016-12-01', '2016-12-02',
               '2016-12-05', '2016-12-06'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [127]:
# A DateOffset represents a length of time that can be added to or subtracted from
# a pandas date object (here, every entry of the DatetimeIndex at once)

# Add 5 days to every entry
#   The first value was 2004-08-19; in the result it is 2004-08-24
stocks.index + pd.DateOffset(days = 5)

# The expression returns a new index; stocks.index itself is unchanged.
# To keep the shifted dates, assign the result back to stocks.index

DatetimeIndex(['2004-08-24', '2004-08-25', '2004-08-28', '2004-08-29',
               '2004-08-30', '2004-08-31', '2004-09-01', '2004-09-04',
               '2004-09-05', '2004-09-06',
               ...
               '2016-11-27', '2016-11-28', '2016-11-30', '2016-12-03',
               '2016-12-04', '2016-12-05', '2016-12-06', '2016-12-07',
               '2016-12-10', '2016-12-11'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [128]:
# Offsets can be expressed in weeks; subtracting moves every date 2 weeks earlier
stocks.index - pd.DateOffset(weeks = 2)

DatetimeIndex(['2004-08-05', '2004-08-06', '2004-08-09', '2004-08-10',
               '2004-08-11', '2004-08-12', '2004-08-13', '2004-08-16',
               '2004-08-17', '2004-08-18',
               ...
               '2016-11-08', '2016-11-09', '2016-11-11', '2016-11-14',
               '2016-11-15', '2016-11-16', '2016-11-17', '2016-11-18',
               '2016-11-21', '2016-11-22'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [129]:
# Offsets can also be expressed in months or years.
# When the target month is shorter, the day is clamped to that month's last day:
# in the output, 2016-11-28, 2016-11-29 and 2016-11-30 all become 2017-02-28
stocks.index + pd.DateOffset(months = 3)

DatetimeIndex(['2004-11-19', '2004-11-20', '2004-11-23', '2004-11-24',
               '2004-11-25', '2004-11-26', '2004-11-27', '2004-11-30',
               '2004-11-30', '2004-12-01',
               ...
               '2017-02-22', '2017-02-23', '2017-02-25', '2017-02-28',
               '2017-02-28', '2017-02-28', '2017-03-01', '2017-03-02',
               '2017-03-05', '2017-03-06'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [130]:
# Years work the same way: shift every date back by 10 years
stocks.index - pd.DateOffset(years = 10)

DatetimeIndex(['1994-08-19', '1994-08-20', '1994-08-23', '1994-08-24',
               '1994-08-25', '1994-08-26', '1994-08-27', '1994-08-30',
               '1994-08-31', '1994-09-01',
               ...
               '2006-11-22', '2006-11-23', '2006-11-25', '2006-11-28',
               '2006-11-29', '2006-11-30', '2006-12-01', '2006-12-02',
               '2006-12-05', '2006-12-06'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [131]:
# Hours work too, even though the index holds dates only: each date is implicitly
# midnight (00:00:00), so adding 6 hours yields 06:00:00 on the same day
stocks.index + pd.DateOffset(hours = 6)

DatetimeIndex(['2004-08-19 06:00:00', '2004-08-20 06:00:00',
               '2004-08-23 06:00:00', '2004-08-24 06:00:00',
               '2004-08-25 06:00:00', '2004-08-26 06:00:00',
               '2004-08-27 06:00:00', '2004-08-30 06:00:00',
               '2004-08-31 06:00:00', '2004-09-01 06:00:00',
               ...
               '2016-11-22 06:00:00', '2016-11-23 06:00:00',
               '2016-11-25 06:00:00', '2016-11-28 06:00:00',
               '2016-11-29 06:00:00', '2016-11-30 06:00:00',
               '2016-12-01 06:00:00', '2016-12-02 06:00:00',
               '2016-12-05 06:00:00', '2016-12-06 06:00:00'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [132]:
# Several units can be combined in a single offset:
# move every date back by 1 year, 3 months and 10 days
stocks.index - pd.DateOffset(years = 1, months = 3, days = 10)

DatetimeIndex(['2003-05-09', '2003-05-10', '2003-05-13', '2003-05-14',
               '2003-05-15', '2003-05-16', '2003-05-17', '2003-05-20',
               '2003-05-21', '2003-05-22',
               ...
               '2015-08-12', '2015-08-13', '2015-08-15', '2015-08-18',
               '2015-08-19', '2015-08-20', '2015-08-22', '2015-08-23',
               '2015-08-26', '2015-08-27'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

# Section 10; Part 143
More about `pd.DateOffset` objects



In [134]:
import pandas as pd
import datetime as dt
from pandas_datareader import data

In [133]:
# Re-fetch the GOOG price data (same request as in Part 142) so this part starts from a fresh frame.
# `end` is the current time, so the data and the outputs below change on each re-run.
# NOTE(review): the 'google' data source may no longer be available in current
# pandas-datareader releases - confirm before re-running.
company = 'GOOG'
start = dt.date(2000,1,1)
end = dt.datetime.now()

stocks = data.DataReader(name=company, data_source='google', start=start, end=end)
stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,49.96,51.98,47.93,50.12,
2004-08-20,50.69,54.49,50.2,54.1,
2004-08-23,55.32,56.68,54.47,54.65,
2004-08-24,55.56,55.74,51.73,52.38,
2004-08-25,52.43,53.95,51.89,52.95,


In [138]:
# Round each date to the end of its month.
#  A fixed DateOffset can't do this, because every date is a different number of days
#  away from its month end.
# Instead use pd.tseries.offsets.MonthEnd():
#   Adding MonthEnd moves a date forward to the end of its current month
#   Subtracting MonthEnd moves a date back to the end of the previous month

# NOTE: A date that is already a month end moves to the NEXT month end
#   (in the output, 2004-08-31 becomes 2004-09-30)
stocks.index + pd.tseries.offsets.MonthEnd()

DatetimeIndex(['2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-09-30', '2004-09-30',
               ...
               '2016-11-30', '2016-11-30', '2016-11-30', '2016-11-30',
               '2016-11-30', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [139]:
# Import the offset classes into the namespace so they can be used without the
# pd.tseries.offsets prefix.
# Only the classes this notebook uses are imported, instead of `import *`: a wildcard
# import can silently shadow same-named objects from other libraries and hides where
# each name came from. Add further offset classes here if later cells need them.
from pandas.tseries.offsets import (
    BMonthEnd,
    MonthBegin,
    MonthEnd,
    QuarterBegin,
    QuarterEnd,
    YearEnd,
)

In [140]:
# With MonthEnd imported into the namespace, the same month-end rounding needs less code
stocks.index + MonthEnd()

DatetimeIndex(['2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-09-30', '2004-09-30',
               ...
               '2016-11-30', '2016-11-30', '2016-11-30', '2016-11-30',
               '2016-11-30', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [141]:
# MonthBegin: adding it moves each date forward to the next first-of-month.
# A date already on the 1st moves to the following month
#   (in the output, 2004-09-01 becomes 2004-10-01)
stocks.index + MonthBegin()

DatetimeIndex(['2004-09-01', '2004-09-01', '2004-09-01', '2004-09-01',
               '2004-09-01', '2004-09-01', '2004-09-01', '2004-09-01',
               '2004-09-01', '2004-10-01',
               ...
               '2016-12-01', '2016-12-01', '2016-12-01', '2016-12-01',
               '2016-12-01', '2016-12-01', '2017-01-01', '2017-01-01',
               '2017-01-01', '2017-01-01'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [142]:
# BMonthEnd: last business day (not a weekend) of the month,
# e.g. December 2016 dates land on 2016-12-30 here rather than 2016-12-31
stocks.index + BMonthEnd()

# If pandas emits a warning for this cell, it only means the operation is not optimized;
# the result is still correct



DatetimeIndex(['2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-08-31', '2004-08-31', '2004-08-31', '2004-08-31',
               '2004-09-30', '2004-09-30',
               ...
               '2016-11-30', '2016-11-30', '2016-11-30', '2016-11-30',
               '2016-11-30', '2016-12-30', '2016-12-30', '2016-12-30',
               '2016-12-30', '2016-12-30'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [144]:
# QuarterEnd: move each date forward to the end of its quarter
#   (e.g. the August and September 2004 dates all become 2004-09-30)
stocks.index + QuarterEnd()

DatetimeIndex(['2004-09-30', '2004-09-30', '2004-09-30', '2004-09-30',
               '2004-09-30', '2004-09-30', '2004-09-30', '2004-09-30',
               '2004-09-30', '2004-09-30',
               ...
               '2016-12-31', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [146]:
# QuarterBegin: move each date forward to the next quarter start.
# With the defaults used here, the quarter starts in the output are dates such as
# 2004-09-01, 2004-12-01 and 2017-03-01, not calendar-quarter starts like 10-01 or 01-01
stocks.index + QuarterBegin()

DatetimeIndex(['2004-09-01', '2004-09-01', '2004-09-01', '2004-09-01',
               '2004-09-01', '2004-09-01', '2004-09-01', '2004-09-01',
               '2004-09-01', '2004-12-01',
               ...
               '2016-12-01', '2016-12-01', '2016-12-01', '2016-12-01',
               '2016-12-01', '2016-12-01', '2017-03-01', '2017-03-01',
               '2017-03-01', '2017-03-01'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

In [147]:
# YearEnd: move each date forward to December 31 of its year
stocks.index + YearEnd()

DatetimeIndex(['2004-12-31', '2004-12-31', '2004-12-31', '2004-12-31',
               '2004-12-31', '2004-12-31', '2004-12-31', '2004-12-31',
               '2004-12-31', '2004-12-31',
               ...
               '2016-12-31', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31', '2016-12-31', '2016-12-31',
               '2016-12-31', '2016-12-31'],
              dtype='datetime64[ns]', name=u'Date', length=3097, freq=None)

# Section 10; Part 144
The pandas `Timedelta` Object

 - Represents a duration of time

In [151]:
# Two example timestamps; the AM/PM strings are parsed into 24-hour times
timeA = pd.Timestamp('2016-03-31 04:35:16 PM')
timeB = pd.Timestamp('2016-03-20 02:16:49 AM')

# Format explicitly so this line is valid, and prints the same text, on both Python 2 and Python 3
# (a bare `print a, b` statement is a SyntaxError on Python 3)
print("{} {}".format(timeA, timeB))

2016-03-31 16:35:16 2016-03-20 02:16:49


In [152]:
# Subtracting one Timestamp from another gives a Timedelta (the duration between them)
timeA - timeB

Timedelta('11 days 14:18:27')

In [153]:
# Reversing the operands gives a negative duration.
# It is displayed as "-12 days +09:41:33", i.e. -12 days plus 9h 41m 33s = -(11 days 14:18:27)
timeB - timeA

Timedelta('-12 days +09:41:33')

In [155]:
# A Timedelta can also be built directly with pd.Timedelta and keyword arguments

# Weeks are converted to days (8 weeks * 7 = 56 days, plus 3 days = 59 days)
# There is no `years` parameter
pd.Timedelta(days = 3, minutes = 45, hours = 12, weeks = 8)

Timedelta('59 days 12:45:00')

In [156]:
# pd.Timedelta also accepts a descriptive string
pd.Timedelta('5 minutes')

Timedelta('0 days 00:05:00')

In [157]:
# Several units can appear in one string
pd.Timedelta('6 hours 12 minutes')

Timedelta('0 days 06:12:00')

In [158]:
# Days, hours, minutes and seconds combined in a single string
pd.Timedelta('14 days 6 hours 12 minutes 49 seconds')

Timedelta('14 days 06:12:49')

In [161]:
# "weeks" is not accepted in the string form by the pandas version used for this notebook
# (it raises "ValueError: invalid abbreviation: weeks"); use the weeks= keyword instead.
# The error is caught and displayed so this demonstration does not halt a Restart & Run All.
try:
    two_weeks = pd.Timedelta('2 weeks')
except ValueError as err:
    two_weeks = "ValueError: {}".format(err)
two_weeks

ValueError: invalid abbreviation: weeks

# Section 10; Part 145
`Timedelta`s in a dataset

In [166]:
# Load the e-commerce orders, using the "ID" column as the index and parsing both
# date columns into datetimes so they support date arithmetic
shipping = pd.read_csv('datasets/ecommerce.csv', index_col = "ID", parse_dates = ["order_date", "delivery_date"])
shipping.head()

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1998-05-24,1999-02-05
2,1992-04-22,1998-03-06
4,1991-02-10,1992-08-26
5,1992-07-21,1997-11-20
7,1993-09-02,1998-06-10


In [169]:
# Row-by-row duration between the two date columns: delivery date minus order date
shipping['delivery_date'].sub(shipping['order_date'])

ID
1      257 days
2     2144 days
4      563 days
5     1948 days
7     1742 days
8      154 days
9     1711 days
10    2502 days
11     367 days
18     848 days
19       9 days
20    2180 days
23    2633 days
26     753 days
30      81 days
32    3107 days
33     752 days
35     961 days
36    1371 days
39    1036 days
41    1555 days
46     318 days
50    2997 days
52     985 days
53     937 days
54     605 days
58      64 days
59      78 days
60     903 days
63    1715 days
         ...   
932    258 days
934     50 days
935   2719 days
938    843 days
939     67 days
942   1784 days
943   3228 days
945    737 days
946   1697 days
947   2915 days
949     92 days
951    929 days
953    955 days
954    168 days
956   1236 days
957   1758 days
958   2621 days
969     53 days
972   2097 days
975    137 days
981    397 days
983    942 days
984   2756 days
985    328 days
986    737 days
990   1684 days
991   2394 days
993   2719 days
994     10 days
997    637 days
dtype: timedelta64[ns

In [171]:
# Store the delivery duration (delivery date minus order date) as a new "Delivery Time" column
shipping['Delivery Time'] = shipping['delivery_date'] - shipping['order_date']
shipping.head()

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1998-05-24,1999-02-05,257 days
2,1992-04-22,1998-03-06,2144 days
4,1991-02-10,1992-08-26,563 days
5,1992-07-21,1997-11-20,1948 days
7,1993-09-02,1998-06-10,1742 days


In [172]:
# Add the delivery duration to the delivery date: the result is the date each order
# would have arrived if delivery had taken twice as long.
#  Demonstrates arithmetic between a timestamp column and a timedelta column
shipping['delivery_date'] + shipping['Delivery Time']

ID
1     1999-10-20
2     2004-01-18
4     1994-03-12
5     2003-03-22
7     2003-03-18
8     1994-04-14
9     1999-06-09
10    2005-11-05
11    1998-07-16
18    2000-02-08
19    1998-05-28
20    2004-09-24
23    2006-10-30
26    2000-05-26
30    1999-04-02
32    2007-01-25
33    1998-11-03
35    1998-12-15
36    1997-11-16
39    1995-11-27
41    2000-08-12
46    1997-06-02
50    2007-09-30
52    2000-01-24
53    2001-01-15
54    1999-11-30
58    1995-06-19
59    1996-03-03
60    1998-01-25
63    2000-05-13
         ...    
932   1998-12-23
934   1995-08-30
935   2005-08-16
938   1997-11-04
939   1999-02-12
942   2002-01-20
943   2008-10-03
945   1996-11-09
946   2000-10-16
947   2007-06-04
949   1992-04-08
951   1996-11-02
953   1996-12-18
954   1994-07-10
956   2002-05-30
957   2004-05-17
958   2004-09-01
969   1997-01-08
972   2001-08-02
975   1998-03-19
981   1999-04-05
983   2000-02-26
984   2006-08-27
985   1997-05-12
986   1994-12-23
990   2000-09-12
991   2004-10-18
993   2005-

In [174]:
# Store that result as a new column: the arrival date had delivery taken twice as long
shipping['Twice as long'] = shipping['delivery_date'] + shipping['Delivery Time']
shipping.head()

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice as long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1998-05-24,1999-02-05,257 days,1999-10-20
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18


In [176]:
# Find the orders that took more than 3000 days to deliver.
# Compare against an explicit Timedelta rather than the bare string "3000 days", so the
# comparison does not depend on pandas implicitly parsing a string into a duration.
mask = shipping['Delivery Time'] > pd.Timedelta(days = 3000)
shipping[mask]

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice as long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32,1990-01-20,1998-07-24,3107 days,2007-01-25
130,1990-04-02,1999-08-16,3423 days,2008-12-29
151,1991-01-29,1999-08-05,3110 days,2008-02-09
229,1990-04-13,1998-11-17,3140 days,2007-06-23
314,1990-03-07,1999-12-25,3580 days,2009-10-13
331,1990-09-18,1999-12-19,3379 days,2009-03-20
348,1990-02-27,1999-01-04,3233 days,2007-11-11
392,1990-12-24,1999-12-04,3267 days,2008-11-13
590,1990-03-25,1998-12-20,3192 days,2007-09-16
634,1991-04-04,1999-07-21,3030 days,2007-11-06


In [177]:
# Shortest delivery time in the dataset; min() on the timedelta column returns a single Timedelta
shipping['Delivery Time'].min()

Timedelta('8 days 00:00:00')