In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/ETH_1h.csv')

In [3]:
df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-03-13 08-PM,ETHUSD,129.94,131.82,126.87,128.71,1940673.93
1,2020-03-13 07-PM,ETHUSD,119.51,132.02,117.1,129.94,7579741.09
2,2020-03-13 06-PM,ETHUSD,124.47,124.85,115.5,119.51,4898735.81
3,2020-03-13 05-PM,ETHUSD,124.08,127.42,121.63,124.47,2753450.92
4,2020-03-13 04-PM,ETHUSD,124.85,129.51,120.17,124.08,4461424.71


In [4]:
df.shape

(23674, 7)

In [10]:
# Notice that the dtype is object and NOT the date time dtype
df['Date'].unique()

array(['2020-03-13 08-PM', '2020-03-13 07-PM', '2020-03-13 06-PM', ...,
       '2017-07-01 01-PM', '2017-07-01 12-PM', '2017-07-01 11-AM'],
      shape=(23674,), dtype=object)

In [11]:
# Notice that it does not work because we are reading this as a string and NOT as date time.
df.loc[0, 'Date'].day_name()

AttributeError: 'str' object has no attribute 'day_name'

In [12]:
df['Date'] = pd.to_datetime(df['Date'])

  df['Date'] = pd.to_datetime(df['Date'])


DateParseError: Unknown datetime string format, unable to parse: 2020-03-13 08-PM, at position 0

In [13]:
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d %I-%p')

In [15]:
# NOtice now that it says a DateTimeArray
df['Date'].unique()

<DatetimeArray>
['2020-03-13 20:00:00', '2020-03-13 19:00:00', '2020-03-13 18:00:00',
 '2020-03-13 17:00:00', '2020-03-13 16:00:00', '2020-03-13 15:00:00',
 '2020-03-13 14:00:00', '2020-03-13 13:00:00', '2020-03-13 12:00:00',
 '2020-03-13 11:00:00',
 ...
 '2017-07-01 20:00:00', '2017-07-01 19:00:00', '2017-07-01 18:00:00',
 '2017-07-01 17:00:00', '2017-07-01 16:00:00', '2017-07-01 15:00:00',
 '2017-07-01 14:00:00', '2017-07-01 13:00:00', '2017-07-01 12:00:00',
 '2017-07-01 11:00:00']
Length: 23674, dtype: datetime64[ns]

In [16]:
df['Date']

0       2020-03-13 20:00:00
1       2020-03-13 19:00:00
2       2020-03-13 18:00:00
3       2020-03-13 17:00:00
4       2020-03-13 16:00:00
                ...        
23669   2017-07-01 15:00:00
23670   2017-07-01 14:00:00
23671   2017-07-01 13:00:00
23672   2017-07-01 12:00:00
23673   2017-07-01 11:00:00
Name: Date, Length: 23674, dtype: datetime64[ns]

In [17]:
# Now we should be able to use methods related to date and time
df.loc[0, 'Date'].day_name()

'Friday'

We can pass in some arguments so that it automatically reads the column as date time. This is so annoying since many parts are already deprecated or remove. 

So we have to import a function of datetime, and then we have to use a lambda function to strip the string. Then, the date_format will convert each row from the column to the proper date time format.

I suggest to not do this though since the date_parser will be deprecated.

In [37]:
from datetime import datetime

d_parser = lambda x : datetime.strptime(x, '%Y-%m-%d %I-%p')
df = pd.read_csv('data/ETH_1h.csv', parse_dates = ['Date'], date_parser = d_parser)

  df = pd.read_csv('data/ETH_1h.csv', parse_dates = ['Date'], date_parser = d_parser)


In [38]:
df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-03-13 20:00:00,ETHUSD,129.94,131.82,126.87,128.71,1940673.93
1,2020-03-13 19:00:00,ETHUSD,119.51,132.02,117.1,129.94,7579741.09
2,2020-03-13 18:00:00,ETHUSD,124.47,124.85,115.5,119.51,4898735.81
3,2020-03-13 17:00:00,ETHUSD,124.08,127.42,121.63,124.47,2753450.92
4,2020-03-13 16:00:00,ETHUSD,124.85,129.51,120.17,124.08,4461424.71


In [36]:
df['Date'].unique()

<DatetimeArray>
['2020-03-13 20:00:00', '2020-03-13 19:00:00', '2020-03-13 18:00:00',
 '2020-03-13 17:00:00', '2020-03-13 16:00:00', '2020-03-13 15:00:00',
 '2020-03-13 14:00:00', '2020-03-13 13:00:00', '2020-03-13 12:00:00',
 '2020-03-13 11:00:00',
 ...
 '2017-07-01 20:00:00', '2017-07-01 19:00:00', '2017-07-01 18:00:00',
 '2017-07-01 17:00:00', '2017-07-01 16:00:00', '2017-07-01 15:00:00',
 '2017-07-01 14:00:00', '2017-07-01 13:00:00', '2017-07-01 12:00:00',
 '2017-07-01 11:00:00']
Length: 23674, dtype: datetime64[ns]

In pandas, the ```.dt``` accessor is used to work with datetime properties on a datetime64[ns] series.

In [47]:
df['Date'].dt.day_name()

0          Friday
1          Friday
2          Friday
3          Friday
4          Friday
           ...   
23669    Saturday
23670    Saturday
23671    Saturday
23672    Saturday
23673    Saturday
Name: Date, Length: 23674, dtype: object

In [46]:
df['Date'].dt.day_name().value_counts()

Date
Thursday     3384
Wednesday    3384
Tuesday      3384
Monday       3384
Sunday       3384
Friday       3381
Saturday     3373
Name: count, dtype: int64

In [50]:
df['DayofWeek'] = df['Date'].dt.day_name()

In [51]:
df

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,Day of Week,DayofWeek
0,2020-03-13 20:00:00,ETHUSD,129.94,131.82,126.87,128.71,1940673.93,Friday,Friday
1,2020-03-13 19:00:00,ETHUSD,119.51,132.02,117.10,129.94,7579741.09,Friday,Friday
2,2020-03-13 18:00:00,ETHUSD,124.47,124.85,115.50,119.51,4898735.81,Friday,Friday
3,2020-03-13 17:00:00,ETHUSD,124.08,127.42,121.63,124.47,2753450.92,Friday,Friday
4,2020-03-13 16:00:00,ETHUSD,124.85,129.51,120.17,124.08,4461424.71,Friday,Friday
...,...,...,...,...,...,...,...,...,...
23669,2017-07-01 15:00:00,ETHUSD,265.74,272.74,265.00,272.57,1500282.55,Saturday,Saturday
23670,2017-07-01 14:00:00,ETHUSD,268.79,269.90,265.00,265.74,1702536.85,Saturday,Saturday
23671,2017-07-01 13:00:00,ETHUSD,274.83,274.93,265.00,268.79,3010787.99,Saturday,Saturday
23672,2017-07-01 12:00:00,ETHUSD,275.01,275.01,271.00,274.83,824362.87,Saturday,Saturday


In [53]:
df.drop(columns = 'Day of Week', inplace = True)

In [56]:
# This will give us the earliest and then also the most recent date.
df['Date'].min()

Timestamp('2017-07-01 11:00:00')

In [57]:
df['Date'].max()

Timestamp('2020-03-13 20:00:00')

In [58]:
# We can also get the amount of time between the two dates. It will return a time delta
df['Date'].max() - df['Date'].min()

Timedelta('986 days 09:00:00')

In [59]:
filt = (df['Date'] >= '2020')
df.loc[filt]

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayofWeek
0,2020-03-13 20:00:00,ETHUSD,129.94,131.82,126.87,128.71,1940673.93,Friday
1,2020-03-13 19:00:00,ETHUSD,119.51,132.02,117.10,129.94,7579741.09,Friday
2,2020-03-13 18:00:00,ETHUSD,124.47,124.85,115.50,119.51,4898735.81,Friday
3,2020-03-13 17:00:00,ETHUSD,124.08,127.42,121.63,124.47,2753450.92,Friday
4,2020-03-13 16:00:00,ETHUSD,124.85,129.51,120.17,124.08,4461424.71,Friday
...,...,...,...,...,...,...,...,...
1744,2020-01-01 04:00:00,ETHUSD,129.57,130.00,129.50,129.56,702786.82,Wednesday
1745,2020-01-01 03:00:00,ETHUSD,130.37,130.44,129.38,129.57,496704.23,Wednesday
1746,2020-01-01 02:00:00,ETHUSD,130.14,130.50,129.91,130.37,396315.72,Wednesday
1747,2020-01-01 01:00:00,ETHUSD,128.34,130.14,128.32,130.14,635419.40,Wednesday


In [61]:
# If we want only for 2019
filt = (df['Date'] >= '2019') & (df['Date'] < '2020')
df.loc[filt]

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayofWeek
1749,2019-12-31 23:00:00,ETHUSD,128.33,128.69,128.14,128.54,440678.91,Tuesday
1750,2019-12-31 22:00:00,ETHUSD,128.38,128.69,127.95,128.33,554646.02,Tuesday
1751,2019-12-31 21:00:00,ETHUSD,127.86,128.43,127.72,128.38,350155.69,Tuesday
1752,2019-12-31 20:00:00,ETHUSD,127.84,128.34,127.71,127.86,428183.38,Tuesday
1753,2019-12-31 19:00:00,ETHUSD,128.69,128.69,127.60,127.84,1169847.84,Tuesday
...,...,...,...,...,...,...,...,...
10504,2019-01-01 04:00:00,ETHUSD,130.75,133.96,130.74,131.96,2791135.37,Tuesday
10505,2019-01-01 03:00:00,ETHUSD,130.06,130.79,130.06,130.75,503732.63,Tuesday
10506,2019-01-01 02:00:00,ETHUSD,130.79,130.88,129.55,130.06,838183.43,Tuesday
10507,2019-01-01 01:00:00,ETHUSD,131.62,131.62,130.77,130.79,434917.99,Tuesday


In [65]:
# If we need the specific date, we can use pd.Timestamp
filt = (df['Date'] >= pd.Timestamp('2019-01-01')) & (df['Date'] <= pd.Timestamp('2019-11-05'))
df.loc[filt]

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayofWeek
3116,2019-11-05 00:00:00,ETHUSD,186.25,186.26,185.37,185.40,706059.84,Tuesday
3117,2019-11-04 23:00:00,ETHUSD,185.53,186.34,185.50,186.25,216435.78,Monday
3118,2019-11-04 22:00:00,ETHUSD,186.14,186.48,185.34,185.53,381294.62,Monday
3119,2019-11-04 21:00:00,ETHUSD,187.28,187.53,185.86,186.14,534089.00,Monday
3120,2019-11-04 20:00:00,ETHUSD,186.37,189.50,186.37,187.28,2594660.01,Monday
...,...,...,...,...,...,...,...,...
10504,2019-01-01 04:00:00,ETHUSD,130.75,133.96,130.74,131.96,2791135.37,Tuesday
10505,2019-01-01 03:00:00,ETHUSD,130.06,130.79,130.06,130.75,503732.63,Tuesday
10506,2019-01-01 02:00:00,ETHUSD,130.79,130.88,129.55,130.06,838183.43,Tuesday
10507,2019-01-01 01:00:00,ETHUSD,131.62,131.62,130.77,130.79,434917.99,Tuesday


In [66]:
# We can set our index so that it using the date time column
df.set_index('Date', inplace = True)
df

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayofWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-13 20:00:00,ETHUSD,129.94,131.82,126.87,128.71,1940673.93,Friday
2020-03-13 19:00:00,ETHUSD,119.51,132.02,117.10,129.94,7579741.09,Friday
2020-03-13 18:00:00,ETHUSD,124.47,124.85,115.50,119.51,4898735.81,Friday
2020-03-13 17:00:00,ETHUSD,124.08,127.42,121.63,124.47,2753450.92,Friday
2020-03-13 16:00:00,ETHUSD,124.85,129.51,120.17,124.08,4461424.71,Friday
...,...,...,...,...,...,...,...
2017-07-01 15:00:00,ETHUSD,265.74,272.74,265.00,272.57,1500282.55,Saturday
2017-07-01 14:00:00,ETHUSD,268.79,269.90,265.00,265.74,1702536.85,Saturday
2017-07-01 13:00:00,ETHUSD,274.83,274.93,265.00,268.79,3010787.99,Saturday
2017-07-01 12:00:00,ETHUSD,275.01,275.01,271.00,274.83,824362.87,Saturday


In [72]:
# It needs to be a string apparently???
df.loc['2019']

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayofWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-31 23:00:00,ETHUSD,128.33,128.69,128.14,128.54,440678.91,Tuesday
2019-12-31 22:00:00,ETHUSD,128.38,128.69,127.95,128.33,554646.02,Tuesday
2019-12-31 21:00:00,ETHUSD,127.86,128.43,127.72,128.38,350155.69,Tuesday
2019-12-31 20:00:00,ETHUSD,127.84,128.34,127.71,127.86,428183.38,Tuesday
2019-12-31 19:00:00,ETHUSD,128.69,128.69,127.60,127.84,1169847.84,Tuesday
...,...,...,...,...,...,...,...
2019-01-01 04:00:00,ETHUSD,130.75,133.96,130.74,131.96,2791135.37,Tuesday
2019-01-01 03:00:00,ETHUSD,130.06,130.79,130.06,130.75,503732.63,Tuesday
2019-01-01 02:00:00,ETHUSD,130.79,130.88,129.55,130.06,838183.43,Tuesday
2019-01-01 01:00:00,ETHUSD,131.62,131.62,130.77,130.79,434917.99,Tuesday


In [82]:
# For a specific range, we can use a splice
df.sort_index(inplace = True)
df['2020-01':'2020-02']

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayofWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 00:00:00,ETHUSD,128.54,128.54,128.12,128.34,245119.91,Wednesday
2020-01-01 01:00:00,ETHUSD,128.34,130.14,128.32,130.14,635419.40,Wednesday
2020-01-01 02:00:00,ETHUSD,130.14,130.50,129.91,130.37,396315.72,Wednesday
2020-01-01 03:00:00,ETHUSD,130.37,130.44,129.38,129.57,496704.23,Wednesday
2020-01-01 04:00:00,ETHUSD,129.57,130.00,129.50,129.56,702786.82,Wednesday
...,...,...,...,...,...,...,...
2020-02-29 19:00:00,ETHUSD,225.09,225.85,223.87,225.31,1250856.20,Saturday
2020-02-29 20:00:00,ETHUSD,225.31,225.33,223.50,224.63,511648.65,Saturday
2020-02-29 21:00:00,ETHUSD,224.63,225.14,222.74,223.48,561158.03,Saturday
2020-02-29 22:00:00,ETHUSD,223.48,223.59,222.14,223.35,535998.57,Saturday


In [85]:
df['2020-01':'2020-02']['Close'].mean()

np.float64(195.1655902777778)

In [89]:
# What would happen if I only want daily viewing for the max High price?
df.loc['2020-01-01']['High'].max()

np.float64(132.68)

### Resampling
If you put D then Day, W for Week, so and so forth in the documentation
But if you leave it as that, that's kinda incomplete. You also need the function like the max value.

In [93]:
df['High'].resample('D').max()

Date
2017-07-01    279.99
2017-07-02    293.73
2017-07-03    285.00
2017-07-04    282.83
2017-07-05    274.97
               ...  
2020-03-09    208.65
2020-03-10    206.28
2020-03-11    202.98
2020-03-12    195.64
2020-03-13    148.00
Freq: D, Name: High, Length: 987, dtype: float64

In [94]:
highs = df['High'].resample('D').max()
highs['2020-01-01']

np.float64(132.68)

In [95]:
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'