In [0]:
import pandas as pd

# Assignment 1: Date Formats and Date Parts

* First, convert the `date` column to datetime64, by any method.

* Then, create a column representing the time difference between the last date in the data and each date. 

* Next, create columns for the date parts year, month, and weekday.

* Finally, format the date to Year-Month-Day (This will be a string/object).

In [0]:
# Load the transactions file, asking read_csv to parse the `date` column as dates

transactions = pd.read_csv(
    "../retail/transactions.csv",
    parse_dates=["date"],
)

In [0]:
# Alternative conversion: run the column through pd.to_datetime, then store it back
parsed_dates = pd.to_datetime(transactions["date"])
transactions["date"] = parsed_dates

In [0]:
# Alternative conversion: cast the column with astype.
# The dtype string carries an explicit unit ("datetime64[ns]") because pandas 2.x
# raises a TypeError for the unit-less spelling "datetime64".

transactions["date"] = transactions["date"].astype("datetime64[ns]")

In [0]:
# Inspect the column dtypes (the `date` column should now be datetime64)
transactions.info()

In [0]:
# Preview the first rows of the frame
transactions.head()

In [0]:
# Calculate the maximum datetime in the `date` column
transactions["date"].max()

In [0]:
# Build every derived column in one assign call. All values are computed from the
# datetime `date` column before assign replaces that column with strings.
dates = transactions["date"]

transactions = transactions.assign(
    # Gap between the latest date in the data and each row's date
    time_to_last_date=dates.max() - dates,
    # Date parts
    year=dates.dt.year,
    month=dates.dt.month,
    day_of_week=dates.dt.dayofweek,
    # Overwrite `date` with its formatted string form (%B is the full month name)
    date=dates.dt.strftime("%Y-%B-%d"),
)

transactions.head()

# Assignment 2: Time Arithmetic


Hi again,

I just got word that the true max date in our data was three weeks after 2017-08-15.

* Can you add three weeks to the ‘time_to_last_date’ column? 
* Then, calculate ‘weeks_to_last_date’ by dividing the number of days in ‘time_to_last_date’ by 7.

Thanks

In [0]:
# Reload the file so `transactions` starts over with a parsed `date` column

transactions = pd.read_csv(
    "../retail/transactions.csv",
    parse_dates=["date"],
)

In [0]:
# Preview the last rows of the reloaded frame
transactions.tail()

In [0]:
# Rebuild the Assignment 1 columns; each callable receives the frame being assigned to

transactions = transactions.assign(
    year=lambda df: df["date"].dt.year,
    month=lambda df: df["date"].dt.month,
    day_of_week=lambda df: df["date"].dt.dayofweek,
    time_to_last_date=lambda df: df["date"].max() - df["date"],
)

transactions.head()

In [0]:
# Shift time_to_last_date by three weeks, then express it in weeks.
# The second callable runs after the first assignment, so it sees the shifted
# column; whole days divided by 7 gives a float number of weeks.

(
    transactions
    .assign(
        time_to_last_date=lambda df: df["time_to_last_date"] + pd.to_timedelta(3, unit="W"),
        weeks_to_last_date=lambda df: df["time_to_last_date"].dt.days / 7,
    )
    .head()
)

# Assignment 3: Missing Time Series Data

Take a look at the mean value for the oil price using forward fill, backfill, and interpolation. Are they very different?

Then, plot the series with forward fill for:
 * The year 2014.
 * The month of December 2014.
 * The days from December 1st to December 15th, 2014.


In [0]:
# Load the oil prices with `date` as the index; parse_dates=True parses the index as dates
oil = pd.read_csv("../retail/oil.csv", parse_dates=True, index_col="date")

In [0]:
# The dtype displayed for the index is a synonym for datetime64

oil.index.dtype

In [0]:
# Mean of the original series, before any missing values are filled

oil.mean()

In [0]:
# Plot of the original series, before any missing values are filled

oil.plot()

In [0]:
# Compare the mean under each way of filling missing values in the time series

forward_filled_mean = oil.ffill().mean()
back_filled_mean = oil.bfill().mean()
interpolated_mean = oil.interpolate().mean()

print(forward_filled_mean, back_filled_mean, interpolated_mean)

In [0]:
# Select the 2014 rows by partial date string, then forward fill and plot

oil_2014 = oil.loc["2014"]
oil_2014.ffill().plot()

In [0]:
# Select the December 2014 rows by partial date string, then forward fill and plot

oil_dec_2014 = oil.loc["2014-12"]
oil_dec_2014.ffill().plot()

In [0]:
# Select December 1st through December 15th, 2014 (label slices include both
# endpoints), then forward fill and plot

oil_early_dec_2014 = oil.loc["2014-12-01":"2014-12-15"]
oil_early_dec_2014.ffill().plot()

# Assignment 4: Shift and Diff

Hello,
I’m looking into a few different year over year trends related to changes made at store 47.

Can you plot the monthly sum of transactions in 2015 vs. the monthly sum of transactions in the prior year for store 47?

Make sure to group your DataFrame by year AND month!

Thanks

In [0]:
# Keep only store 47's rows and the two columns needed; selecting columns via loc
# also drops store_nbr.
transactions_47 = transactions.loc[transactions["store_nbr"] == 47, ["date", "transactions"]]

# Group keys are taken from the filtered frame itself (not the full `transactions`
# frame) and named so the two index levels are distinguishable.
year_key = transactions_47["date"].dt.year.rename("year")
month_key = transactions_47["date"].dt.month.rename("month")

# Sum transactions per (year, month). Only the numeric column is aggregated:
# summing the datetime `date` column raises a TypeError on pandas 2.x.
transactions_47 = (transactions_47
                   .groupby([year_key, month_key])[["transactions"]]
                   .sum())

# Create a 'year_prior' column by shifting the monthly series forward by 12 rows.
# This lines up with the same month one year earlier only if every month is
# present for this store -- TODO confirm.
transactions_47["year_prior"] = transactions_47["transactions"].shift(12)

# Filter to 2015 (first index level) and plot both columns against month
transactions_47.loc[2015].plot();

# Assignment 5: Resampling Time Series

Plot the monthly and yearly average oil prices.

In [0]:
# Preview the first rows of the oil price series
oil.head()

In [0]:
# Monthly average oil price.
# Resample with the month alias "M"; the previous alias "Y" produced yearly
# averages, which did not match this cell's stated purpose.

oil.resample("M").mean().plot()

In [0]:
# Plot the average oil price at several resampling frequencies, one figure each.
# Every figure gets a title so the four otherwise similar charts can be told apart.

period_labels = {"D": "Daily", "W": "Weekly", "M": "Monthly", "A": "Yearly"}

for period, label in period_labels.items():
    oil.resample(period).mean().plot(title=f"{label} average oil price")

# Assignment 6: Rolling Averages

Plot the 90-day moving average for transactions for store 47.

This will help remove some of the noise from our series.

Thanks!


In [0]:
# Rebuild transactions_47 from the full frame, this time indexed by date

is_store_47 = transactions["store_nbr"] == 47
transactions_47 = transactions.loc[is_store_47, ["date", "transactions"]].set_index("date")

transactions_47.head()

In [0]:
# Add a 90-row rolling mean of the transactions column, drop the raw column, and plot.
# The rolling mean is taken on the `transactions` Series: assigning a one-column
# DataFrame to a single column only works while the frame has exactly one column.
# NOTE(review): rolling(90) counts rows, so the window spans 90 calendar days only
# if no dates are missing for this store -- TODO confirm.

(transactions_47
 .assign(transactions_rolling_avg_90=lambda df: df["transactions"].rolling(90).mean())
 .drop(columns="transactions")
 .plot()
)

In [0]:
# Original daily series for comparison, plotted without any smoothing

transactions_47.plot()