In [0]:
import pandas as pd

# Assignment 1: Streamlined Data Ingestion

Now that we have a good idea of what we want the data prep on transactions to look like,
let's push that into the read_csv function.

Keep an eye on the memory usage before and after. 

* Change the column names to 'Date', 'Store_Number', and 'Transaction_Count'.
* Skip the first row of data.
* Convert columns to the appropriate datatypes. 

Then create the columns we created in the assign assignment in Section 3, by chaining assign with read_csv. 

Some starter code has been provided for you below. Because the dataframe object returned by read_csv doesn't have a name, we need to use a lambda function to refer to the dataframe.

```python
transactions.assign(
    target_pct=transactions["transactions"] / 2500,
    met_target=(transactions["transactions"] / 2500) >= 1,
    bonus_payable=((transactions["transactions"] / 2500) >= 1) * 100,
    month=transactions["date"].dt.month,
    day_of_week=transactions["date"].dt.dayofweek,
)
```

The first one should look like:

`target_pct = lambda x: (x["Transaction_Count"] / 2500)`


In [0]:
# Baseline: memory footprint of the file when read with pandas' default dtypes.
(
    pd.read_csv("../retail/transactions.csv")
    .info(memory_usage="deep")
)

In [0]:
# Read the transactions file with renaming, row skipping and downcasting pushed into
# read_csv, then derive the target / bonus / calendar columns in the same chain.
transactions = (
    pd.read_csv(
        "../retail/transactions.csv",
        # skiprows=[0] drops the file's own header line; header=0 then consumes the
        # next line (the first row of data) as the header, and `names` replaces it.
        # Net effect: custom column names, and the first row of data is skipped.
        header=0,
        names=["Date", "Store_Number", "Transaction_Count"],
        skiprows=[0],
        parse_dates=["Date"],                                   # Parse the date column as datetime
        # Downcast the integer columns.
        # NOTE(review): assumes store numbers fit in Int8 and counts in Int16 — confirm against the data.
        dtype={"Store_Number": "Int8", "Transaction_Count": "Int16"},
    )
    .assign(
        # The frame returned by read_csv has no name mid-chain, so each new column
        # is a lambda that receives it.
        target_pct=lambda x: x["Transaction_Count"] / 2500,
        met_target=lambda x: (x["Transaction_Count"] / 2500) >= 1,
        # The comparison must be parenthesised before scaling: `>= 1 * 100` would
        # compare against 100 and produce a boolean instead of a 0/100 bonus amount.
        bonus_payable=lambda x: ((x["Transaction_Count"] / 2500) >= 1) * 100,
        month=lambda x: x["Date"].dt.month,
        day_of_week=lambda x: x["Date"].dt.dayofweek,
    )
    .astype({                                                   # Cast new columns to compact dtypes
        "target_pct": "Float32",                                # (this could also be done inside assign)
        "month": "Int8",
        "day_of_week": "Int8",
    })
)


In [0]:
# Memory footprint after pushing dtypes into read_csv; compare with the
# default-dtype read_csv(...).info() output above to see the reduction.

transactions.info(memory_usage="deep") 

# Assignment 2: Write to Excel Sheets

Write the data in the transactions dataframe you created above into an Excel workbook.

Write out a separate sheet for each year of the data.

If you prefer, you can write each year of data to a separate csv file.

In [0]:
# Preview the first rows of the frame before writing it out.
transactions.head()

In [0]:
# Preview the last rows of the frame (shows the latest dates if the file is in date order).
transactions.tail()

In [0]:
# One workbook, one worksheet per calendar year.
# NOTE(review): the year range is hard-coded; assumes the data spans 2013-2017 — confirm.
with pd.ExcelWriter("DataForChandler.xlsx") as writer:
    for year in range(2013, 2018):                              # 2013 through 2017 inclusive
        transactions.loc[
            lambda df: df["Date"].dt.year == year               # Keep only rows dated in this year
        ].to_excel(writer, sheet_name=str(year))                # Sheet is named after the year

In [0]:
# Alternative to the workbook: one CSV file per calendar year.
# NOTE(review): the year range is hard-coded; assumes the data spans 2013-2017 — confirm.
for year in range(2013, 2018):                                  # 2013 through 2017 inclusive
    transactions.loc[
        lambda df: df["Date"].dt.year == year                   # Keep only rows dated in this year
    ].to_csv(f"transactions_{year}.csv")                        # File is named after the year