In [0]:
import pandas as pd
import numpy as np

# Assignment 1: DataFrame Basics

Hi there!

Can you read in the transactions dataset and report on:

* The number of rows and columns
* The names of the columns
* The datatypes of each column

In [0]:
# A common practice is to create a path variable to pass to read_csv.
# The location is relative, so it is resolved against the notebook's working directory.
path = "../retail/transactions.csv"




In [0]:
# Load the transactions CSV from the location stored in `path`
transactions = pd.read_csv(filepath_or_buffer=path)

In [0]:
transactions

In [0]:
transactions.shape

In [0]:
transactions.dtypes

In [0]:
transactions.index.max()


# Assignment 2: Exploring DataFrames

Hello!

* Can you quickly inspect the first 5 rows of the transactions data? 

* Then, dive a bit more deeply into the data and check if there are any missing values.
* What about the number of unique dates? I want to make sure we didn’t leave any out.
* Finally, can you report the mean, median, min and max of “transactions”?  I want to check for any anomalies in our data.


In [0]:
transactions

In [0]:
transactions.head(5)

In [0]:
transactions.info()

In [0]:
transactions.describe(include='all')

# Assignment 3: Accessing DataFrames

Hi, starting to dive deeper into this data.

I noticed that the first row is the only one from 2013-01-01.

* Can you get me a copy of the DataFrame that excludes that row, and only includes “store_nbr” and “transactions”?
* Also, can you report the number of unique store numbers?
* Finally, return the total number of transactions in millions


In [0]:
transactions

In [0]:
transactions.head()

In [0]:
 transactions.tail()

# Assignment 4: Dropping Data and Duplicates

Hi there!

Can you:

1. Drop the first row of data? We want it permanently removed. 
2. Drop the date column but not in place
3. Return a dataframe that only includes the last row for each of the stores.

Thanks!

In [0]:
transactions.head()

In [0]:
transactions.info()

In [0]:
# Permanently remove the row with index label 0 (the first row of the data).
# Rebinding the name instead of using inplace=True, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing label.
transactions = transactions.drop(index=0, errors="ignore")
# Preview only; displaying the whole frame floods the notebook output.
transactions.head()

In [0]:
# Show the frame without its date column; nothing is assigned, so `transactions` is unchanged
transactions.drop(columns="date")

In [0]:
transactions

In [0]:
# Flag every row that is followed by a later row for the same store,
# keep the unflagged (last-per-store) rows, and preview the first five
has_later_row = transactions.duplicated(subset="store_nbr", keep="last")
transactions.loc[~has_later_row].head()

In [0]:
# From index label 1 onward, keep only the store number and transaction count columns
wanted_columns = ['store_nbr', 'transactions']
transactions.loc[1:, wanted_columns]

In [0]:
transactions.date

In [0]:
# Same row selection, with the columns taken as a label slice from store_nbr through transactions
transactions.loc[1:].loc[:, "store_nbr":"transactions"]

In [0]:
# Number of distinct store numbers
transactions['store_nbr'].nunique()

In [0]:
# Grand total of the transactions column, expressed in millions
transactions['transactions'].sum() / 1_000_000

In [0]:
transactions.head()


# Assignment 5: Missing Data

Hello, 

Can you tell if any dates or prices are missing in the oil dataset?

Then compare the mean of the oil series when filling in with mean vs. filling in with 0.

Thanks!

In [0]:
# Load the oil price CSV
oil = pd.read_csv(filepath_or_buffer="../retail/oil.csv")

In [0]:
oil.info()

In [0]:
# Count of missing values in each column of the oil data
oil.isnull().sum()


In [0]:
# Mean of the price column when missing values are replaced by 0
oil['dcoilwtico'].fillna(value=0).mean()

In [0]:
# Mean of the price column when missing values are replaced by the column's own mean
oil_price = oil['dcoilwtico']
oil_price.fillna(oil_price.mean()).mean()

# Assignment 6: Filtering DataFrames

I need some quick research on store 25:

* First, calculate the percentage of times ALL stores had more than 2000 transactions
* Then, calculate the percentage of times store 25 had more than 2000 transactions, and calculate the sum of transactions on these days
* Finally, sum the transactions for stores 25 and 31, that occurred in May or June, and had less than 2000 transactions


In [0]:
transactions.head()

In [0]:
# Exploratory filter: keep rows whose `date` text has the character '1' at position 6 (0-based).
# NOTE(review): presumably dates look like YYYY-MM-DD, in which case position 6 is the
# second month digit and this matches January and November -- confirm the intended months.
transactions.query("date.str[6] == '1'")

In [0]:
# Fraction of all rows with more than 2000 transactions (mean of a boolean column)
transactions['transactions'].gt(2000).mean()

In [0]:
# Rows that belong to store 25 and have more than 2000 transactions
mask = transactions['store_nbr'].eq(25) & transactions['transactions'].gt(2000)


In [0]:
# Store 25 rows above 2000 transactions as a fraction of all store 25 rows
is_store_25 = transactions['store_nbr'] == 25
busy_day_count = transactions.loc[mask, "transactions"].count()
store_day_count = transactions.loc[is_store_25, "transactions"].count()
busy_day_count / store_day_count

In [0]:
# Total transactions over the rows selected by `mask`
transactions.loc[mask, "transactions"].sum()

In [0]:
# Sum transactions for stores 25 and 31 on May or June days with fewer than 2000 transactions.
# The month is parsed from `date` instead of being read off a fixed character
# position, so the filter works whether `date` holds text or datetimes.
date_month = pd.to_datetime(transactions['date']).dt.month
low_volume_rows = (
    transactions['store_nbr'].isin([25, 31])
    & date_month.isin([5, 6])
    & (transactions['transactions'] < 2000)
)
transactions.loc[low_volume_rows, 'transactions'].sum()

# Assignment 7: Sorting DataFrames

Hi there,
* Can you get me a dataset that includes the 5 days with the highest transaction counts? Any similarities between them?
* Then, can you get me a dataset sorted by date from earliest to most recent, but with the highest transactions first and the lowest transactions last for each day?
* Finally, sort the columns in reverse alphabetical order. 

Thanks!


In [0]:
transactions.head()

In [0]:
# Order rows from highest to lowest transaction count and keep the first five
transactions.sort_values(by='transactions', ascending=False).head(5)

In [0]:
# Dates ascending; within each date, transaction counts descending
sort_keys = ['date', 'transactions']
transactions.sort_values(by=sort_keys, ascending=[True, False])

In [0]:
# Reorder the columns by label, descending
transactions.sort_index(axis="columns", ascending=False)

# Assignment 8: Modifying Columns

Just some quick work, but can you send me the transaction data with the columns renamed?

* Rename `transactions` to `transaction_count` and `store_nbr` to `store_number`.
* Reorder the columns so date is first, then store number, then transaction count.

Thanks!


In [0]:
transactions.head()

In [0]:
# Display the frame with the two columns relabelled; `transactions` itself is not modified
new_names = {'store_nbr': 'store_number', 'transactions': 'transaction_count'}
transactions.rename(columns=new_names)

In [0]:
# Rename and reorder in one chain. rename() returns a new frame, so the new
# labels only exist on that result; reindexing `transactions` directly with
# labels it does not have yields all-NaN columns. The label is
# 'transaction_count' (singular), matching the rename mapping.
# `transactions` itself keeps its original column names.
(
    transactions
    .rename(columns={'transactions': 'transaction_count', 'store_nbr': 'store_number'})
    .reindex(columns=['date', 'store_number', 'transaction_count'])
)

# Assignment 9: Column Creation

Just some quick work, but can you send me the transaction data with a few new columns added?

* Create a `pct_to_target` column that divides transactions by 2500.
* Then, create a `met_target` column that returns True if `pct_to_target` is greater than or equal to 1.
* Next, create a `bonus_payable` column that equals 100 if `met_target` is True, and 0 if not. Then sum the bonus payable column.
* Finally, create columns for month and day of week as integers. There is some helper code for these dateparts below.



Thanks!



In [0]:
transactions.head()

In [0]:
# Transactions as a fraction of the 2500 target
transactions['pct_to_target'] = transactions['transactions'].div(2500)

In [0]:
transactions

In [0]:
# True where the target was reached or exceeded
transactions['met_target'] = transactions['pct_to_target'].ge(1)

In [0]:
transactions

In [0]:
# Booleans multiply as 0/1, giving 100 where the target was met and 0 elsewhere
transactions['bonus_payable'] = transactions['met_target'].mul(100)

In [0]:
transactions

In [0]:
# Total bonus payable across all rows
transactions['bonus_payable'].sum()

In [0]:
# Parse the date column and store its month number
parsed_dates = pd.to_datetime(transactions['date'])
transactions['month'] = parsed_dates.dt.month

In [0]:
transactions

In [0]:
# Parse the date column and store its day-of-week number
transactions['day_of_week'] = pd.to_datetime(transactions['date']).dt.dayofweek

In [0]:
transactions

# Assignment 10: np.select

Hi there! I need a few columns created.

1. Create a ‘seasonal_bonus’ column that applies to these dates: 
    * All days in December (month = 12)
    * Sundays (day_of_week = 6) in May (month = 5)
    * Mondays (day_of_week = 0) in July (month = 7)
2. Call the December bonus ‘Holiday Bonus’, the May bonus ‘Corporate Month’, and the July bonus ‘Summer Special’. If no bonus applies, the column should display ‘None’. 
3. Finally, calculate the total bonus owed at $100 per day.

Thanks!

In [0]:
transactions.head()

In [0]:
# One boolean condition per bonus, paired by position with a label in `choices`.
# np.select takes the first matching condition for each row and falls back to
# `default` when none match.
conditions = [
    (transactions['month'] == 12),
    (transactions['month'] == 5) & (transactions['day_of_week'] == 6),
    (transactions['month'] == 7) & (transactions['day_of_week'] == 0)
]
# Label spelling fixed: the requested name is 'Holiday Bonus'.
choices = ["Holiday Bonus", "Corporate Month", "Summer Special"]
transactions["seasonal_bonus"] = np.select(conditions, choices, default='None')
transactions.head()

In [0]:
# Total owed at $100 per bonus day. Count rows whose label is anything other
# than 'None' directly, instead of skipping the first value_counts() entry,
# which is only correct when 'None' happens to be the most frequent label.
transactions['seasonal_bonus'].ne('None').sum() * 100

# Assignment 11: Assign 

* Drop the columns that have been created so far (keep only `date`, `store_nbr`, and `transactions`), and recreate them using the assign method.
* Then sum the seasonal bonus owed once again to make sure the numbers are correct.


In [0]:
# Drop columns we created in prior exercises


In [0]:
# Create same columns with assign


In [0]:
transactions

In [0]:
# Rebuild every derived column in one assign() chain, starting from the three
# base columns only. Each new column is a callable evaluated against the frame
# being built, so later keywords can use columns created by earlier ones.
# assign() returns a new frame and never modifies in place, so the result is
# bound back to `transactions`.
# `choices` is the list of bonus labels defined in the Assignment 10 cell.
transactions = (
    transactions
    .loc[:, ['date', 'store_nbr', 'transactions']]
    .assign(
        pct_to_target=lambda df: df['transactions'] / 2500,
        met_target=lambda df: df['pct_to_target'] >= 1,
        bonus_payable=lambda df: df['met_target'] * 100,
        # to_datetime works whether `date` is still text or already datetime;
        # the .dt accessor alone fails on a text column.
        month=lambda df: pd.to_datetime(df['date']).dt.month,
        day_of_week=lambda df: pd.to_datetime(df['date']).dt.dayofweek,
        # Conditions are rebuilt from the freshly created month/day_of_week
        # columns; default matches the 'None' label used in Assignment 10.
        seasonal_bonus=lambda df: np.select(
            [
                df['month'] == 12,
                (df['month'] == 5) & (df['day_of_week'] == 6),
                (df['month'] == 7) & (df['day_of_week'] == 0),
            ],
            choices,
            default='None',
        ),
    )
)
transactions.head()



In [0]:
transactions

# Assignment 12: Memory Optimization

Reduce the memory usage of the transactions DataFrame to below 5MB.

In [0]:
transactions.head()

In [0]:
transactions.info(memory_usage="deep")

In [0]:
# Shrink column storage.
# - date: parse to datetime with to_datetime; casting to the unit-less
#   'datetime64' dtype is rejected by pandas 2.x, and to_datetime is also a
#   no-op if the column is already datetime.
# - bonus_payable: column name spelled correctly ('bouns_payable' does not
#   exist and raises KeyError); it is built as met_target * 100, so int8 holds it.
# - day_of_week: comes from the datetime day-of-week accessor, so int8 holds it.
transactions['date'] = pd.to_datetime(transactions['date'])
transactions['bonus_payable'] = transactions['bonus_payable'].astype('int8')
transactions['day_of_week'] = transactions['day_of_week'].astype('int8')

In [0]:
transactions.info(memory_usage='deep')

In [0]:
transactions