In [None]:
# Import pandas 
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [None]:
# Read the customer table and the in-app purchase table from local CSV files
customer_data = pd.read_csv('datasets/Customer_dataset.csv')
app_purchases = pd.read_csv('datasets/inapp_purchases_dataset.csv')

# Show the column index of each table: customers first, then purchases
print(customer_data.columns, app_purchases.columns, sep='\n')

In [None]:
# Preview the first five customer rows
customer_data.head(n=5)

In [None]:
# Parse the textual reg_date column into pandas datetimes
customer_data["reg_date"] = customer_data["reg_date"].pipe(pd.to_datetime)

In [None]:
# Add reg_date_1: the registration datetime rendered as a 'YYYY-MM-DD' string
customer_data['reg_date_1'] = customer_data.reg_date.dt.strftime('%Y-%m-%d')

In [None]:
# Preview the first five in-app purchase rows
app_purchases.head(n=5)

In [None]:
# Inner-join purchases to customers on uid and on date == reg_date_1, so a
# purchase row is kept only when its `date` equals that customer's reg_date_1
uid_date_combined_data = pd.merge(
    app_purchases,
    customer_data,
    how='inner',
    left_on=['uid', 'date'],
    right_on=['uid', 'reg_date_1'],
)

# Show a preview of the merged rows, followed by the merged row count
print(uid_date_combined_data.head(), len(uid_date_combined_data), sep='\n')

In [None]:
# Rich preview of the first five merged rows
uid_date_combined_data.head(n=5)

### Practicing aggregations
It's time to begin exploring the in-app purchase data in more detail. Here, you will practice aggregating the dataset in various ways using the .agg() method and then examine the results to get an understanding of the overall data, as well as a feel for how to aggregate data using pandas.

In [None]:
# Average purchase price across every in-app purchase
purchase_price_mean = app_purchases['price'].agg('mean')

# Show the result
print(purchase_price_mean)

In [None]:
# Mean and median purchase price, computed together in a single .agg() call
purchase_price_summary = app_purchases['price'].agg(['mean', 'median'])

# Show the result
print(purchase_price_summary)

In [None]:
# Mean and median for both the price and age columns of the merged data
purchase_summary = uid_date_combined_data.agg(
    {column: ['mean', 'median'] for column in ('price', 'age')}
)

# Show the result
print(purchase_summary)

In [None]:
# Split the merged data into (device, gender) groups
grouped_purchase_data = uid_date_combined_data.groupby(['device', 'gender'])

# Per-group mean, median and standard deviation of price
purchase_summary = grouped_purchase_data.agg(
    {'price': ['mean', 'median', 'std']}
)

# Show the result
print(purchase_summary)

### Calculating KPIs
You're now going to take what you've learned and work through calculating a KPI yourself. Specifically, you'll calculate the average amount paid per purchase within a user's first 28 days using the `uid_date_combined_data` DataFrame built above.

This KPI can provide a sense of the popularity of different in-app purchase price points to users within their first month.

In [None]:
# Preview the first three merged rows before computing the KPI
uid_date_combined_data.head(n=3)

In [None]:
# Reference "today": treated as the most recent date in the data
current_date = pd.Timestamp('2018-03-17')

In [None]:
# Cut-off date: 28 days before current_date
max_purchase_date = current_date - pd.Timedelta(days=28)

In [None]:
# Re-parse reg_date as datetimes and strip any timezone so the column is
# tz-naive, matching the tz-naive cut-off dates it is compared against
uid_date_combined_data['reg_date'] = (
    pd.to_datetime(uid_date_combined_data['reg_date'])
    .dt.tz_localize(None)
)

In [None]:
# Filter to only include users who registered before our max date
purchase_data_filt = uid_date_combined_data[
    uid_date_combined_data.reg_date < max_purchase_date
]

# `date` was read from CSV without date parsing and merged against the string
# column reg_date_1, so it is still text here. Parse it first: ordering text
# against datetimes is not a chronological comparison and raises a TypeError
# in current pandas.
filt_purchase_dates = pd.to_datetime(purchase_data_filt.date)

# Filter to contain only purchases within the first 28 days of registration
purchase_data_filt = purchase_data_filt[
    filt_purchase_dates <= purchase_data_filt.reg_date + timedelta(days=28)
]

# Output the mean price paid per purchase
print(purchase_data_filt.price.mean())

### Average purchase price by cohort
Building on the previous exercise, let's look at the same KPI, average purchase price, and a similar one, median purchase price, within the first 28 days. Additionally, let's look at these metrics not limited to 28 days to compare.

We can calculate these metrics across a set of cohorts and see what differences emerge. This is a useful task as it can help us understand how behaviors vary across cohorts.

Note that in our data the price variable is given in cents.

In [None]:
# Set the max registration date to be 28 days before current_date
max_reg_date = current_date - timedelta(days=28)

# `date` is still a text column (read from CSV without date parsing), so parse
# it before ordering it against the datetime reg_date column.
all_purchase_dates = pd.to_datetime(uid_date_combined_data.date)

# Month 1 values: keep the price when the user registered before max_reg_date
# and the purchase happened less than 28 days after registration; otherwise NaN
# so the row is ignored by mean/median aggregations.
month1 = np.where(
    (uid_date_combined_data.reg_date < max_reg_date)
    & (all_purchase_dates < uid_date_combined_data.reg_date + timedelta(days=28)),
    uid_date_combined_data.price,
    np.nan,  # the np.NaN alias was removed in NumPy 2.0
)

# Store the month 1 values as a new column on the merged DataFrame
uid_date_combined_data['month1'] = month1

In [None]:
# Build (gender, device) groups; as_index=False keeps the group keys as
# ordinary columns in the aggregated output instead of as the index
purchase_data_upd = uid_date_combined_data.groupby(
    ['gender', 'device'],
    as_index=False,
)

In [None]:
# Per-group mean and median of month1 and of price
purchase_summary = purchase_data_upd.agg(
    {column: ['mean', 'median'] for column in ('month1', 'price')}
)

# Show the result
print(purchase_summary)