In [1]:
# Import pandas and the standard-library datetime helpers
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Read the customer table and the in-app purchase table from CSV, in that order.
customer_data, app_purchases = map(
    pd.read_csv,
    (
        'datasets/Customer_dataset.csv',
        'datasets/inapp_purchases_dataset.csv',
    ),
)

# Show the column index of each frame (customers first, then purchases)
# so the columns the two tables share can be spotted.
print(customer_data.columns, app_purchases.columns, sep='\n')

Index(['uid', 'reg_date', 'device', 'gender', 'country', 'age'], dtype='object')
Index(['date', 'uid', 'sku', 'price'], dtype='object')


In [3]:
# Preview the first five customer rows to see what the raw values look like.
customer_data.head(n=5)

Unnamed: 0,uid,reg_date,device,gender,country,age
0,54030035.0,2017-06-29T00:00:00Z,and,M,USA,19
1,72574201.0,2018-03-05T00:00:00Z,iOS,F,TUR,22
2,64187558.0,2016-02-07T00:00:00Z,iOS,M,USA,16
3,92513925.0,2017-05-25T00:00:00Z,and,M,BRA,41
4,99231338.0,2017-03-26T00:00:00Z,iOS,M,FRA,59


In [4]:
# Convert reg_date from timestamp strings (e.g. 2017-06-29T00:00:00Z) to
# pandas datetimes, overwriting the column on the existing frame.
customer_data["reg_date"] = customer_data["reg_date"].pipe(pd.to_datetime)

In [5]:
# Add a new column, reg_date_1, holding the registration timestamp formatted
# as a plain 'YYYY-MM-DD' string, the same layout the purchase 'date' column
# is displayed in.
customer_data['reg_date_1'] = (
    customer_data['reg_date']
    .dt.strftime('%Y-%m-%d')
)

In [6]:
# Preview the first five purchase rows to see what the raw values look like.
app_purchases.head(n=5)

Unnamed: 0,date,uid,sku,price
0,2017-07-10,41195147,sku_three_499,499
1,2017-07-15,41195147,sku_three_499,499
2,2017-11-12,41195147,sku_four_599,599
3,2017-09-26,91591874,sku_two_299,299
4,2017-12-01,91591874,sku_four_599,599


In [7]:
# Inner-join purchases to customers on the user id AND the day: a purchase row
# is kept only when its 'date' equals that user's registration day (reg_date_1).
uid_date_combined_data = pd.merge(
    app_purchases,
    customer_data,
    how='inner',
    left_on=['uid', 'date'],
    right_on=['uid', 'reg_date_1'],
)

# Inspect the joined frame: first five rows, followed by the total row count.
print(uid_date_combined_data.head(), len(uid_date_combined_data), sep='\n')

         date       uid             sku  price                  reg_date  \
0  2016-03-30  94055095    sku_four_599    599 2016-03-30 00:00:00+00:00   
1  2015-10-28  69627745     sku_one_199    199 2015-10-28 00:00:00+00:00   
2  2017-02-02  11604973  sku_seven_1499    499 2017-02-02 00:00:00+00:00   
3  2016-06-05  22495315    sku_four_599    599 2016-06-05 00:00:00+00:00   
4  2018-02-17  51365662     sku_two_299    299 2018-02-17 00:00:00+00:00   

  device gender country  age  reg_date_1  
0    iOS      F     BRA   16  2016-03-30  
1    and      F     BRA   18  2015-10-28  
2    and      F     USA   16  2017-02-02  
3    and      F     USA   19  2016-06-05  
4    iOS      M     TUR   16  2018-02-17  
35


In [8]:
# Rich-display preview of the first five rows of the merged frame.
uid_date_combined_data.head(n=5)

Unnamed: 0,date,uid,sku,price,reg_date,device,gender,country,age,reg_date_1
0,2016-03-30,94055095,sku_four_599,599,2016-03-30 00:00:00+00:00,iOS,F,BRA,16,2016-03-30
1,2015-10-28,69627745,sku_one_199,199,2015-10-28 00:00:00+00:00,and,F,BRA,18,2015-10-28
2,2017-02-02,11604973,sku_seven_1499,499,2017-02-02 00:00:00+00:00,and,F,USA,16,2017-02-02
3,2016-06-05,22495315,sku_four_599,599,2016-06-05 00:00:00+00:00,and,F,USA,19,2016-06-05
4,2018-02-17,51365662,sku_two_299,299,2018-02-17 00:00:00+00:00,iOS,M,TUR,16,2018-02-17


### Practicing aggregations
It's time to begin exploring the in-app purchase data in more detail. Here, you will practice aggregating the dataset in various ways with the `.agg()` method, then examine the results to build an understanding of the overall data and get a feel for how aggregation works in pandas.

In [9]:
# Average purchase price over every row of the purchases table,
# computed through .agg() with a single aggregation name.
purchase_price_mean = app_purchases['price'].agg('mean')

# Show the resulting scalar.
print(purchase_price_mean)

406.77259604707973


In [10]:
# Passing a list of aggregation names to .agg() returns a Series with one
# entry per statistic: here the mean and the median purchase price.
purchase_price_summary = app_purchases['price'].agg(['mean', 'median'])

# Show both statistics.
print(purchase_price_summary)

mean      406.772596
median    299.000000
Name: price, dtype: float64


In [11]:
# Mean and median of both price and age on the merged frame. The dict maps
# each column name to the list of statistics to compute for it.
purchase_summary = uid_date_combined_data.agg(
    {column: ['mean', 'median'] for column in ('price', 'age')}
)

# Show the statistics-by-column table.
print(purchase_summary)

             price        age
mean    470.428571  22.771429
median  499.000000  20.000000


In [12]:
# Split the merged frame into one group per (device, gender) combination.
grouped_purchase_data = uid_date_combined_data.groupby(['device', 'gender'])

# For each group, compute the mean, median and standard deviation of price.
# Note: this rebinds purchase_summary, replacing any earlier value of that name.
purchase_summary = grouped_purchase_data.agg(
    {'price': ['mean', 'median', 'std']}
)

# Show the per-group price statistics.
print(purchase_summary)

                    price                   
                     mean median         std
device gender                               
and    F       477.571429  499.0  200.685638
       M       511.500000  549.0  229.518129
iOS    F       427.571429  599.0  221.466971
       M       449.000000  499.0  164.316767
