Import the right libraries

In [211]:
import pandas as pd
import numpy as np

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CDSView, GroupFilter

Output in jupyter notebook

In [212]:
output_notebook()

Opening and preprocessing the datasets

Concatting the crashes and set the date format for the date

Crashes

In [213]:
csv_files_crashes = ['data/stats_crashes_202106_overview.csv', 
                     'data/stats_crashes_202107_overview.csv', 
                     'data/stats_crashes_202108_overview.csv',
                     'data/stats_crashes_202109_overview.csv',
                     'data/stats_crashes_202110_overview.csv',
                     'data/stats_crashes_202111_overview.csv',
                     'data/stats_crashes_202112_overview.csv']


dfs_crashes = [pd.read_csv(file, encoding='utf-16') for file in csv_files_crashes]
df_crashes = pd.concat(dfs_crashes, ignore_index=True)
df_crashes['Date'] = pd.to_datetime(df_crashes["Date"], format='%Y-%m-%d')
dfs_crashes

[          Date                         Package Name  Daily Crashes  Daily ANRs
 0   2021-06-01  com.vansteinengroentjes.apps.ddfive             15           1
 1   2021-06-02  com.vansteinengroentjes.apps.ddfive             12           1
 2   2021-06-03  com.vansteinengroentjes.apps.ddfive             20           1
 3   2021-06-04  com.vansteinengroentjes.apps.ddfive             13           0
 4   2021-06-05  com.vansteinengroentjes.apps.ddfive             14           0
 5   2021-06-06  com.vansteinengroentjes.apps.ddfive             19           2
 6   2021-06-07  com.vansteinengroentjes.apps.ddfive              4           0
 7   2021-06-08  com.vansteinengroentjes.apps.ddfive             14           0
 8   2021-06-09  com.vansteinengroentjes.apps.ddfive             19           0
 9   2021-06-10  com.vansteinengroentjes.apps.ddfive              7           0
 10  2021-06-11  com.vansteinengroentjes.apps.ddfive             19           0
 11  2021-06-12  com.vansteinengroentjes

Sales

In [293]:

csv_files_sales_1 = ['data/sales_202106.csv',
                   'data/sales_202107.csv',
                   'data/sales_202108.csv',
                   'data/sales_202109.csv',
                   'data/sales_202110.csv',]

csv_files_sales_2 = ['data/sales_202111.csv',
                   'data/sales_202112.csv']

# Convert to DataFrame
dfs_sales_1 = [pd.read_csv(file, encoding='utf-8') for file in csv_files_sales_1]
dfs_sales_2 = [pd.read_csv(file, encoding='utf-8') for file in csv_files_sales_2]

# Concatenate all DataFrames into one, resetting the index
df_sales_1 = pd.concat(dfs_sales_1, ignore_index=True)
df_sales_2 = pd.concat(dfs_sales_2, ignore_index=True)

# Convert 'Transaction Date' to pd atetime
df_sales_1['Transaction Date'] = pd.to_datetime(df_sales_1['Transaction Date'], format='%b %d, %Y')
df_sales_2['Order Charged Date'] = pd.to_datetime(df_sales_2['Order Charged Date'], format='%Y-%m-%d')

# Rename columns to match
df_sales_1 = df_sales_1.rename(columns={'Description': 'Order Number',   
                                        'Transaction Date': 'Order Charged Date',
                                        'Transaction Type': 'Financial Status',
                                        'Product id': 'Product ID',
                                        'Sku Id': 'SKU ID',
                                        'Buyer Country': 'Country of Buyer',
                                        'Buyer Postal Code': 'Postal Code of Buyer',
                                        'Amount (Merchant Currency)': 'Charged Amount'})

columns = ['Order Number', 'Order Charged Date', 'Financial Status', 
           'Product ID', 'Product Title', 'SKU ID', 
           'Postal Code of Buyer', 'Charged Amount']

# Concatenate both DataFrames
df_sales = pd.concat([df_sales_1[columns], df_sales_2[columns]], ignore_index=True)

# Filter the DataFrame
df_sales = df_sales[
    ((df_sales['Financial Status'] == 'Charge') | (df_sales['Financial Status'] == 'Charged')) &
    (df_sales['Product ID'] == 'com.vansteinengroentjes.apps.ddfive')
      ]
df_sales['Year'] = df_sales['Order Charged Date'].astype(str).str[:4]
df_sales['Month'] = df_sales['Order Charged Date'].astype(str).str[5:7]
df_sales['Period'] = df_sales[['Year', 'Month']].apply(lambda x: '-'.join(x), axis=1)
df_sales
# dfs_sales_1[0]['Currency Conversion Rate'].groupby([dfs_sales_1[0]['Buyer Currency']]).count()

Unnamed: 0,Order Number,Order Charged Date,Financial Status,Product ID,Product Title,SKU ID,Postal Code of Buyer,Charged Amount,Year,Month,Period
4,GPA.3370-7096-7934-01916,2021-06-01,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,62011,4.49,2021,06,2021-06
6,GPA.3301-2849-0660-49349,2021-06-01,Charge,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,55320,2.86,2021,06,2021-06
8,GPA.3372-1497-1097-13226,2021-06-02,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,54220,4.51,2021,06,2021-06
10,GPA.3397-6490-8608-67650,2021-06-02,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,78250,4.5,2021,06,2021-06
12,GPA.3378-4840-7906-77859,2021-06-02,Charge,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,74830,2.86,2021,06,2021-06
...,...,...,...,...,...,...,...,...,...,...,...
3482,GPA.3358-8615-1171-80349,2021-12-31,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,98856,3.8,2021,12,2021-12
3483,GPA.3397-7418-5342-99790,2021-12-31,Charged,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,98856,5.97,2021,12,2021-12
3484,GPA.3333-5146-4957-35294,2021-12-31,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,,2.59,2021,12,2021-12
3485,GPA.3306-0097-9714-42420,2021-12-31,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,83401,3.49,2021,12,2021-12


Ratings

In [215]:

csv_files_ratings = ['data/stats_ratings_202106_country.csv',
                        'data/stats_ratings_202107_country.csv',
                        'data/stats_ratings_202108_country.csv',
                        'data/stats_ratings_202109_country.csv',
                        'data/stats_ratings_202110_country.csv',
                        'data/stats_ratings_202111_country.csv',
                        'data/stats_ratings_202112_country.csv']


dfs_ratings = [pd.read_csv(file, encoding='utf-16') for file in csv_files_ratings]

df_ratings = pd.concat(dfs_ratings, ignore_index=True)

df_ratings['Date'] = pd.to_datetime(df_ratings["Date"], format='%Y-%m-%d')

df_ratings

Unnamed: 0,Date,Package Name,Country,Daily Average Rating,Total Average Rating
0,2021-06-01,com.vansteinengroentjes.apps.ddfive,AR,,4.20
1,2021-06-01,com.vansteinengroentjes.apps.ddfive,AT,,3.91
2,2021-06-01,com.vansteinengroentjes.apps.ddfive,AU,,4.19
3,2021-06-01,com.vansteinengroentjes.apps.ddfive,BA,,5.00
4,2021-06-01,com.vansteinengroentjes.apps.ddfive,BD,,5.00
...,...,...,...,...,...
18612,2021-12-31,com.vansteinengroentjes.apps.ddfive,US,,4.04
18613,2021-12-31,com.vansteinengroentjes.apps.ddfive,UY,,4.86
18614,2021-12-31,com.vansteinengroentjes.apps.ddfive,VE,,4.00
18615,2021-12-31,com.vansteinengroentjes.apps.ddfive,VN,,5.00


[10p] Sales Volume: Visualize the sales over time (for example, per month or per day) in 
terms of at least two measures. For example: real money (Amount) and transaction count 
(row count). 

In [307]:
df_sales_monthly = df_sales[['Order Charged Date', 'Charged Amount']].groupby(pd.Grouper(key='Order Charged Date', freq="ME")).sum()
# df_sales_monthly
fig = figure(background_fill_color='white',
             border_fill_color='white',
             height=300,
             width=500,
             x_axis_label='Months',
             x_axis_type='datetime',
             y_axis_label='Sales',
             y_axis_location='left',
             title='Sales',
             title_location='right',
             toolbar_location='below',
             tools='save')

fig.vbar(x='Order Charged Date', top='Charged Amount', 
         color='Red', width=100,
         source=df_sales_monthly)

show(fig)

[15p] Attribute Segmentation and Filtering: Present sales volume (as above) segmented per 
attribute: at least the SKU id (in-app purchase option) attribute should be included, but you 
can also think of the day of the week, time of the day or the country of the customer. 

In [319]:
df_sales_monthly = df_sales[['SKU ID', 'Charged Amount']].groupby(['SKU ID']).count()
df_sales_monthly
# fig = figure(background_fill_color='white',
#              border_fill_color='white',
#              height=300,
#              width=500,
#              x_axis_label='Months',
#              x_axis_type='datetime',
#              y_axis_label='Sales',
#              y_axis_location='left',
#              title='Sales',
#              title_location='right',
#              toolbar_location='below',
#              tools='save')

# fig.vbar(x='Order Charged Date', top='Charged Amount', 
#          color='Red', width=100,
#          source=df_sales_monthly)

# show(fig)

Unnamed: 0_level_0,Charged Amount
SKU ID,Unnamed: 1_level_1
premium,710
unlockcharactermanager,992


[15p] Ratings vs Stability: Can you come up with some Key Performance Indicators (metrics 
and scores) that help management understand how the app is doing in terms of stability and 
user satisfaction? Visualize them in a nice way. For example, the number of crashes in 
correlation with the daily average rating. 

[10p] Geographical Development: visualize the sales volume (as above) and the average 
rating per country in a geographical setting (using the geopandas package, see more 
information below) , for example the number of customers per country over time. The goal is 
again to give management as much geographic insight as possible.