Import the required libraries

In [369]:
import pandas as pd
import numpy as np
import geopandas as gpd
import json
import pycountry
import datetime
from sklearn.linear_model import LinearRegression
from bokeh.io import output_notebook, curdoc, output_file
from bokeh.plotting import figure, show
from bokeh.models.layouts import TabPanel, Tabs
from bokeh.layouts import gridplot, column, row
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, LinearAxis, FactorRange, GeoJSONDataSource
from bokeh.models import LinearColorMapper, ColorBar, Range1d, Select, FixedTicker, DatetimeTickFormatter, LogColorMapper, NumeralTickFormatter, DataTable, TableColumn
from bokeh.palettes import brewer
from bokeh.models.tools import HoverTool


Render Bokeh output inline in the Jupyter notebook

In [370]:
# Configure Bokeh so that the show() calls in the cells below display in the notebook.
output_notebook()

Opening and preprocessing the datasets

Concatenate the monthly crash reports and parse the `Date` column as a datetime

Crashes

In [371]:
# Monthly crash/ANR overview exports, one CSV per month (2021-06 .. 2021-12).
csv_files_crashes = [
    'data/stats_crashes_202106_overview.csv',
    'data/stats_crashes_202107_overview.csv',
    'data/stats_crashes_202108_overview.csv',
    'data/stats_crashes_202109_overview.csv',
    'data/stats_crashes_202110_overview.csv',
    'data/stats_crashes_202111_overview.csv',
    'data/stats_crashes_202112_overview.csv',
]

# The overview exports are read as UTF-16; each file becomes one frame.
dfs_crashes = [pd.read_csv(csv_path, encoding='utf-16') for csv_path in csv_files_crashes]

# Stack the months, parse the ISO dates and derive the month number used for grouping.
df_crashes = (
    pd.concat(dfs_crashes, ignore_index=True)
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"], format='%Y-%m-%d'),
        Month=lambda frame: frame['Date'].dt.month,
    )
)
df_crashes

Unnamed: 0,Date,Package Name,Daily Crashes,Daily ANRs,Month
0,2021-06-01,com.vansteinengroentjes.apps.ddfive,15,1,6
1,2021-06-02,com.vansteinengroentjes.apps.ddfive,12,1,6
2,2021-06-03,com.vansteinengroentjes.apps.ddfive,20,1,6
3,2021-06-04,com.vansteinengroentjes.apps.ddfive,13,0,6
4,2021-06-05,com.vansteinengroentjes.apps.ddfive,14,0,6
...,...,...,...,...,...
209,2021-12-27,com.vansteinengroentjes.apps.ddfive,64,0,12
210,2021-12-28,com.vansteinengroentjes.apps.ddfive,60,0,12
211,2021-12-29,com.vansteinengroentjes.apps.ddfive,37,0,12
212,2021-12-30,com.vansteinengroentjes.apps.ddfive,46,1,12


Sales

In [372]:

# Sales reports come in two layouts:
#   * 2021-06 .. 2021-10 ("layout 1"): 'Transaction Date' / 'Transaction Time',
#     'Amount (Buyer Currency)', and a per-row 'Currency Conversion Rate'.
#   * 2021-11 .. 2021-12 ("layout 2"): 'Order Charged Date' / 'Order Charged Timestamp',
#     'Charged Amount'; the conversion rate is merged in below.
# Both are normalised to the layout-2 column names and stacked into `df_sales`.
csv_files_sales_1 = ['data/sales_202106.csv',
                     'data/sales_202107.csv',
                     'data/sales_202108.csv',
                     'data/sales_202109.csv',
                     'data/sales_202110.csv']

csv_files_sales_2 = ['data/sales_202111.csv',
                     'data/sales_202112.csv']


def _parse_amount(series):
    """Return `series` as floats rounded to 2 decimals.

    Thousands separators (',') are stripped from string values *before* the
    numeric conversion, so values such as '1,234.50' no longer make the cast fail.
    Non-string values (floats, NaN, None) are passed through unchanged.
    """
    without_separators = series.map(
        lambda value: value.replace(',', '') if isinstance(value, str) else value
    )
    return pd.to_numeric(without_separators).astype(float).round(2)


dfs_sales_1 = [pd.read_csv(file, encoding='utf-8') for file in csv_files_sales_1]
dfs_sales_2 = [pd.read_csv(file, encoding='utf-8') for file in csv_files_sales_2]

df_sales_1 = pd.concat(dfs_sales_1, ignore_index=True)
df_sales_2 = pd.concat(dfs_sales_2, ignore_index=True)

# Mean conversion rate per buyer currency, taken from the layout-1 months.
df_sales_conversion = df_sales_1.groupby('Buyer Currency')["Currency Conversion Rate"].mean()

df_sales_1['Transaction Date'] = pd.to_datetime(df_sales_1['Transaction Date'], format='%b %d, %Y')
df_sales_2['Order Charged Date'] = pd.to_datetime(df_sales_2['Order Charged Date'], format='%Y-%m-%d')

# NOTE(review): this is an inner merge, so layout-2 rows whose currency never
# occurs in the layout-1 months are dropped from every later chart. Confirm that
# this is intended before relying on the November/December transaction counts.
df_sales_2 = df_sales_2.merge(df_sales_conversion, left_on='Currency of Sale', right_on='Buyer Currency')

# Hour of day. Layout 1 stores a 12-hour clock string; only the clock part is kept.
# NOTE(review): any time-zone suffix in 'Transaction Time' is discarded here, while
# the layout-2 epoch timestamps are decoded without a zone (i.e. as UTC). The two
# 'Hour of Day' columns may therefore be in different time zones; verify.
df_sales_1['Transaction Time'] = df_sales_1['Transaction Time'].str.extract(
    r'(\d{1,2}:\d{2}:\d{2} [APM]{2})', expand=False
)
df_sales_1['Transaction Time'] = pd.to_datetime(df_sales_1['Transaction Time'], format='%I:%M:%S %p')
df_sales_1['Hour of Day'] = df_sales_1['Transaction Time'].dt.hour

df_sales_2['Order Charged Timestamp'] = pd.to_datetime(df_sales_2['Order Charged Timestamp'], unit='s')
df_sales_2['Hour of Day'] = df_sales_2['Order Charged Timestamp'].dt.hour

# Amounts in buyer currency, plus the layout-2 amount converted to merchant currency.
df_sales_2['Charged Amount'] = _parse_amount(df_sales_2['Charged Amount'])
df_sales_1['Amount (Buyer Currency)'] = _parse_amount(df_sales_1['Amount (Buyer Currency)'])
df_sales_2['Amount (Merchant Currency)'] = (
    df_sales_2['Charged Amount'] * df_sales_2['Currency Conversion Rate']
).round(2)

# Bring layout 1 onto the layout-2 column names.
df_sales_1 = df_sales_1.rename(columns={'Description': 'Order Number',
                                        'Transaction Date': 'Order Charged Date',
                                        'Transaction Type': 'Financial Status',
                                        'Product id': 'Product ID',
                                        'Sku Id': 'SKU ID',
                                        'Buyer Currency': 'Currency of Sale',
                                        'Buyer Country': 'Country of Buyer',
                                        'Buyer Postal Code': 'Postal Code of Buyer',
                                        'Amount (Buyer Currency)': 'Charged Amount'})

columns = ['Order Number', 'Order Charged Date', 'Hour of Day', 'Financial Status',
           'Product ID', 'Product Title', 'SKU ID', 'Country of Buyer',
           'Postal Code of Buyer', 'Charged Amount', 'Currency of Sale',
           "Currency Conversion Rate", 'Amount (Merchant Currency)']

df_sales = pd.concat([df_sales_1[columns], df_sales_2[columns]], ignore_index=True)

# Keep successful charges for this app only. The two layouts spell the status
# differently ('Charge' vs 'Charged').
is_charge = df_sales['Financial Status'].isin(['Charge', 'Charged'])
is_this_app = df_sales['Product ID'] == 'com.vansteinengroentjes.apps.ddfive'
# .copy() makes df_sales an independent frame, so the column assignments below
# (and in later cells) do not raise SettingWithCopyWarning on a filtered slice.
df_sales = df_sales[is_charge & is_this_app].copy()

df_sales['Year'] = df_sales['Order Charged Date'].dt.year
df_sales['Month'] = df_sales['Order Charged Date'].dt.month
df_sales



Unnamed: 0,Order Number,Order Charged Date,Hour of Day,Financial Status,Product ID,Product Title,SKU ID,Country of Buyer,Postal Code of Buyer,Charged Amount,Currency of Sale,Currency Conversion Rate,Amount (Merchant Currency),Year,Month
4,GPA.3370-7096-7934-01916,2021-06-01,17,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,US,62011,5.49,USD,0.818700,4.49,2021,6
6,GPA.3301-2849-0660-49349,2021-06-01,22,Charge,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,US,55320,3.49,USD,0.818250,2.86,2021,6
8,GPA.3372-1497-1097-13226,2021-06-02,7,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,US,54220,5.49,USD,0.820650,4.51,2021,6
10,GPA.3397-6490-8608-67650,2021-06-02,9,Charge,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,US,78250,5.49,USD,0.819250,4.50,2021,6
12,GPA.3378-4840-7906-77859,2021-06-02,10,Charge,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,US,74830,3.49,USD,0.818750,2.86,2021,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,GPA.3358-8615-1171-80349,2021-12-31,13,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,US,98856,3.80,USD,0.847129,3.22,2021,12
3475,GPA.3397-7418-5342-99790,2021-12-31,13,Charged,com.vansteinengroentjes.apps.ddfive,Character Manager (Complete Reference for DnD 5),unlockcharactermanager,US,98856,5.97,USD,0.847129,5.06,2021,12
3476,GPA.3333-5146-4957-35294,2021-12-31,19,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,GB,,2.59,GBP,1.170363,3.03,2021,12
3477,GPA.3306-0097-9714-42420,2021-12-31,22,Charged,com.vansteinengroentjes.apps.ddfive,DM Tools (Complete Reference for DnD 5),premium,US,83401,3.49,USD,0.847129,2.96,2021,12


Ratings

In [373]:

# Per-country rating exports, one CSV per month (2021-06 .. 2021-12).
csv_files_ratings = [
    'data/stats_ratings_202106_country.csv',
    'data/stats_ratings_202107_country.csv',
    'data/stats_ratings_202108_country.csv',
    'data/stats_ratings_202109_country.csv',
    'data/stats_ratings_202110_country.csv',
    'data/stats_ratings_202111_country.csv',
    'data/stats_ratings_202112_country.csv',
]

# The rating exports are read as UTF-16; each file becomes one frame.
dfs_ratings = [pd.read_csv(csv_path, encoding='utf-16') for csv_path in csv_files_ratings]

# Stack the months, parse the ISO dates and derive the month number used for grouping.
df_ratings = (
    pd.concat(dfs_ratings, ignore_index=True)
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"], format='%Y-%m-%d'),
        Month=lambda frame: frame['Date'].dt.month,
    )
)

df_ratings


Unnamed: 0,Date,Package Name,Country,Daily Average Rating,Total Average Rating,Month
0,2021-06-01,com.vansteinengroentjes.apps.ddfive,AR,,4.20,6
1,2021-06-01,com.vansteinengroentjes.apps.ddfive,AT,,3.91,6
2,2021-06-01,com.vansteinengroentjes.apps.ddfive,AU,,4.19,6
3,2021-06-01,com.vansteinengroentjes.apps.ddfive,BA,,5.00,6
4,2021-06-01,com.vansteinengroentjes.apps.ddfive,BD,,5.00,6
...,...,...,...,...,...,...
18612,2021-12-31,com.vansteinengroentjes.apps.ddfive,US,,4.04,12
18613,2021-12-31,com.vansteinengroentjes.apps.ddfive,UY,,4.86,12
18614,2021-12-31,com.vansteinengroentjes.apps.ddfive,VE,,4.00,12
18615,2021-12-31,com.vansteinengroentjes.apps.ddfive,VN,,5.00,12


[10p] Sales Volume: Visualize the sales over time (for example, per month or per day) in 
terms of at least two measures. For example: real money (Amount) and transaction count 
(row count). 

In [374]:
# Revenue (merchant currency) and number of orders per calendar month.
sales_grouped_by_month = df_sales.groupby('Month')
df_sales_monthly = sales_grouped_by_month[['Amount (Merchant Currency)']].sum().reset_index()
df_transactions_monthly = sales_grouped_by_month[['Order Number']].count().reset_index()

# The two frames only share the 'Month' key, so the suffixes never get applied.
df_combined_sales = pd.merge(
    df_sales_monthly,
    df_transactions_monthly,
    on='Month',
    suffixes=('_sales', '_transactions'),
)
source = ColumnDataSource(df_combined_sales)

monthly_sales_fig = figure(
    title='Monthly Sales & Transactions',
    x_axis_label='Months',
    y_axis_label='Sales (Amount)',
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

# Second y-axis (right-hand side) for the transaction counts, with 20 % headroom.
highest_transaction_count = df_combined_sales['Order Number'].max()
monthly_sales_fig.extra_y_ranges = {
    'transactions': Range1d(start=0, end=highest_transaction_count * 1.2)
}
monthly_sales_fig.add_layout(
    LinearAxis(y_range_name='transactions', axis_label='Number of Transactions'),
    'right',
)

# Revenue is drawn against the default (left) axis ...
monthly_sales_fig.line(
    'Month', 'Amount (Merchant Currency)',
    source=source, color='red', line_width=2,
    legend_label='Sales Amount',
)
# ... and the order count against the extra range defined above.
monthly_sales_fig.line(
    'Month', 'Order Number',
    source=source, color='blue', line_width=2,
    y_range_name='transactions',
    legend_label='Transactions',
)

# Place the legend in the right-hand panel and leave room for it.
monthly_sales_fig.add_layout(monthly_sales_fig.legend[0], 'right')
monthly_sales_fig.legend.label_text_font_size = '8pt'
monthly_sales_fig.min_border_right = 100

show(monthly_sales_fig)


[15p] Attribute Segmentation and Filtering: Present sales volume (as above) segmented per 
attribute: at least the SKU id (in-app purchase option) attribute should be included, but you 
can also think of the day of the week, time of the day or the country of the customer. 

In [375]:
def _monthly_revenue_for_sku(sku_id):
    """Sum 'Amount (Merchant Currency)' per month for the rows of `df_sales` with this SKU ID."""
    sku_rows = df_sales.loc[df_sales['SKU ID'] == sku_id]
    return sku_rows.groupby(['Month'])[['Amount (Merchant Currency)']].sum().reset_index()


# One monthly revenue series per in-app purchase option.
df_sales_premium = _monthly_revenue_for_sku('premium')
df_sales_ucm = _monthly_revenue_for_sku('unlockcharactermanager')

source_premium = ColumnDataSource(df_sales_premium)
source_ucm = ColumnDataSource(df_sales_ucm)

sales_by_sku_fig = figure(
    title='Sales by SKU',
    x_axis_label='Months',
    y_axis_label='Sales (Amount)',
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

# Same glyph for both SKUs; only the data source, colour and legend entry differ.
for sku_source, sku_color, sku_label in (
    (source_premium, 'red', 'Premium'),
    (source_ucm, 'blue', 'Unlock Character Manager'),
):
    sales_by_sku_fig.line(
        'Month', 'Amount (Merchant Currency)',
        source=sku_source, color=sku_color,
        legend_label=sku_label, line_width=2,
    )

# Place the legend in the right-hand panel and leave room for it.
sales_by_sku_fig.add_layout(sales_by_sku_fig.legend[0], 'right')
sales_by_sku_fig.legend.label_text_font_size = '8pt'
sales_by_sku_fig.min_border_right = 150

show(sales_by_sku_fig)

In [376]:
# Total revenue per buyer country, largest first.
df_sales_by_country = (
    df_sales
    .groupby(['Country of Buyer'])[['Amount (Merchant Currency)']]
    .sum()
    .reset_index()
    .sort_values(['Amount (Merchant Currency)'], ascending=False)
)

df_top_15_countries = df_sales_by_country.head(15)
source_country = ColumnDataSource(df_top_15_countries)

# Categorical ranges are laid out bottom-to-top, so the order is reversed to put
# the best-selling country at the top of the chart.
countries_bottom_to_top = df_top_15_countries['Country of Buyer'].tolist()[::-1]

sales_by_country_fig = figure(
    title='Top 15 Countries by Sales',
    x_axis_label='Sales Amount (Merchant Currency)',
    y_axis_label='Country',
    y_range=countries_bottom_to_top,
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

sales_by_country_fig.hbar(
    source=source_country,
    y='Country of Buyer',
    left=0,
    right='Amount (Merchant Currency)',
    height=0.8,
    color='red',
)

# Column names containing spaces need the @{...} form in tooltips.
hover = HoverTool(
    tooltips=[
        ("Country", "@{Country of Buyer}"),
        ("Sales", "@{Amount (Merchant Currency)}"),
    ]
)
sales_by_country_fig.add_tools(hover)

show(sales_by_country_fig)


In [377]:
# ---- Revenue per weekday -------------------------------------------------
days_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

df_sales['Day of Week'] = df_sales['Order Charged Date'].dt.day_name()
df_sales_by_day = df_sales.groupby(['Day of Week'])[['Amount (Merchant Currency)']].sum().reset_index()
# An ordered categorical makes sort_values follow the calendar order instead of the alphabet.
df_sales_by_day['Day of Week'] = pd.Categorical(
    df_sales_by_day['Day of Week'], categories=days_order, ordered=True
)
df_sales_by_day = df_sales_by_day.sort_values('Day of Week')

# ---- Revenue per hour of day ---------------------------------------------
df_sales_by_hour = df_sales.groupby(['Hour of Day'])[['Amount (Merchant Currency)']].sum().reset_index()
# 24 one-hour bins [0, 1), [1, 2), ... ; weighting by revenue turns the histogram into a revenue-per-hour profile.
bins = np.arange(0, 25, 1)
hist_values, edges = np.histogram(
    df_sales_by_hour['Hour of Day'],
    bins=bins,
    weights=df_sales_by_hour['Amount (Merchant Currency)'],
)

source = ColumnDataSource(data={'left': edges[:-1], 'right': edges[1:], 'top': hist_values})
source_country = ColumnDataSource(df_sales_by_country)
source_day = ColumnDataSource(df_sales_by_day)
source_hour = ColumnDataSource(df_sales_by_hour)

# ---- Figures ---------------------------------------------------------------
highest_day_revenue = df_sales_by_day['Amount (Merchant Currency)'].max()
sales_by_day_fig = figure(
    title='Sales by day of the week',
    title_location='above',
    x_axis_label='Day of the week',
    x_range=df_sales_by_day['Day of Week'].tolist(),
    y_axis_label='Sales',
    y_axis_location='left',
    y_range=(0, highest_day_revenue * 1.1),
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

highest_hour_revenue = df_sales_by_hour['Amount (Merchant Currency)'].max()
sales_by_hour_fig = figure(
    title='Sales by hour of the day',
    title_location='above',
    x_axis_label='Hour of the day',
    x_range=(0, 24),
    y_axis_label='Sales',
    y_axis_location='left',
    y_range=(0, highest_hour_revenue * 1.1),
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

sales_by_day_fig.vbar(
    source=source_day,
    x='Day of Week', top='Amount (Merchant Currency)',
    width=0.8, color='Red',
)

# One tick per bin edge: 0, 1, ..., 24.
sales_by_hour_fig.xaxis.ticker = FixedTicker(ticks=list(range(0, 25)))
sales_by_hour_fig.quad(
    source=source,
    left='left', right='right', bottom=0, top='top',
    color='red', line_color='white',
)

show(sales_by_day_fig)
show(sales_by_hour_fig)


[15p] Ratings vs Stability: Can you come up with some Key Performance Indicators (metrics 
and scores) that help management understand how the app is doing in terms of stability and 
user satisfaction? Visualize them in a nice way. For example, the number of crashes in 
correlation with the daily average rating. 

In [378]:
# Make sure 'Date' is a datetime column before it is used on a datetime axis.
df_crashes['Date'] = pd.to_datetime(df_crashes['Date'])

# Total crashes per calendar day.
df_crashes_by_date = df_crashes.groupby('Date')[['Daily Crashes']].sum().reset_index()
source = ColumnDataSource(df_crashes_by_date)

crashes_by_date_fig = figure(
    title='Daily Crashes Over Time',
    title_location='right',
    x_axis_type='datetime',
    x_axis_label='Date',
    y_axis_label='Daily Crashes',
    y_axis_location='left',
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='pan,wheel_zoom,box_zoom,reset,save',
    toolbar_location='below',
)

crashes_by_date_fig.line(
    source=source,
    x='Date', y='Daily Crashes',
    color='red', line_width=2,
    legend_label='Daily Crashes',
)

# 'vline' mode reports the point at the cursor's x position; the formatter entry
# lets {%F} render the datetime column as YYYY-MM-DD.
hover = HoverTool(
    tooltips=[
        ('Date', '@Date{%F}'),
        ('Daily Crashes', '@{Daily Crashes}'),
    ],
    formatters={'@Date': 'datetime'},
    mode='vline',
)
crashes_by_date_fig.add_tools(hover)

crashes_by_date_fig.legend.location = "top_left"
crashes_by_date_fig.xaxis.formatter = DatetimeTickFormatter(days='%b %d, %Y', months='%b %Y')

show(crashes_by_date_fig)

In [379]:
# Join crashes and ratings on the calendar date.
# The join works on local copies with a datetime 'Date' column, so neither
# df_crashes nor df_ratings is modified. (Previously both frames had their 'Date'
# column rewritten to strings in place, which changed the data seen by every
# other cell depending on execution order.)
crashes_for_merge = df_crashes.assign(Date=pd.to_datetime(df_crashes['Date']))
ratings_for_merge = df_ratings.assign(Date=pd.to_datetime(df_ratings['Date']))

merged_df = pd.merge(crashes_for_merge, ratings_for_merge, on='Date', how='inner')
# Only rows with both a crash count and a daily rating can be correlated.
# After this step there is one row per (date, country) that has a daily rating.
merged_df = merged_df.dropna(subset=['Daily Crashes', 'Daily Average Rating'])

avg_crashes = merged_df['Daily Crashes'].mean()
avg_anrs = merged_df['Daily ANRs'].mean()

# Pearson correlation plus a least-squares trend line (rating as a function of crashes).
correlation = merged_df[['Daily Crashes', 'Daily Average Rating']].corr().iloc[0, 1]
X = merged_df[['Daily Crashes']]
y = merged_df['Daily Average Rating']
reg = LinearRegression().fit(X, y)
merged_df['Regression Line'] = reg.predict(X)

# Simple KPI: rating discounted by the number of crashes on that day.
merged_df['Stability Score'] = merged_df['Daily Average Rating'] / (1 + merged_df['Daily Crashes'])

# The trend line is drawn along the x variable, so order its points by x
# (they were previously ordered by rating, i.e. by the y variable).
sorted_df = merged_df.sort_values(by='Daily Crashes')

source = ColumnDataSource(merged_df)
regression_source = ColumnDataSource(sorted_df)

p1 = figure(title=f'Correlation between Crashes and Ratings: {correlation:.2f}',
            x_axis_label='Daily Crashes', y_axis_label='Daily Average Rating',
            tools='pan,wheel_zoom,box_zoom,reset,save', width=900, height=400)

p1.scatter('Daily Crashes', 'Daily Average Rating', source=source,
           size=8, color='navy', alpha=0.6, legend_label="Data Points")

p1.line('Daily Crashes', 'Regression Line', source=regression_source,
        line_width=2, color='red', legend_label="Trend Line")

show(column(p1))



In [380]:
# Monthly KPI frame: mean daily rating and total crashes per 'YYYY-MM' month.
merged_df['Month'] = merged_df['Date'].dt.strftime('%Y-%m')
monthly_ratings = merged_df.groupby('Month')[['Daily Average Rating']].mean()

# Crash totals are taken from df_crashes, not from merged_df: merged_df repeats a
# day's crash count once for every country that has a rating on that day and
# contains no rows for days without ratings, so summing it gives wrong totals.
# pd.to_datetime accepts the column whether it currently holds datetimes or strings.
crash_months = pd.to_datetime(df_crashes['Date']).dt.strftime('%Y-%m').rename('Month')
monthly_crashes = df_crashes.groupby(crash_months)[['Daily Crashes']].sum()

# Left join keeps exactly the months that have rating data.
monthly_df = monthly_ratings.join(monthly_crashes, how='left').reset_index()

source = ColumnDataSource(monthly_df)

fig_crashes_ratings = figure(
    x_range=monthly_df['Month'].tolist(),
    x_axis_label='Month',
    y_axis_label='Average Rating',
    height=400, width=900,
    title='Monthly Average Rating and Total Crashes',
    toolbar_location='above'
)

# Left axis: rating. Right axis: crash totals with 20 % headroom.
fig_crashes_ratings.y_range = Range1d(0, 5.5)
fig_crashes_ratings.extra_y_ranges = {"crashes": Range1d(0, monthly_df['Daily Crashes'].max() * 1.2)}
fig_crashes_ratings.add_layout(LinearAxis(y_range_name="crashes", axis_label='Total Crashes'), 'right')

fig_crashes_ratings.line(
    x='Month', y='Daily Average Rating', source=source,
    line_width=2, color='navy', legend_label='Avg Rating'
)

fig_crashes_ratings.vbar(
    x='Month', top='Daily Crashes', source=source,
    width=0.7, color='firebrick', y_range_name='crashes',
    alpha=0.7, legend_label='Total Crashes'
)

fig_crashes_ratings.add_tools(HoverTool(
    tooltips=[('Month', '@Month'),
              ('Avg Rating', '@{Daily Average Rating}{0.00}'),
              ('Total Crashes', '@{Daily Crashes}')]
))

fig_crashes_ratings.legend.location = 'top_left'
fig_crashes_ratings.min_border_right = 80
fig_crashes_ratings.min_border_left = 80
fig_crashes_ratings.xaxis.major_label_orientation = 0.75

show(fig_crashes_ratings)

In [381]:
# Stability score per calendar day.
# merged_df has one row per (date, country), so plotting it directly as a line
# puts several points on the same date and the line zigzags vertically.
# Collapse to one row per date first:
#   * rating  -> unweighted mean of the country-level daily ratings
#   * crashes -> mean over the date's rows
#     NOTE(review): presumably the crash count is identical on all rows of a
#     date (one crash row per day), in which case the mean is that value; verify.
daily_stability = (
    merged_df
    .groupby('Date', as_index=False)
    .agg({'Daily Average Rating': 'mean', 'Daily Crashes': 'mean'})
    .sort_values('Date')
)
daily_stability['Stability Score'] = (
    daily_stability['Daily Average Rating'] / (1 + daily_stability['Daily Crashes'])
)
source = ColumnDataSource(daily_stability)

stability_fig = figure(
    title="Stability Score Over Time",
    x_axis_label='Date',
    y_axis_label='Stability Score',
    x_axis_type='datetime',
    height=400, width=900,
    tools='pan,wheel_zoom,box_zoom,reset,save'
)

stability_fig.line(x='Date', y='Stability Score', source=source, line_width=2, color='Red')

# {%F} with the 'datetime' formatter renders the date as YYYY-MM-DD.
hover = HoverTool(tooltips=[
    ('Date', '@Date{%F}'),
    ('Stability Score', '@{Stability Score}{0.00}')
], formatters={'@Date': 'datetime'}, mode='vline')

stability_fig.add_tools(hover)
stability_fig.xaxis.formatter = DatetimeTickFormatter(days='%b %d, %Y', months='%b %Y')

show(stability_fig)


In [382]:
# Mean of 'Total Average Rating' over all country/day rows of each month.
# The month is converted to a string afterwards so it can serve as a categorical axis.
df_ratings_by_month = (
    df_ratings
    .groupby('Month')[['Total Average Rating']]
    .mean()
    .reset_index()
    .sort_values('Month')
)
df_ratings_by_month['Month'] = df_ratings_by_month['Month'].astype(str)

source = ColumnDataSource(df_ratings_by_month)

ratings_by_month_fig = figure(
    title='Average Rating by Month',
    title_location='right',
    x_axis_label='Month',
    x_range=df_ratings_by_month['Month'].tolist(),
    y_axis_label='Average Rating',
    y_axis_location='left',
    width=900,
    height=400,
    background_fill_color='white',
    border_fill_color='white',
    tools='save',
    toolbar_location='below',
)

ratings_by_month_fig.vbar(
    source=source,
    x='Month',
    top='Total Average Rating',
    width=0.8,
    color='red',
)

show(ratings_by_month_fig)

In [383]:
# Table: average rating and total revenue per country, highest revenue first.

# NOTE(review): revenue is in the merchant currency. The conversion rates in the
# sales data (USD ~0.82, GBP ~1.17) show that this is not USD, so the former "$"
# prefix was wrong. It is presumably EUR; confirm and adjust the symbol if needed.
MERCHANT_CURRENCY_SYMBOL = '€'

df_ratings_by_country = df_ratings.groupby('Country')[['Total Average Rating']].mean().reset_index()

# rename() returns a new frame; nothing is modified in place.
df_sales_by_country = (
    df_sales
    .groupby(['Country of Buyer'])[['Amount (Merchant Currency)']]
    .sum()
    .reset_index()
    .rename(columns={'Country of Buyer': 'Country', 'Amount (Merchant Currency)': 'Total Revenue'})
)

# Left join: every rated country is listed; countries without sales get revenue 0.
df_combined = df_ratings_by_country.merge(df_sales_by_country, on='Country', how='left')
df_combined['Total Revenue'] = df_combined['Total Revenue'].fillna(0)
df_combined['Total Revenue (Formatted)'] = df_combined['Total Revenue'].map(
    lambda amount: f"{MERCHANT_CURRENCY_SYMBOL}{amount:,.2f}"
)

source_ratings_table = ColumnDataSource(df_combined.sort_values('Total Revenue', ascending=False))

# Named separately from the `columns` list of the sales cell, which holds plain
# column names rather than TableColumn objects.
ratings_table_columns = [
    TableColumn(field="Country", title="Country"),
    TableColumn(field="Total Average Rating", title="Average Rating"),
    TableColumn(field="Total Revenue (Formatted)", title="Total Revenue")
]

ratings_table = DataTable(source=source_ratings_table, columns=ratings_table_columns, width=900, height=400)

layout = column(ratings_table)

show(layout)

[10p] Geographical Development: visualize the sales volume (as above) and the average 
rating per country in a geographical setting (using the geopandas package, see more 
information below), for example the number of customers per country over time. The goal is 
again to give management as much geographic insight as possible.

In [384]:
# World country outlines; keep the name, the three-letter code and the geometry only.
shapefile = 'data/ne_110m_admin_0_countries.shp'
gdf = (
    gpd.read_file(shapefile)[['ADMIN', 'ADM0_A3', 'geometry']]
    .rename(columns={'ADMIN': 'country', 'ADM0_A3': 'country_code'})
)
# NOTE(review): row 159 is removed by position; presumably this is Antarctica in
# this particular shapefile. Confirm, since a positional drop breaks silently if
# the file is updated.
gdf = gdf.drop(gdf.index[159])

# Revenue per (month, buyer country) and mean total rating per (month, country).
df_sales_per_country = (
    df_sales
    .groupby(['Month', 'Country of Buyer'], as_index=False)[['Amount (Merchant Currency)']]
    .sum()
)
df_ratings_monthly = (
    df_ratings
    .groupby(['Month', 'Country'], as_index=False)[['Total Average Rating']]
    .mean()
)
df_sales_monthly['Month'] = df_sales_monthly['Month'].astype(int)

def convert_country_code(alpha_2):
    """Translate a two-letter country code into its three-letter code.

    Parameters
    ----------
    alpha_2 : str
        Two-letter country code as used in the sales and ratings data
        (looked up through ``pycountry.countries.get(alpha_2=...)``).

    Returns
    -------
    str or None
        The ``alpha_3`` attribute of the matching pycountry record, or ``None``
        when the input is not a string (e.g. NaN / None from missing data) or
        no record matches.
    """
    # Missing values arrive as float NaN or None; answer them without calling
    # pycountry, so the result does not depend on how it treats non-strings.
    if not isinstance(alpha_2, str):
        return None
    country = pycountry.countries.get(alpha_2=alpha_2)
    return country.alpha_3 if country is not None else None

# Add a three-letter 'country_code' column to both monthly frames; gdf uses the
# same column name, so it can serve as the merge key. Codes that
# convert_country_code cannot resolve become None.
df_sales_per_country['country_code'] = df_sales_per_country['Country of Buyer'].apply(convert_country_code)
df_ratings_monthly['country_code'] = df_ratings_monthly['Country'].apply(convert_country_code)

def get_dataset(month):
    """Return the country outlines joined with the sales of one month.

    Reads the module-level ``gdf`` and ``df_sales_per_country``. Every row of
    ``gdf`` is kept (left join on ``country_code``); countries without sales in
    `month` get an 'Amount (Merchant Currency)' of 0.
    """
    amount_column = 'Amount (Merchant Currency)'
    sales_of_month = df_sales_per_country[df_sales_per_country['Month'] == month]
    merged = gdf.merge(sales_of_month, on="country_code", how="left")
    merged[amount_column] = merged[amount_column].fillna(0)
    return merged
    
def get_geodatasource(gdf):
    """Convert GeoDataFrame to Bokeh-compatible GeoJSONDataSource.

    ``gdf.to_json()`` already returns a GeoJSON string, so it is handed to
    ``GeoJSONDataSource`` directly (as ``bokeh_plot_map`` does) instead of being
    parsed and re-serialised first.
    """
    return GeoJSONDataSource(geojson=gdf.to_json())

# initial_month = df_sales_monthly['Month'].min()
# merged_gdf = get_dataset(initial_month)

def bokeh_plot_map(gdf, column=None, title='Choropleth Map', value_label='Sales', colorbar_format="0,0"):
    """Plot choropleth map using Bokeh.

    Parameters
    ----------
    gdf : GeoDataFrame
        One row per shape to draw. Must contain `column` and a 'country' column
        (used in the tooltip). Pass one row per country; repeated countries are
        drawn on top of each other.
    column : str
        Name of the numeric column that determines the fill colour.
    title : str
        Figure title.
    value_label : str
        Tooltip label for the value of `column`.
    colorbar_format : str
        Numeral format string for the colour bar ticks.

    Returns
    -------
    bokeh figure with the patches, a hover tool and a horizontal colour bar.

    Raises
    ------
    ValueError
        If `column` is not given.
    """
    if column is None:
        raise ValueError("bokeh_plot_map needs the name of the column to colour by")

    # A log colour scale needs limits above zero, so zero, negative and missing
    # values are ignored when picking them.
    positive_values = gdf.loc[gdf[column] > 0, column]
    if positive_values.empty:
        # Nothing to scale against; use an arbitrary valid range so the map still renders.
        low, high = 1, 10
    else:
        low, high = positive_values.min(), positive_values.max()

    color_mapper = LogColorMapper(palette='Viridis256', low=low, high=high)

    color_bar = ColorBar(
        color_mapper=color_mapper,
        label_standoff=8,
        width=500,
        height=20,
        location=(0, 0),
        orientation='horizontal',
        formatter=NumeralTickFormatter(format=colorbar_format)
    )

    p = figure(title=title, tools='wheel_zoom,pan,reset', toolbar_location='right', height=500, width=1000)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    geosource = GeoJSONDataSource(geojson=gdf.to_json())

    patches = p.patches('xs', 'ys', source=geosource, fill_alpha=1, line_width=0.5, line_color='black',
                        fill_color={'field': column, 'transform': color_mapper})

    # The doubled braces produce the literal "@{<column>}{0,0.00}" tooltip spec,
    # which is required because the column names contain spaces.
    tooltips = [
        ("Country", "@country"),
        (value_label, f"@{{{column}}}{{0,0.00}}")
    ]
    # Only show the month when the data actually carries one.
    if 'Month' in gdf.columns:
        tooltips.insert(0, ("Month", "@Month"))

    hover = HoverTool(renderers=[patches], tooltips=tooltips)
    p.add_tools(hover)

    p.add_layout(color_bar, 'below')

    return p


# The monthly frames hold one row per (month, country). Merging them onto gdf
# unchanged would produce one shape per month for every country, all drawn on top
# of each other. Collapse them to one row per country first:
#   * sales   -> total over all months
#   * ratings -> mean of the monthly means
# Rows whose country code could not be resolved (None) are dropped by groupby.
sales_total_per_country = (
    df_sales_per_country
    .groupby('country_code', as_index=False)[['Amount (Merchant Currency)']]
    .sum()
)
ratings_mean_per_country = (
    df_ratings_monthly
    .groupby('country_code', as_index=False)[['Total Average Rating']]
    .mean()
)

# Left joins keep every country outline; countries without data have a missing value.
merged_gdf_sales = gdf.merge(sales_total_per_country, on="country_code", how="left")
merged_gdf_ratings = gdf.merge(ratings_mean_per_country, on="country_code", how="left")

bokeh_map_sales = bokeh_plot_map(merged_gdf_sales, column='Amount (Merchant Currency)',
                                 title="Global Sales", value_label='Sales')
bokeh_map_ratings = bokeh_plot_map(merged_gdf_ratings, column='Total Average Rating',
                                   title="Global App Ratings", value_label='Average Rating',
                                   colorbar_format="0.0")

show(bokeh_map_sales)
show(bokeh_map_ratings)

Put it all together

In [385]:

# Assemble the figures built above into a 2 x 2 dashboard:
#   top row:    sales tabs        | stability/rating tabs
#   bottom row: choropleth tabs   | ratings and revenue table

# --- Sales tabs ---
monthly_sales_panel = TabPanel(child=monthly_sales_fig, title="Monthly Sales & Transactions")
sales_by_attribute_panel = TabPanel(child=sales_by_sku_fig, title="Sales per Attribute")
sales_by_country_panel = TabPanel(child=sales_by_country_fig, title="Sales per Country")
sales_by_day_panel = TabPanel(child=sales_by_day_fig, title="Sales per Day")
sales_by_hour_panel = TabPanel(child=sales_by_hour_fig, title="Sales per Hour")
sales_tabs = Tabs(tabs=[monthly_sales_panel, sales_by_attribute_panel, sales_by_country_panel, sales_by_day_panel, sales_by_hour_panel])

# --- Stability and rating tabs ---
# Title corrected from "Crashed per date".
crashes_panel = TabPanel(child=crashes_by_date_fig, title="Crashes per date")
# rating_panel = TabPanel(child=ratings_by_month_fig, title='AVG Rating per Month')
p1_panel = TabPanel(child=p1, title='Correlation Crashes and Ratings')
# fig_crashes_ratings is aggregated per month, so the tab is labelled "Monthly" (was "Daily").
p2_panel = TabPanel(child=fig_crashes_ratings, title='Monthly Crashes and Ratings')
stability_panel = TabPanel(child=stability_fig, title='Stability Score')
tabs = Tabs(tabs=[crashes_panel, p1_panel, p2_panel, stability_panel])

# --- Map tabs ---
map_sales_tab = TabPanel(child=bokeh_map_sales, title='Sales & Transactions')
map_ratings_tab = TabPanel(child=bokeh_map_ratings, title='Ratings')
map_tabs = Tabs(tabs=[map_sales_tab, map_ratings_tab])

dashboard_grid = column(
    row(sales_tabs, tabs),
    row(map_tabs, ratings_table))


show(dashboard_grid)