In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
# Apply seaborn's "darkgrid" style to every figure created below.
sns.set_style('darkgrid')

In [None]:
# Curated inputs, given as paths relative to this notebook.
TOP100_CSV = "../data/curated/top100.csv"
RANKING_CSV = "../data/curated/ranking.csv"

top100 = pd.read_csv(TOP100_CSV)
full = pd.read_csv(RANKING_CSV)

# Number of rows of `top100` per segment, most frequent segment first.
value_count = top100.loc[:, 'segment'].value_counts()
value_count

In [None]:
# Number of rows of `top100` per tag, most frequent tag first.
top100.loc[:, 'tags'].value_counts()

In [None]:
# Build the inner-ring sizes for the nested pie chart: one count per
# (segment, tag) pair. Segments are visited in order of first appearance in
# `top100`, and tags in order of first appearance within their segment.
tag_lst = []
for segment in top100['segment'].unique():
    segment_rows = top100.loc[top100['segment'] == segment]
    segment_tags = segment_rows['tags'].unique()
    # Count inside the segment only. Counting over the whole frame would
    # credit a tag's rows from *other* segments to this segment too, so the
    # inner ring would stop adding up to the outer (segment) ring.
    tag_lst.extend(int((segment_rows['tags'] == tag).sum()) for tag in segment_tags)
    # Show which tags belong to the segment (used to label the chart).
    print(segment, ": ", list(segment_tags))
tag_lst

In [None]:
def _shade_indices(n_shades, n_colors, start=100, max_step=30):
    """Return `n_shades` integer colormap indices starting at `start`.

    Consecutive indices are `max_step` apart when that fits inside the
    colormap's `n_colors` entries; otherwise they are spread evenly between
    `start` and the last valid index (`n_colors - 1`). Integer indices past
    the end of a colormap all resolve to the same "over" colour, so an
    unclamped ramp would give several wedges an identical shade.
    """
    stop = min(start + max_step * (n_shades - 1), n_colors - 1)
    return np.rint(np.linspace(start, stop, n_shades)).astype(int)


# One sequential colormap per segment; one shade per tag inside the segment.
# The shade counts (9, 5, 3, 2) are the number of tags in each segment.
cmap1 = plt.colormaps['Blues']
inner_colors1 = cmap1(_shade_indices(9, cmap1.N))
cmap2 = plt.colormaps['Oranges']
inner_colors2 = cmap2(_shade_indices(5, cmap2.N))
cmap3 = plt.colormaps['Greens']
inner_colors3 = cmap3(_shade_indices(3, cmap3.N))
cmap4 = plt.colormaps['Purples']
inner_colors4 = cmap4(_shade_indices(2, cmap4.N))

# Stack the RGBA rows in segment order so they line up with the inner wedges.
inner_colors = np.vstack((inner_colors1, inner_colors2, inner_colors3, inner_colors4))

In [None]:
# Nested donut chart: the outer ring shows segments, the inner ring shows the
# tags inside each segment.
fig, ax = plt.subplots(figsize=(20, 20))
size = 0.3  # radial thickness of each ring

# NOTE(review): the inner-ring labels below hard-code these counts, presumably
# a snapshot of `tag_lst`:
# [9, 8, 2, 5, 5, 5, 2, 4, 3, 14, 7, 1, 5, 3, 3, 12, 5, 5, 2]
cmap = plt.colormaps["tab20c"]
outer_colors = cmap(np.arange(4) * 4)

# --- outer ring: segments -------------------------------------------------
labels1 = [
    'Personal & Household (43%)',
    'Recreational (30%)',
    'Technical & Machinery (20%)',
    'Health Service (7%)',
]
size1 = [43, 30, 20, 7]
patches1, texts1 = ax.pie(
    size1,
    radius=1,
    colors=outer_colors,
    wedgeprops={'width': size, 'edgecolor': 'w'},
)
legend1 = ax.legend(patches1, labels1, loc='upper left', prop={'size': 20})

# --- inner ring: tags -----------------------------------------------------
labels2 = [
    'watch (9)', 'gift (8)', 'music (2)', 'florists (5)', 'artist supply (5)',
    'furniture (5)', 'antique (2)', 'garden supply(4)', 'shoe (3)',
    'tent (14)', 'digital goods (7)', 'hobby (1)', 'books (5)', 'stationery (3)',
    'cable (3)', 'computer (12)', 'motor (5)',
    'opticians (5)', 'health (2)',
]
size2 = tag_lst
patches2, texts2 = ax.pie(
    size2,
    radius=1 - size,
    colors=inner_colors,
    wedgeprops={'width': size, 'edgecolor': 'w'},
)
ax.legend(patches2, labels2, loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 20})
# Creating the second legend replaces the first on the axes; add it back.
ax.add_artist(legend1)

ax.set_aspect("equal")
fig.suptitle('Distribution of Tags and Segments in Top 100 Merchants', fontsize=30)
fig.savefig("../plots/Tag and Segments Distribution in TOP100.png", bbox_inches='tight')
plt.show()

In [None]:
# Tent and computer merchants take up the most weight in the top-100 ranking,
# so compare the three predicted attributes for just those two tags.
is_tent = top100['tags'] == 'tent'
is_computer = top100['tags'] == 'computer'
top_two = top100.loc[is_tent | is_computer]

# Predicted revenue against rank, coloured by tag.
revenue_ax = sns.scatterplot(data=top_two, x='rank', y='pred_total_revenue', hue='tags')
revenue_ax.set_xlabel("Rank", fontsize=12)
revenue_ax.set_ylabel("Predicted total revenue", fontsize=12)
revenue_ax.set_title("Predicted Total Revenue of Tent and Computer Merchants", fontsize=13)
plt.show()

In [None]:
# Predicted number of consumers against rank, coloured by tag.
consumer_ax = sns.scatterplot(
    data=top_two,
    x='rank',
    y='pred_total_num_consumer',
    hue='tags',
)
consumer_ax.set_xlabel("Rank", fontsize=12)
consumer_ax.set_ylabel("Predicted number of consumers", fontsize=12)
consumer_ax.set_title("Predicted Number of Consumer of Tent and Computer Merchants", fontsize=13)
plt.show()

In [None]:
# Predicted number of transactions against rank, coloured by tag.
transaction_ax = sns.scatterplot(
    data=top_two,
    x='rank',
    y='pred_total_num_transaction',
    hue='tags',
)
transaction_ax.set_xlabel("Rank", fontsize=12)
transaction_ax.set_ylabel("Predicted number of transactions", fontsize=12)
transaction_ax.set_title("Predicted Transaction Volume of Tent and Computer Merchants", fontsize=13)
plt.show()

**Observation:**  
It can be seen that tent merchants often have high customer volume and flow, while computer merchants are associated with greater transaction values.

In [None]:
# Columns used for the per-segment averages: the segment key plus the three
# model predictions.
segment_and_predictions = [
    'segment',
    'pred_total_num_consumer',
    'pred_total_num_transaction',
    'pred_total_revenue',
]

# Average of each of the 3 predictions among the top 100 merchants, with a
# "_top" suffix so the columns stay distinct after the merge below.
average_top = (
    top100[segment_and_predictions]
    .groupby('segment')
    .mean()
    .rename(columns={
        'pred_total_num_consumer': 'pred_total_num_consumer_top',
        'pred_total_num_transaction': 'pred_total_num_transaction_top',
        'pred_total_revenue': 'pred_total_revenue_top',
    })
)

# Average of each of the 3 predictions among all merchants.
average_full = full[segment_and_predictions].groupby('segment').mean()

# Side-by-side table; segments with no top-100 merchant are dropped.
merged_averages = average_full.merge(average_top, on='segment', how='left')
average_compare = merged_averages.dropna()

In [None]:
# Grouped bar chart: average predicted total revenue per segment, for all
# merchants versus the TOP 100 merchants.
all_color = (0.2, 0.4, 0.6, 0.6)
top_color = (0.5, 0.1, 0.4, 0.4)

# set the figure size
plt.figure(figsize=(8, 5))

# One pair of bars per row of `average_compare`, offset left/right of the tick.
x_axis = np.arange(len(average_compare.index))
bar1 = plt.bar(x_axis - 0.2, average_compare['pred_total_revenue'], width=0.4,
               label='All Merchants', color=all_color)
bar2 = plt.bar(x_axis + 0.2, average_compare['pred_total_revenue_top'], width=0.4,
               label='TOP 100', color=top_color)
# NOTE(review): the tick labels are hard-coded and assume `average_compare` has
# exactly these four segments in this (index) order; confirm against the data.
plt.xticks(x_axis, ['Health', 'Personal & Household', 'Recreational', 'Technical & Machinery'], size = 12)
plt.ylabel("Total Revenue", size = 12)

# add legend
top_bar = mpatches.Patch(color=all_color, label='All Merchants')
bottom_bar = mpatches.Patch(color=top_color, label='TOP 100')
plt.legend(handles=[top_bar, bottom_bar])
plt.title('Average Total Revenue of Merchants for Each Segment', size = 14)
# tight_layout only accounts for artists that already exist, so it has to run
# after the title is set; otherwise the title can be clipped in the saved PNG.
plt.tight_layout()
plt.savefig('../plots/Average Total Revenue of Merchants for Each Segment.png')
plt.show()

**Observation:**  
By comparing the average predicted total revenue of the TOP 100 merchants with that of all merchants in the same segment, we see that the BNPL company’s profits can be more than ten times higher if it focuses its resources on cooperating with the top merchants. Consequently, our final ranking appears reliable in detecting which merchants are more likely to bring a high return in each segment. 