In [66]:
import pandas as pd
import datetime
import plotly.express as px
import plotly.graph_objects as go

## Read the data and have a snapshot

In [67]:
# Snapshot of the raw data merged file extracted thanks to the join SQL requests
df = pd.read_csv('d:/Projets/Code/Python/Global-Electronics-Retailer-Analysis/SQL_requests/global-electronics-retailer-analysis-raw-dataset.csv')
df.head()

Unnamed: 0,StoreKey,StoreState,StoreCountry,StoreOpenDate,StoreSurfaceAsSquareMeters,Order Number,Order Date,ProductKey,Product Name,Brand,...,Currency Code,ExchangeVsUSD,Quantity,CustomerKey,Birthday,Gender,CustomerCity,CustomerState,CustomerCountry,CustomerContinent
0,5,Victoria,Australia,12/9/2015,2000,1520023,2/28/2019,1044,A. Datum SLR Camera X135 Black,A. Datum,...,AUD,1.4005,5,1585,8/12/1990,Female,GREEN LAKE,Victoria,Australia,Australia
1,4,Tasmania,Australia,1/1/2010,2000,1120007,1/24/2018,2131,Contoso Coffee Maker Auto 5C E0900 White,Contoso,...,AUD,1.2395,1,3203,8/18/1998,Male,ARGENTON,New South Wales,Australia,Australia
2,6,Western Australia,Australia,1/1/2010,2000,370013,1/5/2016,5,Contoso 2G MP3 Player E200 Red,Contoso,...,AUD,1.3942,2,5097,11/13/1942,Female,Mount Pleasant,Queensland,Australia,Australia
3,6,Western Australia,Australia,1/1/2010,2000,759002,1/28/2017,8,Contoso 4G MP3 Player E400 Silver,Contoso,...,AUD,1.3266,1,10299,5/27/1937,Male,BEELBANGERA,New South Wales,Australia,Australia
4,6,Western Australia,Australia,1/1/2010,2000,428009,3/3/2016,144,"Adventure Works 15.6 LCD TV M130W Brown""""",Adventure Works,...,AUD,1.3624,2,12160,9/5/1956,Male,Oakhurst,New South Wales,Australia,Australia


## Extract the main information about the dataframe

In [68]:
# Number of rows and columns
df.shape

(62884, 24)

In [69]:
# Column list
df.keys()

Index(['StoreKey', 'StoreState', 'StoreCountry', 'StoreOpenDate',
       'StoreSurfaceAsSquareMeters', 'Order Number', 'Order Date',
       'ProductKey', 'Product Name', 'Brand', 'Category', 'Subcategory',
       'Unit Cost USD', 'Unit Price USD', 'Currency Code', 'ExchangeVsUSD',
       'Quantity', 'CustomerKey', 'Birthday', 'Gender', 'CustomerCity',
       'CustomerState', 'CustomerCountry', 'CustomerContinent'],
      dtype='object')

In [70]:
# Are there missing values ?
df.isnull().any()

StoreKey                      False
StoreState                    False
StoreCountry                  False
StoreOpenDate                 False
StoreSurfaceAsSquareMeters    False
Order Number                  False
Order Date                    False
ProductKey                    False
Product Name                  False
Brand                         False
Category                      False
Subcategory                   False
Unit Cost USD                 False
Unit Price USD                False
Currency Code                 False
ExchangeVsUSD                 False
Quantity                      False
CustomerKey                   False
Birthday                      False
Gender                        False
CustomerCity                  False
CustomerState                 False
CustomerCountry               False
CustomerContinent             False
dtype: bool

In [71]:
# Main metrics of the dataset
df.describe(include='all')

Unnamed: 0,StoreKey,StoreState,StoreCountry,StoreOpenDate,StoreSurfaceAsSquareMeters,Order Number,Order Date,ProductKey,Product Name,Brand,...,Currency Code,ExchangeVsUSD,Quantity,CustomerKey,Birthday,Gender,CustomerCity,CustomerState,CustomerCountry,CustomerContinent
count,62884.0,62884,62884,62884,62884.0,62884.0,62884,62884.0,62884,62884,...,62884,62884.0,62884.0,62884.0,62884,62884,62884,62884,62884,62884
unique,,58,9,24,,,1641,,2492,11,...,5,,,,9382,2,6570,491,8,3
top,,Online,United States,1/1/2000,,,12/21/2019,,Adventure Works Desktop PC2.30 MD230 Black,Contoso,...,USD,,,,5/5/1950,Male,Toronto,California,United States,North America
freq,,13165,26555,13165,,,222,,162,15953,...,33767,,,,44,31804,683,3629,33767,39182
mean,31.802144,,,,1260.640306,1430905.0,,1125.859344,,,...,,0.992533,3.14479,1180797.0,,,,,,
std,22.978188,,,,767.739494,453296.3,,709.24401,,,...,,0.161601,2.256371,585963.4,,,,,,
min,0.0,,,,0.0,366000.0,,1.0,,,...,,0.6725,1.0,301.0,,,,,,
25%,8.0,,,,840.0,1121017.0,,437.0,,,...,,0.8945,1.0,680858.0,,,,,,
50%,37.0,,,,1330.0,1498016.0,,1358.0,,,...,,1.0,2.0,1261200.0,,,,,,
75%,53.0,,,,2000.0,1788010.0,,1650.0,,,...,,1.0,4.0,1686496.0,,,,,,


In [72]:
# What is the type of each column of the dataframe ?
df.dtypes

StoreKey                        int64
StoreState                     object
StoreCountry                   object
StoreOpenDate                  object
StoreSurfaceAsSquareMeters      int64
Order Number                    int64
Order Date                     object
ProductKey                      int64
Product Name                   object
Brand                          object
Category                       object
Subcategory                    object
Unit Cost USD                  object
Unit Price USD                 object
Currency Code                  object
ExchangeVsUSD                 float64
Quantity                        int64
CustomerKey                     int64
Birthday                       object
Gender                         object
CustomerCity                   object
CustomerState                  object
CustomerCountry                object
CustomerContinent              object
dtype: object

## Data Cleaning

In [73]:
# StoreKey, OrderNumber, ProductKey and CustomerKey are IDs,
# no need to make operations on it, so let's transform theù into string columns
df["StoreKey"] = df["StoreKey"].apply(lambda x : str(x))
df["Order Number"] = df["Order Number"].apply(lambda x : str(x))
df["ProductKey"] = df["ProductKey"].apply(lambda x : str(x))
df["CustomerKey"] = df["CustomerKey"].apply(lambda x : str(x))
df.dtypes

StoreKey                       object
StoreState                     object
StoreCountry                   object
StoreOpenDate                  object
StoreSurfaceAsSquareMeters      int64
Order Number                   object
Order Date                     object
ProductKey                     object
Product Name                   object
Brand                          object
Category                       object
Subcategory                    object
Unit Cost USD                  object
Unit Price USD                 object
Currency Code                  object
ExchangeVsUSD                 float64
Quantity                        int64
CustomerKey                    object
Birthday                       object
Gender                         object
CustomerCity                   object
CustomerState                  object
CustomerCountry                object
CustomerContinent              object
dtype: object

In [74]:
# Conversion of all element of the StoreOpenDate, Order Date and Birthday columns from string into date object
df["StoreOpenDate"] = df["StoreOpenDate"].apply(lambda x : pd.to_datetime(x, format="%m/%d/%Y").date())
df["Order Date"] = df["Order Date"].apply(lambda x : pd.to_datetime(x, format="%m/%d/%Y").date())
df["Birthday"] = df["Birthday"].apply(lambda x : pd.to_datetime(x, format="%m/%d/%Y").date())
df

Unnamed: 0,StoreKey,StoreState,StoreCountry,StoreOpenDate,StoreSurfaceAsSquareMeters,Order Number,Order Date,ProductKey,Product Name,Brand,...,Currency Code,ExchangeVsUSD,Quantity,CustomerKey,Birthday,Gender,CustomerCity,CustomerState,CustomerCountry,CustomerContinent
0,5,Victoria,Australia,2015-12-09,2000,1520023,2019-02-28,1044,A. Datum SLR Camera X135 Black,A. Datum,...,AUD,1.4005,5,1585,1990-08-12,Female,GREEN LAKE,Victoria,Australia,Australia
1,4,Tasmania,Australia,2010-01-01,2000,1120007,2018-01-24,2131,Contoso Coffee Maker Auto 5C E0900 White,Contoso,...,AUD,1.2395,1,3203,1998-08-18,Male,ARGENTON,New South Wales,Australia,Australia
2,6,Western Australia,Australia,2010-01-01,2000,370013,2016-01-05,5,Contoso 2G MP3 Player E200 Red,Contoso,...,AUD,1.3942,2,5097,1942-11-13,Female,Mount Pleasant,Queensland,Australia,Australia
3,6,Western Australia,Australia,2010-01-01,2000,759002,2017-01-28,8,Contoso 4G MP3 Player E400 Silver,Contoso,...,AUD,1.3266,1,10299,1937-05-27,Male,BEELBANGERA,New South Wales,Australia,Australia
4,6,Western Australia,Australia,2010-01-01,2000,428009,2016-03-03,144,"Adventure Works 15.6 LCD TV M130W Brown""""",Adventure Works,...,AUD,1.3624,2,12160,1956-09-05,Male,Oakhurst,New South Wales,Australia,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62879,50,Kansas,United States,2008-03-06,2000,1715000,2019-09-11,1470,The Phone Company Smart phones without camera ...,The Phone Company,...,USD,1.0000,1,2051398,1979-04-15,Male,Fort Washington,Pennsylvania,United States,North America
62880,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,1644,Contoso DVD External DVD Burner M200 Blue,Contoso,...,USD,1.0000,6,2068819,1977-08-19,Female,Utica,New York,United States,North America
62881,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,2109,Contoso Water Heater 1.5GPM E0800 Grey,Contoso,...,USD,1.0000,1,2068819,1977-08-19,Female,Utica,New York,United States,North America
62882,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,1621,Contoso DVD Movies E100 Yellow,Contoso,...,USD,1.0000,4,2068819,1977-08-19,Female,Utica,New York,United States,North America


In [75]:
# Define a function to remove the '$' symbol and the comma separating thousands
def remove_dollar_sign(value):
    return float(value.replace("$", "").replace(",", ""))

# Apply the function to the “Unit Cost USD” column with apply
df["Unit Cost USD"] = df["Unit Cost USD"].apply(remove_dollar_sign)
df["Unit Price USD"] = df["Unit Price USD"].apply(remove_dollar_sign)

# Show updated columns
df[["Unit Cost USD", "Unit Price USD"]]

Unnamed: 0,Unit Cost USD,Unit Price USD
0,207.74,627.00
1,83.10,163.00
2,11.00,21.57
3,30.58,59.99
4,152.94,299.99
...,...,...
62879,65.77,129.00
62880,26.62,57.88
62881,131.28,257.50
62882,6.62,12.99


In [76]:
df

Unnamed: 0,StoreKey,StoreState,StoreCountry,StoreOpenDate,StoreSurfaceAsSquareMeters,Order Number,Order Date,ProductKey,Product Name,Brand,...,Currency Code,ExchangeVsUSD,Quantity,CustomerKey,Birthday,Gender,CustomerCity,CustomerState,CustomerCountry,CustomerContinent
0,5,Victoria,Australia,2015-12-09,2000,1520023,2019-02-28,1044,A. Datum SLR Camera X135 Black,A. Datum,...,AUD,1.4005,5,1585,1990-08-12,Female,GREEN LAKE,Victoria,Australia,Australia
1,4,Tasmania,Australia,2010-01-01,2000,1120007,2018-01-24,2131,Contoso Coffee Maker Auto 5C E0900 White,Contoso,...,AUD,1.2395,1,3203,1998-08-18,Male,ARGENTON,New South Wales,Australia,Australia
2,6,Western Australia,Australia,2010-01-01,2000,370013,2016-01-05,5,Contoso 2G MP3 Player E200 Red,Contoso,...,AUD,1.3942,2,5097,1942-11-13,Female,Mount Pleasant,Queensland,Australia,Australia
3,6,Western Australia,Australia,2010-01-01,2000,759002,2017-01-28,8,Contoso 4G MP3 Player E400 Silver,Contoso,...,AUD,1.3266,1,10299,1937-05-27,Male,BEELBANGERA,New South Wales,Australia,Australia
4,6,Western Australia,Australia,2010-01-01,2000,428009,2016-03-03,144,"Adventure Works 15.6 LCD TV M130W Brown""""",Adventure Works,...,AUD,1.3624,2,12160,1956-09-05,Male,Oakhurst,New South Wales,Australia,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62879,50,Kansas,United States,2008-03-06,2000,1715000,2019-09-11,1470,The Phone Company Smart phones without camera ...,The Phone Company,...,USD,1.0000,1,2051398,1979-04-15,Male,Fort Washington,Pennsylvania,United States,North America
62880,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,1644,Contoso DVD External DVD Burner M200 Blue,Contoso,...,USD,1.0000,6,2068819,1977-08-19,Female,Utica,New York,United States,North America
62881,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,2109,Contoso Water Heater 1.5GPM E0800 Grey,Contoso,...,USD,1.0000,1,2068819,1977-08-19,Female,Utica,New York,United States,North America
62882,53,Montana,United States,2012-06-06,1260,1428025,2018-11-28,1621,Contoso DVD Movies E100 Yellow,Contoso,...,USD,1.0000,4,2068819,1977-08-19,Female,Utica,New York,United States,North America


In [77]:
df.dtypes

StoreKey                       object
StoreState                     object
StoreCountry                   object
StoreOpenDate                  object
StoreSurfaceAsSquareMeters      int64
Order Number                   object
Order Date                     object
ProductKey                     object
Product Name                   object
Brand                          object
Category                       object
Subcategory                    object
Unit Cost USD                 float64
Unit Price USD                float64
Currency Code                  object
ExchangeVsUSD                 float64
Quantity                        int64
CustomerKey                    object
Birthday                       object
Gender                         object
CustomerCity                   object
CustomerState                  object
CustomerCountry                object
CustomerContinent              object
dtype: object

## Exploratory Data Analysis

In [78]:
# Let's calculate the age of the customers by difference between the latest order date and their birthdate
max_date = max(df["Order Date"])
df['Age'] = df['Birthday'].apply(lambda x: int((max_date - x).days/365.25))
df['Age']

0        30
1        22
2        78
3        83
4        64
         ..
62879    41
62880    43
62881    43
62882    43
62883    43
Name: Age, Length: 62884, dtype: int64

In [79]:
df["TotalSalesAmount"] = df["Unit Price USD"] * df["Quantity"]
df[["Unit Price USD", "Quantity", "TotalSalesAmount"]]

Unnamed: 0,Unit Price USD,Quantity,TotalSalesAmount
0,627.00,5,3135.00
1,163.00,1,163.00
2,21.57,2,43.14
3,59.99,1,59.99
4,299.99,2,599.98
...,...,...,...
62879,129.00,1,129.00
62880,57.88,6,347.28
62881,257.50,1,257.50
62882,12.99,4,51.96


### Store analysis

sales / profitability store vs online
sales / profitability by store country
(sales / profitability by store state / key)

In [80]:
# Let's first calculate total cost
df["TotalCost"] = df["Unit Cost USD"] * df["Quantity"]
df[["Unit Cost USD", "Quantity", "TotalCost"]]

Unnamed: 0,Unit Cost USD,Quantity,TotalCost
0,207.74,5,1038.70
1,83.10,1,83.10
2,11.00,2,22.00
3,30.58,1,30.58
4,152.94,2,305.88
...,...,...,...
62879,65.77,1,65.77
62880,26.62,6,159.72
62881,131.28,1,131.28
62882,6.62,4,26.48


In [81]:
# Now let's calculate profit
df["Profit"] = df["TotalSalesAmount"] - df["TotalCost"]
df[["TotalSalesAmount", "TotalCost", "Profit"]]

Unnamed: 0,TotalSalesAmount,TotalCost,Profit
0,3135.00,1038.70,2096.30
1,163.00,83.10,79.90
2,43.14,22.00,21.14
3,59.99,30.58,29.41
4,599.98,305.88,294.10
...,...,...,...
62879,129.00,65.77,63.23
62880,347.28,159.72,187.56
62881,257.50,131.28,126.22
62882,51.96,26.48,25.48


In [82]:
# And now we can calculate total profitability
df["TotalProfitability"] = df["Profit"] / df["TotalSalesAmount"]
df[["TotalSalesAmount", "Profit", "TotalProfitability"]]

Unnamed: 0,TotalSalesAmount,Profit,TotalProfitability
0,3135.00,2096.30,0.668676
1,163.00,79.90,0.490184
2,43.14,21.14,0.490032
3,59.99,29.41,0.490248
4,599.98,294.10,0.490183
...,...,...,...
62879,129.00,63.23,0.490155
62880,347.28,187.56,0.540083
62881,257.50,126.22,0.490175
62882,51.96,25.48,0.490377


In [83]:
# We also want to distinguish online vs in-store sales. A sale with a StoreKey value that is 0 is an online sale
df["StoreKey"].unique()

array(['5', '4', '6', '1', '0', '10', '9', '8', '27', '21', '23', '26',
       '22', '20', '24', '19', '12', '13', '17', '16', '14', '18', '29',
       '30', '28', '32', '31', '33', '34', '42', '39', '41', '37', '38',
       '40', '36', '62', '45', '50', '56', '44', '59', '55', '51', '48',
       '63', '43', '54', '57', '49', '47', '61', '64', '66', '53', '65',
       '15', '2'], dtype=object)

In [84]:
# Let's count the number of sales by each storeKey
df["StoreKey"].value_counts()

StoreKey
0     13165
9      1577
50     1519
55     1518
54     1498
61     1485
59     1472
45     1471
57     1442
44     1436
65     1395
8      1360
51     1356
64     1353
47     1348
43     1340
66     1295
48     1289
56     1287
53     1284
10     1269
39     1054
40     1050
36     1049
49     1041
38     1034
42     1017
29     1011
37      995
30      976
63      948
5       892
22      800
62      778
27      721
24      703
23      674
19      635
6       615
33      498
32      454
34      448
41      444
20      443
4       431
31      417
21      391
26      295
1       292
15      235
12      224
16      214
28      214
17      210
18      194
13      184
14      126
2        18
Name: count, dtype: int64

In [85]:
# To make the analysis easier, let's create a "Online / In-Store" column,
# indicating if this is an online or an In-Store sale
df["OnlineVsInstore"] = df["StoreKey"].apply(lambda x: "Online" if x=='0' else "In-Store")
df[["StoreKey", "OnlineVsInstore"]]

Unnamed: 0,StoreKey,OnlineVsInstore
0,5,In-Store
1,4,In-Store
2,6,In-Store
3,6,In-Store
4,6,In-Store
...,...,...
62879,50,In-Store
62880,53,In-Store
62881,53,In-Store
62882,53,In-Store


In [86]:
df.keys()

Index(['StoreKey', 'StoreState', 'StoreCountry', 'StoreOpenDate',
       'StoreSurfaceAsSquareMeters', 'Order Number', 'Order Date',
       'ProductKey', 'Product Name', 'Brand', 'Category', 'Subcategory',
       'Unit Cost USD', 'Unit Price USD', 'Currency Code', 'ExchangeVsUSD',
       'Quantity', 'CustomerKey', 'Birthday', 'Gender', 'CustomerCity',
       'CustomerState', 'CustomerCountry', 'CustomerContinent', 'Age',
       'TotalSalesAmount', 'TotalCost', 'Profit', 'TotalProfitability',
       'OnlineVsInstore'],
      dtype='object')

In [87]:
# Now we can make online vs in-store analysis, looking at the sales :
df.groupby("OnlineVsInstore")["TotalSalesAmount"].sum().reset_index()

Unnamed: 0,OnlineVsInstore,TotalSalesAmount
0,In-Store,44351154.96
1,Online,11404324.63


In [88]:
# and the corresponding pie chart :
px_pie_chart = px.pie(df, names = "OnlineVsInstore", title = "Online vs In-Store Sales")
px_pie_chart.show()





In [89]:
# Let's have a look at the sales per store :
sales_by_store_country = df.groupby("StoreCountry")["TotalSalesAmount"].sum().reset_index()
sales_by_store_country = sales_by_store_country.sort_values(by="TotalSalesAmount", ascending=False)
sales_by_store_country

Unnamed: 0,StoreCountry,TotalSalesAmount
8,United States,23764425.86
6,Online,11404324.63
7,United Kingdom,5749769.78
3,Germany,4246279.22
1,Canada,3611561.79
0,Australia,2099141.07
4,Italy,2059086.81
5,Netherlands,1591344.48
2,France,1229545.95


In [90]:
# and the corresponding horizontal bar chart :
fig = go.Figure(go.Bar(
            
# For a reason I ignore, I have to sort by ascending values of the TotalSalesAmount
# to have the bars by descending order of the TotalSalesAmount value
            x=sales_by_store_country["TotalSalesAmount"].sort_values(ascending=True),
            y=sales_by_store_country["StoreCountry"],
            orientation='h'))

fig.show()

In [91]:
# Let's have a look at the profits per store :
profitability_by_store_country = df.groupby("StoreCountry")["TotalProfitability"].mean().reset_index()
profitability_by_store_country = profitability_by_store_country.sort_values(by="TotalProfitability", ascending=False)
profitability_by_store_country

Unnamed: 0,StoreCountry,TotalProfitability
2,France,0.552046
0,Australia,0.551826
4,Italy,0.550946
5,Netherlands,0.550048
3,Germany,0.549935
8,United States,0.548829
7,United Kingdom,0.548577
6,Online,0.548499
1,Canada,0.547922


### Orders & Sales analysis

Orders number by month and year
Sales by month and year

In [92]:
print(df["Order Date"].min())
print(df["Order Date"].max())

2016-01-01
2021-02-20


In [93]:
total_sales_per_date = df.groupby("Order Date")["TotalSalesAmount"].sum().reset_index()
total_sales_per_date

Unnamed: 0,Order Date,TotalSalesAmount
0,2016-01-01,37442.42
1,2016-01-02,91366.94
2,2016-01-03,362.64
3,2016-01-04,9524.94
4,2016-01-05,22996.38
...,...,...
1636,2021-02-16,27700.99
1637,2021-02-17,77408.70
1638,2021-02-18,35307.64
1639,2021-02-19,36476.44


In [94]:
total_sales_per_date["Year"] = total_sales_per_date["Order Date"].apply(lambda x : x.year)
total_sales_per_date["Month"] = total_sales_per_date["Order Date"].apply(lambda x : "Jan" if x.month == 1
                                                                                else "Feb" if x.month == 2
                                                                                else "Mar" if x.month == 3
                                                                                else "Apr" if x.month == 4
                                                                                else "May" if x.month == 5
                                                                                else "Jun" if x.month == 6
                                                                                else "Jul" if x.month == 7
                                                                                else "Aug" if x.month == 8
                                                                                else "Sep" if x.month == 9
                                                                                else "Oct" if x.month == 10
                                                                                else "Nov" if x.month == 11
                                                                                else "Dec")
total_sales_per_date["Month_Number"] = total_sales_per_date["Order Date"].apply(lambda x : x.month)
total_sales_per_date

Unnamed: 0,Order Date,TotalSalesAmount,Year,Month,Month_Number
0,2016-01-01,37442.42,2016,Jan,1
1,2016-01-02,91366.94,2016,Jan,1
2,2016-01-03,362.64,2016,Jan,1
3,2016-01-04,9524.94,2016,Jan,1
4,2016-01-05,22996.38,2016,Jan,1
...,...,...,...,...,...
1636,2021-02-16,27700.99,2021,Feb,2
1637,2021-02-17,77408.70,2021,Feb,2
1638,2021-02-18,35307.64,2021,Feb,2
1639,2021-02-19,36476.44,2021,Feb,2


In [95]:
total_sales_per_date_aggregated = total_sales_per_date.groupby(["Year", "Month", "Month_Number"])["TotalSalesAmount"].sum().reset_index()
total_sales_per_date_aggregated = total_sales_per_date_aggregated.sort_values(by=["Year", "Month_Number"], ascending=True)
total_sales_per_date_aggregated

Unnamed: 0,Year,Month,Month_Number,TotalSalesAmount
4,2016,Jan,1,649918.78
3,2016,Feb,2,891098.30
7,2016,Mar,3,338407.36
0,2016,Apr,4,110591.63
8,2016,May,5,595986.18
...,...,...,...,...
58,2020,Oct,10,245647.59
57,2020,Nov,11,256701.02
50,2020,Dec,12,651526.44
61,2021,Jan,1,513021.58


In [96]:
# Sales by month and year
fivelastyears_dropdown = go.Figure()

fivelastyears_dropdown.add_trace(
    go.Bar(
        x = total_sales_per_date_aggregated["Month"],
        y = total_sales_per_date_aggregated['TotalSalesAmount'].loc[total_sales_per_date_aggregated['Year'] == 2016],
        visible = True))

fivelastyears_dropdown.add_trace(
    go.Bar(
        x = total_sales_per_date_aggregated["Month"],
        y = total_sales_per_date_aggregated['TotalSalesAmount'].loc[total_sales_per_date_aggregated['Year'] == 2017],
        visible = False))

fivelastyears_dropdown.add_trace(
    go.Bar(
        x = total_sales_per_date_aggregated["Month"],
        y = total_sales_per_date_aggregated['TotalSalesAmount'].loc[total_sales_per_date_aggregated['Year'] == 2018],
        visible = False))

fivelastyears_dropdown.add_trace(
    go.Bar(
        x = total_sales_per_date_aggregated["Month"],
        y = total_sales_per_date_aggregated['TotalSalesAmount'].loc[total_sales_per_date_aggregated['Year'] == 2019],
        visible = False))

fivelastyears_dropdown.add_trace(
    go.Bar(
        x = total_sales_per_date_aggregated["Month"],
        y = total_sales_per_date_aggregated['TotalSalesAmount'].loc[total_sales_per_date_aggregated['Year'] == 2020],
        visible = False))

fivelastyears_dropdown.update_layout(
        title = go.layout.Title(text = "Sales per month per year 2016 - 2020", x = 0.5),
        showlegend = False)

fivelastyears_dropdown.update_layout(
    updatemenus = [go.layout.Updatemenu(
        active = 0,
        buttons = [
                    go.layout.updatemenu.Button(
                        label = "2016",
                        method = "update",
                        args = [{"visible" : [True, False, False, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2017",
                            method = "update",
                            args = [{"visible" : [False, True, False, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2018",
                            method = "update",
                            args = [{"visible" : [False, False, True, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2019",
                            method = "update",
                            args = [{"visible" : [False, False, False, True, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2020",
                            method = "update",
                            args = [{"visible" : [False, False, False, False, True]}]),
                ]
    )]
)

fivelastyears_dropdown.show()

In [105]:
# Orders by month and year
order_number_per_date = df.groupby("Order Date")["Order Number"].nunique().reset_index()
order_number_per_date

Unnamed: 0,Order Date,Order Number
0,2016-01-01,14
1,2016-01-02,30
2,2016-01-03,2
3,2016-01-04,9
4,2016-01-05,13
...,...,...
1636,2021-02-16,15
1637,2021-02-17,24
1638,2021-02-18,24
1639,2021-02-19,18


In [106]:
order_number_per_date["Year"] = order_number_per_date["Order Date"].apply(lambda x : x.year)
order_number_per_date["Month"] = order_number_per_date["Order Date"].apply(lambda x : "Jan" if x.month == 1
                                                                                else "Feb" if x.month == 2
                                                                                else "Mar" if x.month == 3
                                                                                else "Apr" if x.month == 4
                                                                                else "May" if x.month == 5
                                                                                else "Jun" if x.month == 6
                                                                                else "Jul" if x.month == 7
                                                                                else "Aug" if x.month == 8
                                                                                else "Sep" if x.month == 9
                                                                                else "Oct" if x.month == 10
                                                                                else "Nov" if x.month == 11
                                                                                else "Dec")
order_number_per_date["Month_Number"] = order_number_per_date["Order Date"].apply(lambda x : x.month)
order_number_per_date

Unnamed: 0,Order Date,Order Number,Year,Month,Month_Number
0,2016-01-01,14,2016,Jan,1
1,2016-01-02,30,2016,Jan,1
2,2016-01-03,2,2016,Jan,1
3,2016-01-04,9,2016,Jan,1
4,2016-01-05,13,2016,Jan,1
...,...,...,...,...,...
1636,2021-02-16,15,2021,Feb,2
1637,2021-02-17,24,2021,Feb,2
1638,2021-02-18,24,2021,Feb,2
1639,2021-02-19,18,2021,Feb,2


In [107]:
order_number_per_date_aggregated = order_number_per_date.groupby(["Year", "Month", "Month_Number"])["Order Number"].sum().reset_index()
order_number_per_date_aggregated = order_number_per_date_aggregated.sort_values(by=["Year", "Month_Number"], ascending=True)
order_number_per_date_aggregated

Unnamed: 0,Year,Month,Month_Number,Order Number
4,2016,Jan,1,292
3,2016,Feb,2,334
7,2016,Mar,3,109
0,2016,Apr,4,41
8,2016,May,5,220
...,...,...,...,...
58,2020,Oct,10,159
57,2020,Nov,11,149
50,2020,Dec,12,318
61,2021,Jan,1,256


In [108]:
# Sales by month and year
fivelastyears_dropdown_orders = go.Figure()

fivelastyears_dropdown_orders.add_trace(
    go.Bar(
        x = order_number_per_date_aggregated["Month"],
        y = order_number_per_date_aggregated['Order Number'].loc[order_number_per_date_aggregated['Year'] == 2016],
        visible = True))

fivelastyears_dropdown_orders.add_trace(
    go.Bar(
        x = order_number_per_date_aggregated["Month"],
        y = order_number_per_date_aggregated['Order Number'].loc[order_number_per_date_aggregated['Year'] == 2017],
        visible = False))

fivelastyears_dropdown_orders.add_trace(
    go.Bar(
        x = order_number_per_date_aggregated["Month"],
        y = order_number_per_date_aggregated['Order Number'].loc[order_number_per_date_aggregated['Year'] == 2018],
        visible = False))

fivelastyears_dropdown_orders.add_trace(
    go.Bar(
        x = order_number_per_date_aggregated["Month"],
        y = order_number_per_date_aggregated['Order Number'].loc[order_number_per_date_aggregated['Year'] == 2019],
        visible = False))

fivelastyears_dropdown_orders.add_trace(
    go.Bar(
        x = order_number_per_date_aggregated["Month"],
        y = order_number_per_date_aggregated['Order Number'].loc[order_number_per_date_aggregated['Year'] == 2020],
        visible = False))

fivelastyears_dropdown_orders.update_layout(
        title = go.layout.Title(text = "Number of orders per month per year 2016 - 2020", x = 0.5),
        showlegend = False)

fivelastyears_dropdown_orders.update_layout(
    updatemenus = [go.layout.Updatemenu(
        active = 0,
        buttons = [
                    go.layout.updatemenu.Button(
                        label = "2016",
                        method = "update",
                        args = [{"visible" : [True, False, False, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2017",
                            method = "update",
                            args = [{"visible" : [False, True, False, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2018",
                            method = "update",
                            args = [{"visible" : [False, False, True, False, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2019",
                            method = "update",
                            args = [{"visible" : [False, False, False, True, False]}]),
                    go.layout.updatemenu.Button(
                            label = "2020",
                            method = "update",
                            args = [{"visible" : [False, False, False, False, True]}]),
                ]
    )]
)

fivelastyears_dropdown_orders.show()

### Products & Category analysis

Sales / profitability by category / subcategory / product with a dropdown
Categories / subcategories / product bought together
(optional : sales / profitability by brand ?)

### Customer analysis

Sales by genre
Sales by age
Sales by country / state / city with a dropdown

In [99]:
df["CustomerCountry"].value_counts()

CustomerCountry
United States     33767
United Kingdom     8140
Germany            5956
Canada             5415
Australia          2941
Italy              2685
Netherlands        2250
France             1730
Name: count, dtype: int64