In [None]:
# Data Source: https://www.kaggle.com/datasets/lakshmi25npathi/online-retail-dataset
# Folder: Online Retail Sales
# Description:
##This Online Retail II data set contains all the transactions occurring for a UK-based and registered, non-store online retail between 01/12/2009 and 09/12/2011.
##The company mainly sells unique all-occasion gift-ware. Many customers of the company are wholesalers.


# Analysis of Online retail sales dataset

## Overall goals:
- See the shape of the dataset and explore it
- Take a quick look at the data and decide on what to focus
- Manipulate data(changing data, creating new data, cleaning data etc...)
- Visualize findings to tell a story and back up analysis conducted
    - Visualizations include:
        - Geographical map
        - Bar Graphs, Histograms, Scatterplots etc...

## Libraries used throughout
- Pandas
- Folium
- NumPy
- Matplotlib

### In the case of errors
- Not all python libraries may be on your machine and or within your directory. Ensure to install them.
- You ran a cell with an edit that you made to it(This notebook is designed to run seamlessly with no edits)
- Not running a python kernel or you're using an old version of python kernel

In [1]:
#Libraries to be used
# pip install "name of library"(incase there is an error where the library)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium


In [2]:
#convert the excel file to csv to create the main DataFrame
online_retail = pd.read_excel('Online_Retail.xlsx')
online_retail.to_csv('Online_Retail.csv', index=False)
online_retail.shape

(541909, 8)

In [55]:
#Quick look of the DataFrame
online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### Attribute Information:
- InvoiceNo: Invoice number. Nominal. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'c', it indicates a cancellation.
- StockCode: Product (item) code. Nominal. A 5-digit integral number uniquely assigned to each distinct product.
- Description: Product (item) name. Nominal.
- Quantity: The quantities of each product (item) per transaction. Numeric.
- InvoiceDate: Invice date and time. Numeric. The day and time when a transaction was generated.
- UnitPrice: Unit price. Numeric. Product price per unit in sterling (Â£).
- CustomerID: Customer number. Nominal. A 5-digit integral number uniquely assigned to each customer.
- Country: Country name. Nominal. The name of the country where a customer resides.

### We will want to see what is unique about each of the qualitative columns
#### This will allow us to get an idea of distinction within the dataset where it matters

In [34]:
#utilizing NumPy to find the count of unique values in the description column
description = online_retail['Description'].unique().tolist() #tolist isnt necessary but for safety it stores the array of values to a list
print(type(description))
len(description)


<class 'list'>


4224

In [35]:
cust_id = online_retail['CustomerID'].unique().tolist() #tolist isnt necessary but for safety it stores the array of values to a list
print(type(cust_id))
len(cust_id)


<class 'list'>


4373

In [36]:
country = online_retail['Country'].unique().tolist() #tolist isnt necessary but for safety it stores the array of values to a list
print(type(country))
len(country)


<class 'list'>


38

### We want to see if there are any null values within the data and analyze them to see whether they should be deleted or changed, or a mix of both

In [37]:
online_retail[online_retail.isnull().any(axis=1)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,2011-12-09 10:26:00,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,2011-12-09 10:26:00,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,2011-12-09 10:26:00,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,2011-12-09 10:26:00,10.79,,United Kingdom


In [38]:
#Interestingly 135080 rows/541909 rows have null values. There is a lot of cleaning to be done.
#Lets check what columns have null values
online_retail.isnull().any()

InvoiceNo      False
StockCode      False
Description     True
Quantity       False
InvoiceDate    False
UnitPrice      False
CustomerID      True
Country        False
dtype: bool

In [39]:
#Here we can see that the focus is to be placed on the Description and CustomerID column.
#From this its possible to infer that maybe its orders that never went through or had errors. Lets check to see the counts of null values for each
online_retail['Description'].isnull().sum()


1454

In [40]:
#Same for customerID
online_retail['CustomerID'].isnull().sum()

135080

## Overall more customerID rows are null in comparison to description
### Additionally, I've noticed that there are rows of data where the unit price is listed as 0.
- This isn't null, but to me I view it as that. Therefore, I will clean this data to remove rows with these conditions.

In [41]:
#First find all the rows
cust_desc_errors = online_retail[online_retail['Description'].isnull() & online_retail['CustomerID'].isnull()]
cust_desc_errors


#new_online_retail = online_retail[online_retail[[cust_desc_errors & ]]]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom
1970,536545,21134,,1,2010-12-01 14:32:00,0.0,,United Kingdom
1971,536546,22145,,1,2010-12-01 14:33:00,0.0,,United Kingdom
1972,536547,37509,,1,2010-12-01 14:33:00,0.0,,United Kingdom
1987,536549,85226A,,1,2010-12-01 14:34:00,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535322,581199,84581,,-2,2011-12-07 18:26:00,0.0,,United Kingdom
535326,581203,23406,,15,2011-12-07 18:31:00,0.0,,United Kingdom
535332,581209,21620,,6,2011-12-07 18:35:00,0.0,,United Kingdom
536981,581234,72817,,27,2011-12-08 10:33:00,0.0,,United Kingdom


In [44]:
#Now we delet those rows which is essentially the opposite of our previous operation but we change from using & and use the or condition
new_online_retail = online_retail[online_retail['Description'].notnull() | online_retail['CustomerID'].notnull()]
new_online_retail

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


### We now have a slightly more cleaned up df which now has the issue with the null descriptions gone.
### We'll now sort the issue with the null customerID values. Note that its possible they may be guest purchases 

In [46]:
new_online_retail.isnull().any()

InvoiceNo      False
StockCode      False
Description    False
Quantity       False
InvoiceDate    False
UnitPrice      False
CustomerID      True
Country        False
dtype: bool

In [49]:
#Going forward, I'm gonna conduct some tests on other columns to find places where there may be errors like a negative quantity and also remove them as well
new_online_retail = new_online_retail[new_online_retail['Quantity'] > 0]
new_online_retail.shape

(530693, 8)

In [50]:
#Next I'll check the unique values in each column that we arent sure of if there are valid values for
new_online_retail['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Israel', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [51]:
#Quick check on the uncspecified country to see if anything is wrong from first glance
new_online_retail[new_online_retail['Country'].str.contains('Unspecified')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
152712,549687,20685,DOORMAT RED RETROSPOT,2,2011-04-11 13:29:00,7.95,12363.0,Unspecified
152713,549687,22691,DOORMAT WELCOME SUNRISE,2,2011-04-11 13:29:00,7.95,12363.0,Unspecified
152714,549687,48116,DOORMAT MULTICOLOUR STRIPE,2,2011-04-11 13:29:00,7.95,12363.0,Unspecified
152715,549687,21213,PACK OF 72 SKULL CAKE CASES,24,2011-04-11 13:29:00,0.55,12363.0,Unspecified
152716,549687,21977,PACK OF 60 PINK PAISLEY CAKE CASES,24,2011-04-11 13:29:00,0.55,12363.0,Unspecified
...,...,...,...,...,...,...,...,...
498778,578539,22560,TRADITIONAL MODELLING CLAY,24,2011-11-24 14:55:00,1.25,,Unspecified
498779,578539,23570,TRADITIONAL PICK UP STICKS GAME,12,2011-11-24 14:55:00,1.25,,Unspecified
498780,578539,23571,TRADITIONAL NAUGHTS & CROSSES,12,2011-11-24 14:55:00,1.65,,Unspecified
498781,578539,84992,72 SWEETHEART FAIRY CAKE CASES,24,2011-11-24 14:55:00,0.55,,Unspecified


In [69]:
#All the countries have no issues, however, there is EIRE which is actually Ireland and RSA which is actual South Africa.
#Therefore I will replace it in the df so it makes more sense to someone who checks the new df and any visualizations on countries
new_online_retail['Country'].replace({'EIRE': 'Ireland', 'RSA': 'South Africa'}, inplace=True) #inplace=True just modifies the df rather than creating a new one if it was false
new_online_retail

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_online_retail['Country'].replace({'EIRE': 'Ireland', 'RSA': 'South Africa'}, inplace=True) #inplace=True just modifies the df rather than creating a new one if it was false


In [78]:
#Next there should be a match between the unique values
unique_stock_code = new_online_retail['StockCode'].unique()
# Define the regular expression pattern
pattern = r'^[^\d]*$'
matching_values = unique_stock_code[pd.Series(unique_stock_code).str.contains(pattern, regex=True)]
matching_values

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

### Focus on creating some new dataframes for storing pieces of the data

#### We will start with simple analysis of the items that were sold

In [8]:
#Item dataframe. Remove duplicates of items
item = online_retail[['StockCode', 'Description','UnitPrice']].drop_duplicates('StockCode')
item.shape


(4070, 3)

In [9]:
#Small check on our new df to see if there is any null data and there is
item.isnull().any()

StockCode      False
Description     True
UnitPrice      False
dtype: bool

In [13]:
#Now, I want to see more into what comes with these null descriptions and we get to see something very unique
null_values = item[item['Description'].isnull()]
null_values.head()

Unnamed: 0,StockCode,Description,UnitPrice
1970,21134,,0.0
1971,22145,,0.0
1972,37509,,0.0
1987,85226A,,0.0
1988,85044,,0.0
2024,20950,,0.0
2025,37461,,0.0
2026,84670,,0.0
4347,84952C,,0.0
7187,35951,,0.0


In [52]:
#From here we can see that it is possible these are mistakes or errors with orders.
#Therefore I'll conduct a test on the original dataset to see more into it
online_retail[online_retail['StockCode'] == 21763]
online_retail[online_retail['Description'].isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
106,536381,22139,RETROSPOT TEA SET CERAMIC 11 PC,23,2010-12-01 09:41:00,4.25,15311.0,United Kingdom
622,536414,22139,,56,2010-12-01 11:52:00,0.00,,United Kingdom
6392,536942,22139,amazon,15,2010-12-03 12:08:00,0.00,,United Kingdom
6885,536982,22139,RETROSPOT TEA SET CERAMIC 11 PC,10,2010-12-03 14:27:00,11.02,,United Kingdom
7203,537011,22139,,-5,2010-12-03 15:38:00,0.00,,United Kingdom
...,...,...,...,...,...,...,...,...
538411,581405,22139,RETROSPOT TEA SET CERAMIC 11 PC,1,2011-12-08 13:50:00,4.95,13521.0,United Kingdom
539531,581439,22139,RETROSPOT TEA SET CERAMIC 11 PC,1,2011-12-08 16:30:00,10.79,,United Kingdom
540441,581486,22139,RETROSPOT TEA SET CERAMIC 11 PC,6,2011-12-09 09:38:00,4.95,17001.0,United Kingdom
541387,581498,22139,RETROSPOT TEA SET CERAMIC 11 PC,2,2011-12-09 10:26:00,10.79,,United Kingdom


In [None]:
# We get to see that the NaN values are orders where the quantity was negative. So we will now clean up the rest of our dataset.
#

In [108]:
#For our analysis, we do not want to keep any null data, but we also noticed that the stock codes arent null and same with the price
#There is probably an error but we still want to keep this within or dataset so we will create a placeholder for null values
item['Description'].fillna('')
item.isnull().any()


StockCode      False
Description     True
UnitPrice      False
dtype: bool

In [37]:
#Customer info table
customer_info = online_retail[['CustomerID', 'InvoiceNo','Country']]
customer_info.head() #note this table will be fixed up a bit later

Unnamed: 0,CustomerID,InvoiceNo,Country
0,17850.0,536365,United Kingdom
1,17850.0,536365,United Kingdom
2,17850.0,536365,United Kingdom
3,17850.0,536365,United Kingdom
4,17850.0,536365,United Kingdom
