In [1]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### Load the dataset

In [2]:
# load the dataset
df_transactions = pd.read_csv('data/sample_transactions_with_missing_value.csv', header=0)

# count number of missing values
print('Count number of missing values')
print(df_transactions.isna().sum())

Count number of missing values
CustomerID       0
InvoiceNo      250
UnitPrice       33
Quantity         0
InvoiceDate     40
dtype: int64


In [3]:
# get a summary of the dataframe with not-null-count for each column
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CustomerID   1000 non-null   int64  
 1   InvoiceNo    750 non-null    float64
 2   UnitPrice    967 non-null    float64
 3   Quantity     1000 non-null   float64
 4   InvoiceDate  960 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 39.2+ KB


### Handling missing values

In [4]:
# show records with missing value
print('Records with missing unit-price')
df_missing = df_transactions[df_transactions['UnitPrice'].isna() == True]
df_missing.head()


Records with missing unit-price


Unnamed: 0,CustomerID,InvoiceNo,UnitPrice,Quantity,InvoiceDate
2,3,100003.0,,34.0,2022-04-21
3,4,100004.0,,40.0,2022-08-17
4,5,100005.0,,38.0,2022-10-12
10,11,100011.0,,36.0,2022-02-05
11,12,100012.0,,29.0,2022-07-03


In [5]:
# impute missing values with median of that column
print('Impute missing values with median of that column')
df_imputed = df_missing.fillna(df_transactions['UnitPrice'].median())
df_imputed.head()

Impute missing values with median of that column


Unnamed: 0,CustomerID,InvoiceNo,UnitPrice,Quantity,InvoiceDate
2,3,100003.0,9.769554,34.0,2022-04-21
3,4,100004.0,9.769554,40.0,2022-08-17
4,5,100005.0,9.769554,38.0,2022-10-12
10,11,100011.0,9.769554,36.0,2022-02-05
11,12,100012.0,9.769554,29.0,2022-07-03


### Transposing

In [6]:
# normal descriptive stats
df_transactions.describe()

Unnamed: 0,CustomerID,InvoiceNo,UnitPrice,Quantity
count,1000.0,750.0,967.0,1000.0
mean,500.5,100510.384,9.838061,30.075
std,288.819436,287.547624,2.00775,6.251276
min,1.0,100001.0,3.460791,11.0
25%,250.75,100262.25,8.442837,26.0
50%,500.5,100516.0,9.769554,30.0
75%,750.25,100761.75,11.215482,34.0
max,1000.0,100998.0,17.373629,47.0


In [7]:
# transformed descriptive stats
df_transactions.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CustomerID,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
InvoiceNo,750.0,100510.384,287.547624,100001.0,100262.25,100516.0,100761.75,100998.0
UnitPrice,967.0,9.838061,2.00775,3.460791,8.442837,9.769554,11.215482,17.373629
Quantity,1000.0,30.075,6.251276,11.0,26.0,30.0,34.0,47.0


### Grouping

In [8]:
# calculate total sale
df_imputed['TotalSale'] = df_imputed['UnitPrice'] * df_imputed['Quantity']

# aggregate total sale per customer
df_summary = df_imputed.groupby('CustomerID').agg({'TotalSale': 'sum', 'InvoiceDate':'first'})
print('Total sale per customer')
df_summary.head()

Total sale per customer


Unnamed: 0_level_0,TotalSale,InvoiceDate
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,332.164853,2022-04-21
4,390.782179,2022-08-17
5,371.24307,2022-10-12
11,351.703962,2022-02-05
12,283.31708,2022-07-03
