In [7]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# load the dataset
df_transactions = pd.read_csv('synthetic_transactions_with_null.csv', header=0)

# count number of missing values
print('Count number of missing values')
print(df_transactions.isna().sum())

Count number of missing values
CustomerID       0
InvoiceNo        9
UnitPrice        0
Quantity       410
InvoiceDate     16
dtype: int64


In [25]:
# get a summary of the dataframe with not-null-count for each column
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CustomerID   1000 non-null   int64  
 1   InvoiceNo    991 non-null    float64
 2   UnitPrice    1000 non-null   float64
 3   Quantity     590 non-null    float64
 4   InvoiceDate  984 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 39.2+ KB


In [9]:

# show records with missing value
print('Records with missing unit-price')
df_missing = df_transactions[df_transactions['UnitPrice'].isna() == True]
print(df_missing)

# impute missing values with median of that column
print('Impute missing values with median of that column')
df_imputed = df_missing.fillna(df_transactions['UnitPrice'].median())
print(df_imputed)

Records with missing unit-price
Empty DataFrame
Columns: [CustomerID, InvoiceNo, UnitPrice, Quantity, InvoiceDate, Country]
Index: []
Impute missing values with median of that column
Empty DataFrame
Columns: [CustomerID, InvoiceNo, UnitPrice, Quantity, InvoiceDate, Country]
Index: []


### Fake Data

In [26]:
import pandas as pd
import numpy as np
import datetime

# Generate synthetic data
n_samples = 1000

# Customer IDs
customer_ids = np.arange(1, n_samples + 1)

# Invoice numbers
invoice_numbers = np.arange(100001, 100001 + n_samples)

# Unit prices (randomly sampled from a normal distribution)
unit_prices = np.random.normal(loc=10, scale=2, size=n_samples)

# Quantities (positively correlated with unit prices)
quantities = np.round(50 - 2 * unit_prices + np.random.normal(scale=5, size=n_samples))

# Invoice dates (simulated time series)
start_date = datetime.datetime(2022, 1, 1)
end_date = datetime.datetime(2022, 12, 31)
invoice_dates = np.random.choice(pd.date_range(start_date, end_date, freq='D'), size=n_samples)

# Create the DataFrame
df_transactions = pd.DataFrame({
    'CustomerID': customer_ids,
    'InvoiceNo': invoice_numbers,
    'UnitPrice': unit_prices,
    'Quantity': quantities,
    'InvoiceDate': invoice_dates
})

# Introduce random missing values
missing_prob = 0.15  # 10% probability of missing values
rows_to_set_missing = np.random.choice(df_transactions.index, size=int(missing_prob * len(df_transactions)), replace=False)
cols_to_set_missing = np.random.choice(df_transactions.columns[1:], size=int(missing_prob * (len(df_transactions.columns) - 1)), replace=False)
df_transactions.loc[rows_to_set_missing, cols_to_set_missing] = None

# Save to CSV
df_transactions.to_csv('synthetic_transactions_with_null.csv', index=False)


### OLD

In [24]:
# show those records with missing value
print('Show those records with missing value')
df_missing = df_transactions[df_transactions['UnitPrice'].isna() == True]
print(df_missing)


Show those records with missing value
Empty DataFrame
Columns: [InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country]
Index: []


In [25]:
# impute missing values with median of that column
print('Impute missing values with median of that column')
df_imputed = df_missing.fillna(df_transactions['UnitPrice'].median())
print(df_imputed)


Impute missing values with median of that column
Empty DataFrame
Columns: [InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country]
Index: []


In [6]:
# load the dataset
df_transactions = pd.read_csv('sample_transactions.csv', header=0)

# calculate total sale in a new column
df_transactions['TotalSale'] = df_transactions['UnitPrice'] * df_transactions['Quantity']

# aggregate total sale per customer
df_summary = df_transactions.groupby('CustomerID').agg({'TotalSale': 'sum', 'InvoiceDate':'first'})

print('Total sale per customer:')
df_summary.head()

Total sale per customer:


Unnamed: 0_level_0,TotalSale,InvoiceDate
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,772.925226,2022-01-23
2,572.898079,2022-08-19
3,627.359128,2022-11-20
4,520.55826,2022-04-27
5,662.582657,2022-02-25


In [18]:
df_transactions.describe()

Unnamed: 0,InvoiceNo,Quantity,UnitPrice,CustomerID,TotalSale
count,397884.0,397884.0,397884.0,397884.0,397884.0
mean,560616.934451,12.988238,3.116488,15294.423453,22.397
std,13106.117773,179.331775,22.097877,1713.14156,309.071041
min,536365.0,1.0,0.001,12346.0,0.001
25%,549234.0,2.0,1.25,13969.0,4.68
50%,561893.0,6.0,1.95,15159.0,11.8
75%,572090.0,12.0,3.75,16795.0,19.8
max,581587.0,80995.0,8142.75,18287.0,168469.6


In [19]:
# Accessing single column
col = df_transactions['InvoiceNo']
print('data type:', type(col))

data type: <class 'pandas.core.series.Series'>


In [20]:
# count of unique customer
customer_count = df_transactions['CustomerID'].nunique()
print(f"The number of unique customers is: {customer_count}")

The number of unique customers is: 4338


In [21]:
# Show the top 10 customers based on the number of transactions
plt.figure(figsize=(10, 6))
top_10_customers = customer_count[:10]
top_10_customers.plot(kind='bar')
plt.xlabel('Count')
plt.ylabel('CustomerID')
plt.show()

TypeError: 'int' object is not subscriptable

<Figure size 1000x600 with 0 Axes>

In [13]:
# Filtering
result = df_transactions.query('Quantity >= 20 & Country == "United Kingdom"')
result.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
2,548648,71459,HANGING JAM JAR T-LIGHT HOLDER,24,2020-04-01 13:20:00,0.85,12949,United Kingdom
5,565836,85152,HAND OVER THE CHOCOLATE SIGN,48,2020-09-07 11:41:00,1.85,17511,United Kingdom
23,575622,23309,SET OF 60 I LOVE LONDON CAKE CASES,24,2020-11-10 13:13:00,0.55,16743,United Kingdom
33,555352,21242,RED RETROSPOT PLATE,96,2020-06-02 12:46:00,1.45,15838,United Kingdom
35,542373,21058,PARTY INVITES WOODLAND,24,2020-01-27 13:38:00,0.85,16767,United Kingdom


In [15]:
mask = df_transactions['Quantity'] >= 20
print('Step 1: masking')
print(mask)

Step 1: masking
0         False
1         False
2          True
3         False
4         False
          ...  
397879     True
397880     True
397881    False
397882    False
397883    False
Name: Quantity, Length: 397884, dtype: bool


In [17]:
result = df_transactions[mask]
print('\nStep 2: filter with mask')
result.head()


Step 2: filter with mask


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
2,548648,71459,HANGING JAM JAR T-LIGHT HOLDER,24,2020-04-01 13:20:00,0.85,12949,United Kingdom
5,565836,85152,HAND OVER THE CHOCOLATE SIGN,48,2020-09-07 11:41:00,1.85,17511,United Kingdom
21,573511,22195,LARGE HEART MEASURING SPOONS,24,2020-10-31 12:25:00,1.65,12347,Iceland
23,575622,23309,SET OF 60 I LOVE LONDON CAKE CASES,24,2020-11-10 13:13:00,0.55,16743,United Kingdom
33,555352,21242,RED RETROSPOT PLATE,96,2020-06-02 12:46:00,1.45,15838,United Kingdom
