# CUSTOMER SEGMENTATION

In [2]:
# Imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, confusion_matrix
import joblib

In [3]:
# Load the first retail workbook (path is relative to the working directory)
# into `df` and preview its first rows.
df = pd.read_excel("online_retail.xlsx")
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [6]:
# Preview the last rows of the first workbook.
df.tail()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [5]:
# Load the second retail workbook (path is relative to the working directory)
# into `df2` and preview its first rows.
df2 = pd.read_excel("online_retail_II.xlsx")
df2.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [7]:
# Preview the last rows of the second workbook.
df2.tail()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,17530.0,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,2010-12-09 20:01:00,1.95,17530.0,United Kingdom


In [8]:
# Column dtypes and non-null counts for the first workbook.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [9]:
# Column dtypes and non-null counts for the second workbook.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [11]:
# Number of missing values in each column of the first workbook.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [12]:
# Number of missing values in each column of the second workbook.
df2.isna().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

In [14]:
# Keep only the rows whose CustomerID is missing and inspect the first ten.
# NOTE(review): `x` is an uninformative module-level name; consider renaming it
# once it is confirmed that no later cell depends on it.
x = df[df["CustomerID"].isna()]
x.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
1447,536544,21790,VINTAGE SNAP CARDS,9,2010-12-01 14:32:00,1.66,,United Kingdom
1448,536544,21791,VINTAGE HEADS AND TAILS CARD GAME,2,2010-12-01 14:32:00,2.51,,United Kingdom
1449,536544,21801,CHRISTMAS TREE DECORATION WITH BELL,10,2010-12-01 14:32:00,0.43,,United Kingdom
1450,536544,21802,CHRISTMAS TREE HEART DECORATION,9,2010-12-01 14:32:00,0.43,,United Kingdom
1451,536544,21803,CHRISTMAS TREE STAR DECORATION,11,2010-12-01 14:32:00,0.43,,United Kingdom


In [17]:
# Number of distinct values in each column of the first workbook.
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

In [16]:
# Distinct country labels present in the first workbook.
df["Country"].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [35]:
# Collect, for every InvoiceNo, the array of distinct CustomerID values found
# on its rows. NOTE: despite its name, `is_unique` holds arrays of IDs rather
# than booleans; nothing here actually verifies that each invoice has a single
# CustomerID.
is_unique = df.groupby("InvoiceNo")["CustomerID"].unique()
is_unique

InvoiceNo
536365     [17850.0]
536366     [17850.0]
536367     [13047.0]
536368     [13047.0]
536369     [13047.0]
             ...    
C581484    [16446.0]
C581490    [14397.0]
C581499    [15498.0]
C581568    [15311.0]
C581569    [17315.0]
Name: CustomerID, Length: 25900, dtype: object

In [37]:
def detailed_invoice_customer_analysis(df):
    """Report how many invoices are associated with more than one customer.

    Prints the total number of distinct invoices, the number of invoices that
    carry more than one distinct CustomerID and, for each such invoice, the
    CustomerID values found on its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``InvoiceNo`` and ``CustomerID``.

    Returns
    -------
    pandas.Series
        Number of distinct CustomerIDs per InvoiceNo, indexed by InvoiceNo.
        Missing CustomerIDs are not counted (pandas ``nunique`` default), so an
        invoice whose rows all lack a CustomerID has a count of 0.
    """
    # Total number of invoices
    total_invoices = df['InvoiceNo'].nunique()

    # Distinct CustomerIDs per invoice
    invoice_customer_counts = df.groupby("InvoiceNo")["CustomerID"].nunique()

    # Invoices with multiple CustomerIDs
    multi_customer_invoices = invoice_customer_counts[invoice_customer_counts > 1]

    # Detailed report
    print("Invoice-Customer Uniqueness Analysis:")
    print(f"Total Unique Invoices: {total_invoices}")
    print(f"Invoices with Multiple CustomerIDs: {len(multi_customer_invoices)}")

    if len(multi_customer_invoices) > 0:
        print("\nDetailed Breakdown:")
        # Gather the CustomerIDs of all problematic invoices in one pass rather
        # than re-scanning the whole frame once per invoice.
        offending_rows = df[df['InvoiceNo'].isin(multi_customer_invoices.index)]
        customers_by_invoice = (
            offending_rows.groupby('InvoiceNo', sort=False)['CustomerID'].unique()
        )
        for invoice, count in multi_customer_invoices.items():
            customers = customers_by_invoice.loc[invoice]
            print(f"InvoiceNo: {invoice}, Unique CustomerIDs: {count}")
            print(f"CustomerIDs: {customers}\n")

    return invoice_customer_counts

# Run the detailed analysis on the first workbook; the returned per-invoice
# CustomerID counts are kept in `analysis_results`.
analysis_results = detailed_invoice_customer_analysis(df)

Invoice-Customer Uniqueness Analysis:
Total Unique Invoices: 25900
Invoices with Multiple CustomerIDs: 0


In [43]:
import pandas as pd
import numpy as np

def find_invoicenos_with_null_customerid(df):
    """Return the distinct InvoiceNo values that have a row with no CustomerID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``InvoiceNo`` and ``CustomerID``.

    Returns
    -------
    numpy.ndarray
        Distinct InvoiceNo values, in order of first appearance, taken from
        the rows whose CustomerID is missing.
    """
    customer_missing = df['CustomerID'].isna()
    return df.loc[customer_missing, 'InvoiceNo'].unique()

def check_invoiceno_customerid_uniqueness(df):
    """Return the invoices that are linked to more than one CustomerID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``InvoiceNo`` and ``CustomerID``.

    Returns
    -------
    pandas.Series
        Distinct-CustomerID count per InvoiceNo, restricted to invoices whose
        count is greater than one. Empty when every invoice has at most one
        distinct CustomerID.
    """
    distinct_ids = df.groupby('InvoiceNo')['CustomerID'].nunique()
    has_conflict = distinct_ids.gt(1)
    return distinct_ids.loc[has_conflict]

def autofill_null_customerid(df):
    """Fill missing CustomerIDs from other rows of the same invoice.

    A missing CustomerID is filled only when the other rows of the same
    InvoiceNo agree on exactly one non-null CustomerID. Invoices with no known
    CustomerID, or with several conflicting ones, are left untouched rather
    than being assigned an arbitrary customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``InvoiceNo`` and ``CustomerID``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the fillable CustomerID gaps filled.
    """
    # Work on a copy so the caller's frame stays intact
    df_filled = df.copy()

    # Known CustomerIDs grouped by invoice
    known_ids = df_filled.dropna(subset=['CustomerID']).groupby('InvoiceNo')['CustomerID']

    # Keep only invoices whose known rows agree on a single CustomerID;
    # `.first()` alone would silently pick one of several conflicting IDs.
    first_id = known_ids.first()
    invoiceno_customerid = first_id[known_ids.nunique() == 1]

    # Compute the null mask once and reuse it on both sides of the assignment
    missing = df_filled['CustomerID'].isna()
    df_filled.loc[missing, 'CustomerID'] = df_filled.loc[missing, 'InvoiceNo'].map(invoiceno_customerid)

    return df_filled

# Apply the three helpers to the first workbook (`df`).

# Step 1: which invoices contain rows without a CustomerID?
null_customerid_invoicenos = find_invoicenos_with_null_customerid(df)
print("InvoiceNo with null CustomerID:", null_customerid_invoicenos)

# Step 2: how many invoices are tied to more than one distinct CustomerID?
multi_customerid_invoicenos = check_invoiceno_customerid_uniqueness(df)
print(
    "\nInvoiceNo with multiple CustomerIDs:",
    len(multi_customerid_invoicenos),
    sep="\n",
)

# Step 3: try to fill the gaps from sibling rows of the same invoice and
# compare the number of missing CustomerIDs before and after.
df_filled = autofill_null_customerid(df)
print("\nNull CustomerID before filling:", df['CustomerID'].isna().sum())
print("Null CustomerID after filling:", df_filled['CustomerID'].isna().sum())

InvoiceNo with null CustomerID: [536414 536544 536545 ... 581492 581497 581498]

InvoiceNo with multiple CustomerIDs:
0

Null CustomerID before filling: 135080
Null CustomerID after filling: 135080


In [40]:
def check_invoiceno_customerid_uniqueness(df):
    """List the invoices whose rows reference more than one CustomerID.

    NOTE: this repeats the definition of the same name from an earlier cell of
    this notebook; it is kept here so the cell can run on its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``InvoiceNo`` and ``CustomerID``.

    Returns
    -------
    pandas.Series
        Distinct-CustomerID count per InvoiceNo for the invoices whose count
        exceeds one; empty when there are none.
    """
    customers_per_invoice = df.groupby('InvoiceNo')['CustomerID']
    id_counts = customers_per_invoice.nunique()
    return id_counts[id_counts > 1]

# Run the check on the first workbook; an empty result means no invoice is
# tied to more than one distinct CustomerID.
check_invoiceno_customerid_uniqueness(df)

Series([], Name: CustomerID, dtype: int64)