# Import required libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
import pickle

# About the data

Source:

Dr Daqing Chen, Director: Public Analytics group. chend '@' lsbu.ac.uk, School of Engineering, London South Bank University, London SE1 0AA, UK.




Data Set Information:

This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.




Attribute Information:

InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.

StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.

Description: Product (item) name. Nominal.

Quantity: The quantities of each product (item) per transaction. Numeric.

InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.

UnitPrice: Unit price. Numeric, Product price per unit in sterling.

CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.

Country: Country name. Nominal, the name of the country where each customer resides.




Relevant Papers:

The evolution of direct, data and digital marketing, Richard Webber, Journal of Direct, Data and Digital Marketing Practice (2013) 14, 291â€“309.

Clustering Experiments on Big Transaction Data for Market Segmentation,
Ashishkumar Singh, Grace Rumantir, Annie South, Blair Bethwaite, Proceedings of the 2014 International Conference on Big Data Science and Computing.

A decision-making framework for precision marketing, Zhen You, Yain-Whar Si, Defu Zhang, XiangXiang Zeng, Stephen C.H. Leung c, Tao Li, Expert Systems with Applications, 42 (2015) 3357â€“3367.



Citation Request:

Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197â€“208, 2012 (Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17).

In [2]:
df = pd.read_excel('Online Retail.xlsx')
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


# Data Preprocessing

### Check the data types of columns

In [3]:
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

### Drop rows containing missing values

In [4]:
df.dropna(inplace = True)

### Convert categorical variables to numerical using label encoding

In [5]:
# Create a LabelEncoder object
le = LabelEncoder()

# Fit the LabelEncoder object to the 'country' column
le.fit(df['Country'])

# Transform the 'country' column using the fitted LabelEncoder object
df['country_encoded'] = le.transform(df['Country'])

In [6]:
# Create a LabelEncoder object
le = LabelEncoder()

# The column contains 2 different data types: int & str. Needs to be one only.
df['InvoiceNo'] = df['InvoiceNo'].astype(str)

# Fit the LabelEncoder object to the 'InvoiceNo' column
le.fit(df['InvoiceNo'])

# Transform the 'country' column using the fitted LabelEncoder object
df['InvoiceNo_encoded'] = le.transform(df['InvoiceNo'])

In [7]:
# Create a LabelEncoder object
le = LabelEncoder()

# The column contains 2 different data types: int & str. Needs to be one only.
df['StockCode'] = df['StockCode'].astype(str)

# Fit the LabelEncoder object to the 'StockCode' column
le.fit(df['StockCode'])

# Transform the 'country' column using the fitted LabelEncoder object
df['StockCode_encoded'] = le.transform(df['StockCode'])

In [8]:
# Create a LabelEncoder object
le = LabelEncoder()

# The column contains 2 different data types: float, int, & str. Needs to be one only.
df['Description'] = df['Description'].astype(str)

# Fit the LabelEncoder object to the 'Description' column
le.fit(df['Description'])

# Transform the 'country' column using the fitted LabelEncoder object
df['Description_encoded'] = le.transform(df['Description'])

### Standardize numerical features

In [9]:
sc = StandardScaler()

df['Quantity'] = sc.fit_transform(df['Quantity'].values.reshape(-1, 1))
df['UnitPrice'] = sc.fit_transform(df['UnitPrice'].values.reshape(-1, 1))

### Feature selection

The selected features are the most representative of customers' behavior. Like, purchased goods' barcode, quantity, and price.

In [10]:
customer_id = df['CustomerID']

features = ['InvoiceNo_encoded', 'StockCode_encoded', 'Description_encoded', 'Quantity', 'UnitPrice']
X = df[features]

# Developing the model

### hyperparameter tuning using GridSearchCV

In [11]:
params = {
    'n_clusters': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}

kmeans = KMeans()
grid_search = GridSearchCV(kmeans, params, cv = 10)
grid_search.fit(X)

### The best parameters and score

In [12]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'n_clusters': 10}
Best score: -200769916964.94598


### Fit the KMeans model with the best parameters

In [13]:
kmeans = KMeans(n_clusters = grid_search.best_params_['n_clusters'])
kmeans.fit(X)

### Save the model

In [15]:
with open("Best_KMeans_model.pkl", 'wb') as file:
    pickle.dump(kmeans, file)

# Summarizing the results

In [17]:
results_df = pd.DataFrame(
    {
    'Customer': customer_id,
    'Predicted Segment': kmeans.labels_
    }
)

results_df

Unnamed: 0,Customer,Predicted Segment
0,17850.0,0
1,17850.0,0
2,17850.0,0
3,17850.0,0
4,17850.0,0
...,...,...
541904,12680.0,2
541905,12680.0,6
541906,12680.0,6
541907,12680.0,6


In [18]:
results_df.to_csv('Results.csv', index = False)

### Hereby, We managed to group each customer based on their purchase beahvior.

# Thanks for reading