In [1]:
# Import our dependencies
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [2]:
# Read the data
retail_df = pd.read_excel("Online_Retail.xlsx")
RFM_df = pd.read_csv("RFM.csv")

In [3]:
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [4]:
RFM_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4339 entries, 0 to 4338
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CustomerID             4339 non-null   int64  
 1   Frequency              4339 non-null   int64  
 2   MonetaryValue          4339 non-null   float64
 3   UnitPrice              4339 non-null   float64
 4   Quantity               4339 non-null   float64
 5   Recency                4339 non-null   int64  
 6   RecencyScore           4339 non-null   int64  
 7   FrequencyScore         4339 non-null   int64  
 8   MonetaryScore          4339 non-null   int64  
 9   RFM_Score              4339 non-null   int64  
 10  Value Segment          4339 non-null   object 
 11  RFM Customer Segments  4339 non-null   object 
dtypes: float64(3), int64(7), object(2)
memory usage: 406.9+ KB


In [5]:
# Droping rows having missing values

retail_df = retail_df.dropna()
retail_df.shape

(406829, 8)

In [6]:
# Drop rows with "POST" in the StockCode column
retail_df = retail_df[retail_df['StockCode'] != 'POST']
retail_df['StockCode'] = retail_df['StockCode'].astype(str)

# Drop rows where InvoiceNo column starts with "C"
retail_df = retail_df[~retail_df['InvoiceNo'].astype(str).str.startswith('C')]

In [7]:
# Convert the Customer ID field to remove the 0s
retail_df['CustomerID'] = retail_df['CustomerID'].astype(int)
retail_df['CustomerID'] = retail_df['CustomerID'].astype(str)

# Convert 'InvoiceDate' column to datetime data type
retail_df['InvoiceDate'] = pd.to_datetime(retail_df['InvoiceDate'])

# Verify the data types of the DataFrame
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396825 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    396825 non-null  object        
 1   StockCode    396825 non-null  object        
 2   Description  396825 non-null  object        
 3   Quantity     396825 non-null  int64         
 4   InvoiceDate  396825 non-null  datetime64[ns]
 5   UnitPrice    396825 non-null  float64       
 6   CustomerID   396825 non-null  object        
 7   Country      396825 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 27.2+ MB


In [8]:
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [9]:
#add new column Total Price
retail_df['TotalPrice'] = retail_df['Quantity'] * retail_df['UnitPrice']

In [10]:
# Convert 'InvoiceDate' column to datetime format if it's not already
retail_df['InvoiceDate'] = pd.to_datetime(retail_df['InvoiceDate'])

# Extract month and week from 'InvoiceDate' and create new columns
retail_df['InvoiceMonth'] = retail_df['InvoiceDate'].dt.month
retail_df['InvoiceDay'] = retail_df['InvoiceDate'].dt.day_of_week

# Display the DataFrame with the new invoice columns
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,InvoiceMonth,InvoiceDay
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,12,2
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,2
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,12,2
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,2
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,2


In [11]:
# Change the Customer ID field into string to match retail_df
RFM_df['CustomerID'] = RFM_df['CustomerID'].astype(str)

In [12]:
# Merge the two dataframes
RFM_merged_df = pd.merge(retail_df, RFM_df, on=['CustomerID'], how='inner')
RFM_merged_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity_x,InvoiceDate,UnitPrice_x,CustomerID,Country,TotalPrice,InvoiceMonth,...,MonetaryValue,UnitPrice_y,Quantity_y,Recency,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score,Value Segment,RFM Customer Segments
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,12,...,5391.21,3.96037,5.835017,373,1,1,1,3,Low-Value,Gone
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,...,5391.21,3.96037,5.835017,373,1,1,1,3,Low-Value,Gone
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,12,...,5391.21,3.96037,5.835017,373,1,1,1,3,Low-Value,Gone
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,...,5391.21,3.96037,5.835017,373,1,1,1,3,Low-Value,Gone
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,12,...,5391.21,3.96037,5.835017,373,1,1,1,3,Low-Value,Gone


In [13]:
# Double check the shape and columns of merged dataframe
print(RFM_merged_df.shape)
print(RFM_merged_df.columns)

(396825, 22)
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity_x', 'InvoiceDate',
       'UnitPrice_x', 'CustomerID', 'Country', 'TotalPrice', 'InvoiceMonth',
       'InvoiceDay', 'Frequency', 'MonetaryValue', 'UnitPrice_y', 'Quantity_y',
       'Recency', 'RecencyScore', 'FrequencyScore', 'MonetaryScore',
       'RFM_Score', 'Value Segment', 'RFM Customer Segments'],
      dtype='object')


In [14]:
# Narrow down the dataframe for feature engineering
customer_evaluation_df = RFM_merged_df[['StockCode', 'Quantity_x', 'UnitPrice_x', 'TotalPrice', 'CustomerID', 'Country',
                              'InvoiceMonth', 'InvoiceDay', 'Value Segment']]
customer_evaluation_df.head()

Unnamed: 0,StockCode,Quantity_x,UnitPrice_x,TotalPrice,CustomerID,Country,InvoiceMonth,InvoiceDay,Value Segment
0,85123A,6,2.55,15.3,17850,United Kingdom,12,2,Low-Value
1,71053,6,3.39,20.34,17850,United Kingdom,12,2,Low-Value
2,84406B,8,2.75,22.0,17850,United Kingdom,12,2,Low-Value
3,84029G,6,3.39,20.34,17850,United Kingdom,12,2,Low-Value
4,84029E,6,3.39,20.34,17850,United Kingdom,12,2,Low-Value


In [15]:
# Transform the Value Segment column to binary where 0 is Low-value while 1 is Mid- to High-value
customer_evaluation_df.loc[ customer_evaluation_df['Value Segment'] == 'Low-Value', 'Value Segment'] = 0
customer_evaluation_df.loc[ customer_evaluation_df['Value Segment'] == 'Mid-Value', 'Value Segment'] = 1
customer_evaluation_df.loc[ customer_evaluation_df['Value Segment'] == 'High-Value', 'Value Segment'] = 1
customer_evaluation_df.sample(5)

Unnamed: 0,StockCode,Quantity_x,UnitPrice_x,TotalPrice,CustomerID,Country,InvoiceMonth,InvoiceDay,Value Segment
342598,84988,12,1.45,17.4,12885,United Kingdom,10,4,1
211275,23301,3,1.65,4.95,18125,United Kingdom,11,3,0
82305,20713,20,2.08,41.6,15078,United Kingdom,11,0,0
81778,21739,1,2.95,2.95,17596,United Kingdom,7,1,0
104021,22970,3,2.55,7.65,15529,United Kingdom,11,3,0


In [16]:
#customer_evaluation_df['CustomerID'] = customer_evaluation_df['CustomerID'].astype(int)
customer_evaluation_df['Value Segment'] = customer_evaluation_df['Value Segment'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_evaluation_df['Value Segment'] = customer_evaluation_df['Value Segment'].astype(int)


In [17]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
customer_evaluation_df['Country'] = encoder.fit_transform(customer_evaluation_df['Country'])
customer_evaluation_df['StockCode'] = encoder.fit_transform(customer_evaluation_df['StockCode'])
customer_evaluation_df['CustomerID'] = encoder.fit_transform(customer_evaluation_df['CustomerID'])
customer_evaluation_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_evaluation_df['Country'] = encoder.fit_transform(customer_evaluation_df['Country'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_evaluation_df['StockCode'] = encoder.fit_transform(customer_evaluation_df['StockCode'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_evalua

Unnamed: 0,StockCode,Quantity_x,UnitPrice_x,TotalPrice,CustomerID,Country,InvoiceMonth,InvoiceDay,Value Segment
0,3233,6,2.55,15.3,4017,35,12,2,0
1,2643,6,3.39,20.34,4017,35,12,2,0
2,2847,8,2.75,22.0,4017,35,12,2,0
3,2795,6,3.39,20.34,4017,35,12,2,0
4,2794,6,3.39,20.34,4017,35,12,2,0


In [18]:
# Split target column from dataset
y = customer_evaluation_df['Value Segment'].values.reshape(-1, 1)
X = customer_evaluation_df.drop(columns='Value Segment')

In [19]:
# Preview the feature data
X.head()

Unnamed: 0,StockCode,Quantity_x,UnitPrice_x,TotalPrice,CustomerID,Country,InvoiceMonth,InvoiceDay
0,3233,6,2.55,15.3,4017,35,12,2
1,2643,6,3.39,20.34,4017,35,12,2
2,2847,8,2.75,22.0,4017,35,12,2
3,2795,6,3.39,20.34,4017,35,12,2
4,2794,6,3.39,20.34,4017,35,12,2


In [20]:
# Preview the target data
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [21]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
print(f"X_train shape = {X_train.shape}")
print(f"X_train shape = {X_test.shape}")

X_train shape = (297618, 8)
X_train shape = (99207, 8)


# Logistical Regression Model

In [23]:
# Creat a Logistic Regression Model
lr_model = LogisticRegression(random_state=42)
lr_model

In [24]:
# Train the data
lr_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Predict outcomes for test data set
predictions = lr_model.predict(X_test)

In [26]:
# Print classification report
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.96      0.70      0.81     92623
           1       0.12      0.56      0.19      6584

    accuracy                           0.69     99207
   macro avg       0.54      0.63      0.50     99207
weighted avg       0.90      0.69      0.77     99207



# KNeighbors Classifier Model

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distributions to sample from
param_dist = {'n_neighbors': randint(1, 10), 'weights': ['uniform', 'distance']}

# Create a KNN classifier
knn = KNeighborsClassifier()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=5, cv=5)

# Fit the RandomizedSearchCV object to find the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions using the best model
y_pred = best_model.predict(X_test)

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best Hyperparameters: {'n_neighbors': 1, 'weights': 'uniform'}


In [28]:
# Create and fit a KNN model
knn = KNeighborsClassifier(n_neighbors=2)  # Define the number of neighbors
knn.fit(X_train, y_train)

# Make predictions on the test data
knn_predictions = knn.predict(X_test)

# Evaluate the model
knn_accuracy = accuracy_score(y_test, knn_predictions)
print("Accuracy:", knn_accuracy)
print(classification_report(knn_predictions,y_test))

  return self._fit(X, y)


Accuracy: 0.8482163557007066
              precision    recall  f1-score   support

           0       0.96      0.84      0.90     77484
           1       0.61      0.87      0.71     21723

    accuracy                           0.85     99207
   macro avg       0.78      0.86      0.81     99207
weighted avg       0.88      0.85      0.86     99207



# Neural Network Model

In [29]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [44]:
# Create a Keras Sequential model and add more than one Dense hidden layer
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=16, activation="relu", input_dim=8))

nn_model.add(tf.keras.layers.Dense(units=40, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=40, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

In [46]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 570us/step - accuracy: 0.7978 - loss: 0.4092
Epoch 2/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 647us/step - accuracy: 0.7968 - loss: 0.4110
Epoch 3/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 565us/step - accuracy: 0.7969 - loss: 0.4099
Epoch 4/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 556us/step - accuracy: 0.7962 - loss: 0.4099
Epoch 5/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 566us/step - accuracy: 0.7968 - loss: 0.4089
Epoch 6/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 569us/step - accuracy: 0.7978 - loss: 0.4078
Epoch 7/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 551us/step - accuracy: 0.7969 - loss: 0.4074
Epoch 8/50
[1m9301/9301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 554us/step - accuracy: 0.7972 - loss: 0.4076
Epoch 9/

In [47]:
# Evaluate the performance of model using the loss and predictive accuracy of the model on the test dataset.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3101/3101 - 1s - 434us/step - accuracy: 0.8047 - loss: 0.3958
Loss: 0.3958050608634949, Accuracy: 0.8047214150428772
