In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_excel("Online Retail.xlsx")

In [3]:
data.shape

(541909, 8)

In [4]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

#### Preprocessing techniques

In [6]:
eda_data = data.dropna(subset=['Description'])

In [7]:
eda_data = eda_data[eda_data['Quantity'] > 0]
eda_data = eda_data[eda_data['UnitPrice'] > 0]

In [8]:
eda_data['Description'] = eda_data['Description'].apply(lambda x: re.sub(r'[^a-zA-Z0-9-\s]', '', x))

In [9]:
eda_data = eda_data.dropna(subset=['Description'])

In [10]:
eda_data.isnull().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     132220
Country             0
dtype: int64

In [11]:
eda_data['Totalsales'] = eda_data['Quantity'] * eda_data['UnitPrice']

In [12]:
eda_data['CustomerID'] = pd.to_numeric(eda_data['CustomerID'], errors='coerce').astype('Int64')

In [13]:
eda_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Totalsales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


#### Machine Learning Models

In [14]:
#### Predict Customer Purchase Probability

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.metrics import roc_auc_score

In [16]:
predict_data = eda_data.copy()

In [17]:
predict_data = predict_data.dropna(subset=['CustomerID'])

In [18]:
predict_data['RepeatPurchase'] = predict_data.duplicated(subset=['CustomerID', 'Description'], keep=False).astype(int)

#####  defining the target variable:
        1: Customer repurchased the product.
        0: Customer did not.

In [19]:
predict_data['RepeatPurchase'].value_counts()

0    203399
1    194485
Name: RepeatPurchase, dtype: int64

In [20]:
X = predict_data[['CustomerID','Description']]
y = predict_data['RepeatPurchase']

In [21]:
X['Description'] = pd.factorize(X['Description'])[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Description'] = pd.factorize(X['Description'])[0]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
algorithms = {'Logistic Regression': 
              {"model": LogisticRegression(),
               "params": {}
              },
              
              'Decision Tree': 
              {"model": tree.DecisionTreeClassifier(),
               "params": {"criterion": ['gini','entropy'],"max_depth": [1, 3, 5, 7, 9, 10],"min_samples_split": [2, 4, 6, 8, 10],"min_samples_leaf": [i for i in range(1, 10)]}
              },
              
              'Random Forest': 
              {"model": RandomForestClassifier(),
               "params": {"n_estimators": [100, 200, 300],"max_features": ["auto", "log2", "sqrt"],"max_depth": [1, 3, 5, 7, 9, 10],"min_samples_split": [2, 4, 6, 8, 10]}
              },
              
              'NaiveBayes' :
              {"model": GaussianNB(),
               "params": {}
              },
              
              'K-Nearest Neighbors' :
              {"model": KNeighborsClassifier(),
               "params": {"n_neighbors": [3,5,10],"weights": ["uniform", "distance"],"metric": ["euclidean", "manhattan","minkowski"]}
              },
              
              'Gradient Boost' :
              {"model": GradientBoostingClassifier(),
               "params": {"learning_rate": np.arange(0.1, 1, 0.1),"n_estimators": [100, 200, 300],"criterion": ['friedman_mse', 'mse'],"min_samples_split": [2, 4, 6, 8, 10],"min_samples_leaf": [i for i in range(1, 10)],"max_depth": [1, 3, 5, 7, 9, 10],"max_features": ["auto", "log2", "sqrt"]}
              }
             }

In [30]:
best_model = {}
best_model_details = []

for model_name, values in algorithms.items():
    rscv = RandomizedSearchCV(values["model"], values["params"], cv=5, n_iter=15, n_jobs=-1, verbose=0, random_state=42)
    rscv.fit(X_train, y_train)

    # Update with the actual best score
    best_score = rscv.best_score_

    best_model[model_name] = rscv
    best_model_details.append({
        "Model Name": model_name, 
        "Best Score": best_score, 
        "Best Parameters": rscv.best_params_
    })

    print(f"{model_name}: Best Score = {best_score}")

# Display the model details sorted by performance
best_model_details = sorted(best_model_details, key=lambda x: x['Best Score'], reverse=True)

# Print the results neatly
for detail in best_model_details:
    print(f"Model: {detail['Model Name']}, Best Score: {detail['Best Score']:.4f}, Best Params: {detail['Best Parameters']}")




Logistic Regression: Best Score = 0.5585833861705181
Decision Tree: Best Score = 0.6336524203385748


  warn(


Random Forest: Best Score = 0.6450910581404999




NaiveBayes: Best Score = 0.5558941561853009
K-Nearest Neighbors: Best Score = 0.7958386010566396


25 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterEr

Gradient Boost: Best Score = 0.8009028986663773
Model: Gradient Boost, Best Score: 0.8009, Best Params: {'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 'log2', 'max_depth': 10, 'learning_rate': 0.7000000000000001, 'criterion': 'friedman_mse'}
Model: K-Nearest Neighbors, Best Score: 0.7958, Best Params: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
Model: Random Forest, Best Score: 0.6451, Best Params: {'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto', 'max_depth': 10}
Model: Decision Tree, Best Score: 0.6337, Best Params: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}
Model: Logistic Regression, Best Score: 0.5586, Best Params: {}
Model: NaiveBayes, Best Score: 0.5559, Best Params: {}


In [31]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(best_model_details)

Unnamed: 0,Model Name,Best Score,Best Parameters
0,Gradient Boost,0.800903,"{'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 'log2', 'max_depth': 10, 'learning_rate': 0.7000000000000001, 'criterion': 'friedman_mse'}"
1,K-Nearest Neighbors,0.795839,"{'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}"
2,Random Forest,0.645091,"{'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto', 'max_depth': 10}"
3,Decision Tree,0.633652,"{'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}"
4,Logistic Regression,0.558583,{}
5,NaiveBayes,0.555894,{}


In [33]:
test_model = []

for model_name, model in best_model.items():
    test_model.append({"Model Name": model_name, "Test Score": model.score(X_test, y_test)})

pd.DataFrame(test_model)

Unnamed: 0,Model Name,Test Score
0,Logistic Regression,0.554117
1,Decision Tree,0.631037
2,Random Forest,0.643264
3,NaiveBayes,0.552182
4,K-Nearest Neighbors,0.82167
5,Gradient Boost,0.809794


In [34]:
train_model_metrics = []
test_model_metrics = []

for model_name, model in best_model.items():
    y_pred_train = model.predict(X_train)
    y_pred_train_probs = model.predict_proba(X_train)[:, 1]
    tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred_train).ravel()
    
    y_pred_test = model.predict(X_test)
    y_pred_test_probs = model.predict_proba(X_test)[:, 1]
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_pred_test).ravel()
    
    train_mae = mean_absolute_error(y_train, y_pred_train)
    train_mse = mean_squared_error(y_train, y_pred_train)
    train_rmse = np.sqrt(train_mse)
    train_auc = roc_auc_score(y_train, y_pred_train_probs)
    train_sensitivity = tp_train / (tp_train + fn_train)
    train_specificity = tn_train / (tn_train + fp_train)
    train_mcc = matthews_corrcoef(y_train, y_pred_train)

    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_auc = roc_auc_score(y_test, y_pred_test_probs)
    test_sensitivity = tp_test / (tp_test + fn_test)
    test_specificity = tn_test / (tn_test + fp_test)
    test_mcc = matthews_corrcoef(y_test, y_pred_test)
    
    train_model_metrics.append({"Model Name": model_name, "Mean Absolute Error": train_mae, "Mean Squared Error": train_mse, "Root Mean Squared Error": train_rmse, "AUC": train_auc, "Sensitivity": train_sensitivity, "Specificity": train_specificity, "Matthews correlation coefficient": train_mcc})
    test_model_metrics.append({"Model Name": model_name, "Mean Absolute Error": test_mae, "Mean Squared Error": test_mse, "Root Mean Squared Error": test_rmse, "AUC": test_auc, "Sensitivity": test_sensitivity, "Specificity": test_specificity, "Matthews correlation coefficient": test_mcc})
    

train_model_metrics = pd.DataFrame(train_model_metrics)
test_model_metrics = pd.DataFrame(test_model_metrics)

In [35]:
print("-------- Training Data Metrics --------")
train_model_metrics

-------- Training Data Metrics --------


Unnamed: 0,Model Name,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error,AUC,Sensitivity,Specificity,Matthews correlation coefficient
0,Logistic Regression,0.441417,0.441417,0.664392,0.585341,0.614269,0.505457,0.120372
1,Decision Tree,0.361893,0.361893,0.601575,0.699102,0.572457,0.700739,0.275633
2,Random Forest,0.346979,0.346979,0.58905,0.716874,0.569639,0.732569,0.306589
3,NaiveBayes,0.44409,0.44409,0.666401,0.585689,0.679864,0.437654,0.121006
4,K-Nearest Neighbors,0.0,0.0,0.0,1.0,1.0,1.0,1.0
5,Gradient Boost,0.094877,0.094877,0.308021,0.96818,0.91144,0.899096,0.81033


In [36]:
print("-------- Test Data Metrics --------")
test_model_metrics

-------- Test Data Metrics --------


Unnamed: 0,Model Name,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error,AUC,Sensitivity,Specificity,Matthews correlation coefficient
0,Logistic Regression,0.445883,0.445883,0.667744,0.579181,0.610196,0.500012,0.110841
1,Decision Tree,0.368963,0.368963,0.607424,0.690134,0.567126,0.692699,0.262014
2,Random Forest,0.356736,0.356736,0.597274,0.704458,0.561803,0.721859,0.287564
3,NaiveBayes,0.447818,0.447818,0.669192,0.579438,0.674532,0.434137,0.111879
4,K-Nearest Neighbors,0.17833,0.17833,0.422292,0.911382,0.952017,0.695909,0.6681
5,Gradient Boost,0.190206,0.190206,0.436126,0.888625,0.82383,0.796252,0.620067


### According to above result, i consider KNN 

In [37]:
model = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')
model.fit(X_train, y_train)

In [38]:
y_pred = model.predict(X_test)

In [46]:
report = classification_report(y_test, y_pred, target_names=["Non-Repeat", "Repeat"])
conf_matrix = confusion_matrix(y_test, y_pred)

print("📊 Classification Report:\n", report)
print("🔍 Confusion Matrix:\n", conf_matrix)

📊 Classification Report:
               precision    recall  f1-score   support

  Non-Repeat       0.94      0.70      0.80     40501
      Repeat       0.75      0.95      0.84     39076

    accuracy                           0.82     79577
   macro avg       0.84      0.82      0.82     79577
weighted avg       0.85      0.82      0.82     79577

🔍 Confusion Matrix:
 [[28185 12316]
 [ 1875 37201]]


#### Product Recommendation System for each customer

In [40]:
customer_item_matrix = predict_data.pivot_table(index='CustomerID', columns='Description', values='Quantity', aggfunc='sum').fillna(0)

In [41]:
item_similarity = pd.DataFrame(cosine_similarity(customer_item_matrix.T), 
                                index=customer_item_matrix.columns, 
                                columns=customer_item_matrix.columns)

In [42]:
def recommend_products(purchased_product, n=5):
    if purchased_product not in item_similarity.index:
        return "Product not found"
    
    similar_items = item_similarity[purchased_product].sort_values(ascending=False).iloc[1:n+1]
    return similar_items

In [43]:
def recommend_for_customer(customer_id, n=5):
    if customer_id not in customer_item_matrix.index:
        return "Customer not found"
    
    # Get items purchased by the customer
    purchased_items = customer_item_matrix.loc[customer_id]
    purchased_items = purchased_items[purchased_items > 0].index
    
    recommendations = {}
    for item in purchased_items:
        recs = recommend_products(item, n)
        for rec_item, score in recs.items():
            recommendations[rec_item] = recommendations.get(rec_item, 0) + score
    
    # Sort recommendations by similarity score
    recommended_products = pd.Series(recommendations).sort_values(ascending=False).head(n)
    max_score = recommended_products.max()
    recommended_products = recommended_products / max_score
    
    return recommended_products

In [44]:
recommend_for_customer(17850)

HAND WARMER RED POLKA DOT              1.000000
IVORY EMBROIDERED QUILT                0.958150
GLASS STAR FROSTED T-LIGHT HOLDER      0.892027
3 DRAWER ANTIQUE WHITE WOOD CABINET    0.746281
VINTAGE BILLBOARD DRINK ME MUG         0.734982
dtype: float64