# 1. Load the Data

In [7]:
import pandas as pd

# Load the dataset
df = pd.read_excel("Online Retail.xlsx")

print("Dataset loaded successfully!")
print(df.head())

Dataset loaded successfully!
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


# 2. Data Preprocessing and Reshaping

# Clean the Data

In [23]:
# Drop rows without a CustomerID
df.dropna(axis=0, subset=['CustomerID'], inplace=True)

# Convert CustomerID to an integer type for consistency
df['CustomerID'] = df['CustomerID'].astype('int')

# Filter out returned items (Quantity > 0) and items with no price
df = df[df['Quantity'] > 0]
df = df[df['UnitPrice'] > 0]

print("Data cleaned. Shape:", df.shape)

Data cleaned. Shape: (397884, 8)


# Create the User-Item Matrix

In [25]:
from scipy.sparse import csr_matrix

# Create a pivot table: users vs. products
user_item_matrix_df = df.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum'
).fillna(0)

# Convert the pandas DataFrame to a SciPy sparse matrix for memory efficiency
user_item_matrix_sparse = csr_matrix(user_item_matrix_df.values)

print("User-item matrix created.")

User-item matrix created.


# 3. Model Training (k-NN)

# Configure the Model

In [27]:
from sklearn.neighbors import NearestNeighbors

# Configure the k-NN model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

# Fit the model to our sparse matrix
# The 'T' transposes the matrix so we are fitting on items (products) instead of users
model_knn.fit(user_item_matrix_sparse.T)

print("Model trained.")

Model trained.


# 4. Create the Recommendation Function

# Create Mappings

In [29]:
# Create a mapping from the matrix index to the StockCode
# The columns of our pivot table are the StockCodes
item_mapper = {i: stock_code for i, stock_code in enumerate(user_item_matrix_df.columns)}

# Write the Function
This function will find the neighbors of a given product in the trained model.

In [35]:
def get_recommendations(stock_code, model, item_map, n_recommendations=5):
    """
    Takes a stock code and returns N recommended similar stock codes.
    """
    # Find the index for the given stock code
    # We need a reverse mapper from stock_code to index
    stock_to_idx = {v: k for k, v in item_map.items()}
    try:
        item_idx = stock_to_idx[stock_code]
    except KeyError:
        return f"Stock code {stock_code} not found."

    # Get the distances and indices of the nearest neighbors
    # We need to reshape our item_idx to be a 2D array for the model
    distances, indices = model.kneighbors(user_item_matrix_sparse.T[item_idx], n_neighbors=n_recommendations+1)

    # The first item is the item itself, so we skip it
    similar_items_indices = indices.squeeze()[1:]
    
    # Map indices back to stock codes
    recommendations = [item_map[i] for i in similar_items_indices]
    
    return recommendations

# --- Example Usage ---
# Let's get recommendations for a popular item, e.g., '22423' (REGENCY CAKESTAND 3 TIER)
example_recs = get_recommendations('22423', model_knn, item_mapper, n_recommendations=5)
print(f"Recommendations for '22423': {example_recs}")

Recommendations for '22423': Stock code 22423 not found.


# 5. Model and Data Saving

In [37]:
import joblib

# What to save:
# 1. The trained k-NN model
# 2. The sparse matrix itself (for lookups)
# 3. The item mapper (to translate indices to StockCodes)

joblib.dump(model_knn, 'knn_model.joblib')
joblib.dump(user_item_matrix_sparse, 'sparse_matrix.joblib')
joblib.dump(item_mapper, 'item_mapper.joblib')

print("Model and data saved successfully.")

Model and data saved successfully.
