In [7]:
import pandas as pd


# Load dataset
df = pd.read_csv('data/online_retail.csv', encoding='ISO-8859-1')

# Data preprocessing
# Remove rows with a missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Remove cancelled invoices (InvoiceNo starting with 'C')
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Remove negative and zero quantities/prices.
# .copy() turns the filtered result into an independent frame, so the column
# assignments below write to `df` itself instead of a slice of the previous
# frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()

# Create TotalPrice: line total = quantity x unit price
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [8]:
import datetime as dt

# RFM feature engineering
# Reference date: the most recent invoice timestamp present in df
latest_date = df['InvoiceDate'].max()

# RFM table, one row per CustomerID:
#   Recency   - whole days between the customer's latest invoice and latest_date
#   Frequency - number of distinct InvoiceNo values
#   Monetary  - sum of TotalPrice
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda invoice_dates: (latest_date - invoice_dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)


In [9]:
# RFM Clustering

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardize the three RFM features so each contributes comparably to the
# Euclidean distances K-Means works with.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# Find optimal k (use Elbow/Silhouette in notebook)

# Clustering
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it unset makes the fitted model depend on the installed
# library version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Map to labels
# K-Means cluster ids carry no meaning of their own (id 0 is not inherently
# "High-Value"), so the id -> label map is derived from the fitted centroids
# instead of being hard-coded. Centroids are converted back to original units;
# their columns follow the order used for fitting: Recency, Frequency, Monetary.
# Labelling heuristic, applied in this order:
#   1. highest mean Monetary                      -> 'High-Value'
#   2. of the rest, highest mean Recency          -> 'At-Risk'
#   3. of the remaining two, higher Frequency     -> 'Regular', other 'Occasional'
# NOTE(review): anything that reloads the saved model and hard-codes
# cluster ids -> labels should reuse this map instead - confirm with consumers.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
unassigned = list(range(len(centers)))

high_value = max(unassigned, key=lambda c: centers[c][2])
unassigned.remove(high_value)

at_risk = max(unassigned, key=lambda c: centers[c][0])
unassigned.remove(at_risk)

# Exactly two clusters are left when n_clusters == 4.
regular, occasional = sorted(unassigned, key=lambda c: centers[c][1], reverse=True)

segment_map = {
    high_value: 'High-Value',
    regular: 'Regular',
    occasional: 'Occasional',
    at_risk: 'At-Risk',
}
rfm['Segment'] = rfm['Cluster'].map(segment_map)


In [11]:
# Save model

import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('models', exist_ok=True)

# Persist the fitted K-Means model together with the scaler it was fitted on.
with open('models/kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [12]:
# Product recommendation

from sklearn.metrics.pairwise import cosine_similarity

# User-item matrix: one row per CustomerID, one column per Description.
# Each cell is the mean Quantity over that customer's rows for the product
# ('mean' is pivot_table's default aggregation, spelled out here), and 0 where
# the customer has no rows for it.
pivot = pd.pivot_table(
    df,
    index='CustomerID',
    columns='Description',
    values='Quantity',
    aggfunc='mean',
    fill_value=0,
)

# Cosine similarity between products: transpose so that every product becomes
# a row vector over customers, then label both axes with the product names.
similarity_matrix = cosine_similarity(pivot.transpose())
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=pivot.columns,
    columns=pivot.columns,
)

# Recommend function
def recommend(product, top_n=5, similarity=None):
    """Return the labels of the `top_n` products most similar to `product`.

    Parameters
    ----------
    product : hashable
        Column label of the product to find neighbours for.
    top_n : int, default 5
        Maximum number of recommendations to return.
    similarity : pandas.DataFrame, optional
        Product-by-product similarity table to query. When omitted, the
        module-level ``similarity_df`` is used.

    Returns
    -------
    list
        Product labels ordered from most to least similar. The queried
        product itself is never included.

    Raises
    ------
    KeyError
        If `product` is not a column of the similarity table.
    """
    table = similarity_df if similarity is None else similarity

    # Remove the product's own entry by label rather than by skipping the first
    # sorted position: another product can tie with it at the top score, in
    # which case the product is not guaranteed to sort first.
    scores = table[product].drop(labels=product, errors='ignore')
    return scores.nlargest(top_n).index.tolist()

In [13]:
import os

# Persist the product-to-product similarity table. The output folder is
# created first because to_pickle does not create missing directories.
os.makedirs("models", exist_ok=True)
similarity_df.to_pickle("models/product_similarity.pkl")