In [None]:
# Assignment:
Applying RNN on E-commerce recommendation system 
The goal here is to predict the next item a customer will purchase based on their purchase history,
which is a classic application of Recurrent Neural Networks (RNNs) in recommender systems.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("data.csv", encoding='latin-1')

# --- 1. Data Cleaning ---
# Remove rows with missing CustomerID (we need to track user sequences)
df.dropna(subset=['CustomerID'], inplace=True)
df['CustomerID'] = df['CustomerID'].astype(int)

# Filter out non-stock items (like POST, D, etc. typically cancelled or shipping)
df = df[~df['StockCode'].astype(str).str.contains('^[a-zA-Z]', na=False)]

# Filter out canceled transactions (InvoiceNo starts with 'C') and ensure positive quantity
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
df = df[df['Quantity'] > 0]

# --- 2. Create Sequential Data ---
# Convert InvoiceDate to datetime and sort by CustomerID and purchase time
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df.sort_values(by=['CustomerID', 'InvoiceDate'], inplace=True)

# Group by CustomerID and aggregate the sequence of purchased StockCodes
customer_sequences = df.groupby('CustomerID')['StockCode'].apply(list).reset_index()

# --- 3. Item Encoding (Vocabulary Mapping) ---
# Create a vocabulary of all unique StockCodes (items)
all_items = df['StockCode'].unique()
item_to_int = {item: i + 1 for i, item in enumerate(all_items)} # +1 for padding (0)
int_to_item = {i + 1: item for i, item in enumerate(all_items)}
num_items = len(all_items) + 1 # Total vocabulary size including 0 (padding)

print(f"Total number of unique items (StockCode): {num_items - 1}")
print(f"Total number of customer sequences: {len(customer_sequences)}")

Total number of unique items (StockCode): 3659
Total number of customer sequences: 4335


In [2]:
# --- 4. Sequence and Target Generation ---
X_data = [] # Input sequence (e.g., [I1, I2, I3])
y_data = [] # Target item (e.g., I4)
MAX_SEQUENCE_LENGTH = 10 # Define a fixed maximum history length to use

for sequence in customer_sequences['StockCode']:
    # Map item strings to their integer IDs
    encoded_sequence = [item_to_int[item] for item in sequence]
    
    # Create (Input, Target) pairs for every subsequence longer than 1
    for i in range(1, len(encoded_sequence)):
        # Input: The sequence up to item i
        input_seq = encoded_sequence[:i]
        # Target: The item at step i (the next item)
        target_item = encoded_sequence[i]
        
        # Truncate the input sequence to the max length (only keep the MAX_SEQUENCE_LENGTH most recent items)
        if len(input_seq) > MAX_SEQUENCE_LENGTH:
            input_seq = input_seq[-MAX_SEQUENCE_LENGTH:]
            
        X_data.append(input_seq)
        y_data.append(target_item)

# --- 5. Padding and Splitting ---
# Pad input sequences to ensure uniform length
X_padded = pad_sequences(X_data, maxlen=MAX_SEQUENCE_LENGTH, padding='pre') # Padding before the sequence
y_targets = np.array(y_data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_targets, test_size=0.2, random_state=42
)

print(f"Padded Input Shape: {X_padded.shape}")
print(f"Target Shape: {y_targets.shape}")

Padded Input Shape: (392035, 10)
Target Shape: (392035,)


In [4]:
# --- Functional API Version to resolve the warning ---
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

# Conceptual Parameters (reusing from previous steps)
# num_items = 50001
# EMBEDDING_DIM = 50
# MAX_SEQUENCE_LENGTH = 10

# 1. Define the input layer
input_sequence = Input(shape=(MAX_SEQUENCE_LENGTH,), name='input_sequence')

# 2. Embedding Layer
embedded_sequence = Embedding(input_dim=num_items, 
                              output_dim=EMBEDDING_DIM, 
                              input_length=MAX_SEQUENCE_LENGTH)(input_sequence)

# 3. LSTM Layer
lstm_output = LSTM(64)(embedded_sequence)

# 4. Output Layer
predictions = Dense(num_items, activation='softmax')(lstm_output)

# 5. Create the Model instance
model_rnn_rec_functional = Model(inputs=input_sequence, outputs=predictions)

# Compile the model (rest remains the same)
model_rnn_rec_functional.compile(optimizer='adam', 
                                 loss='sparse_categorical_crossentropy', 
                                 metrics=['accuracy'])

model_rnn_rec_functional.summary()

In [5]:
# --- 6. Train the Model ---
# Note: Training on the full dataset can be time-consuming, using few epochs for demonstration
EPOCHS = 5
BATCH_SIZE = 128

print("\n--- Training RNN Model for Next Item Prediction ---")
history_rec = model_rnn_rec.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test)
)

# --- 7. Evaluate the Model ---
loss, accuracy = model_rnn_rec.evaluate(X_test, y_test, verbose=0)
print(f"\nRNN Recommender Test Loss: {loss:.4f}")
print(f"RNN Recommender Test Accuracy (Next Item Prediction): {accuracy:.4f}")
# Note: Accuracy is a strict metric here (it must be the EXACT next item).
# In real-world recommender systems, metrics like Recall@K or NDCG@K are preferred.


--- Training RNN Model for Next Item Prediction ---
Epoch 1/5
[1m2451/2451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 23ms/step - accuracy: 0.0076 - loss: 7.2918 - val_accuracy: 0.0133 - val_loss: 7.0328
Epoch 2/5
[1m2451/2451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 22ms/step - accuracy: 0.0250 - loss: 6.6697 - val_accuracy: 0.0357 - val_loss: 6.4627
Epoch 3/5
[1m2451/2451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 21ms/step - accuracy: 0.0461 - loss: 6.2822 - val_accuracy: 0.0512 - val_loss: 6.2788
Epoch 4/5
[1m2451/2451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 22ms/step - accuracy: 0.0618 - loss: 6.0981 - val_accuracy: 0.0647 - val_loss: 6.1852
Epoch 5/5
[1m2451/2451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 22ms/step - accuracy: 0.0769 - loss: 5.9711 - val_accuracy: 0.0749 - val_loss: 6.1290

RNN Recommender Test Loss: 6.1290
RNN Recommender Test Accuracy (Next Item Prediction): 0.0749


In [6]:
def predict_next_item(model, user_history_items, item_to_int, int_to_item, max_len, k=5):
    # 1. Encode and Truncate the history
    encoded_history = [item_to_int.get(item, 0) for item in user_history_items]
    if len(encoded_history) > max_len:
        encoded_history = encoded_history[-max_len:]
    
    # 2. Pad the sequence
    padded_input = pad_sequences([encoded_history], maxlen=max_len, padding='pre')
    
    # 3. Predict the probabilities for all items
    predictions = model.predict(padded_input)[0]
    
    # 4. Get the top K item indices (IDs)
    # np.argsort returns indices that would sort the array (ascending), [-k:] gets the top k indices
    top_k_indices = np.argsort(predictions)[-k:][::-1]
    
    # 5. Map back to StockCodes
    recommended_items = [int_to_item.get(idx, 'UNKNOWN_ITEM') for idx in top_k_indices if idx != 0]

    return recommended_items

# Example: Pick a random customer's last MAX_SEQUENCE_LENGTH items
sample_user_id = customer_sequences['CustomerID'].sample(1).iloc[0]
sample_history = customer_sequences[customer_sequences['CustomerID'] == sample_user_id]['StockCode'].iloc[0]
# Use the last MAX_SEQUENCE_LENGTH items as the input sequence for prediction
input_history = sample_history[-MAX_SEQUENCE_LENGTH:]

print(f"\n--- Recommendation for Customer ID: {sample_user_id} ---")
print(f"Last items purchased: {input_history}")

# Get the top 5 recommendations
recommendations = predict_next_item(model_rnn_rec, input_history, item_to_int, int_to_item, MAX_SEQUENCE_LENGTH, k=5)

print(f"\nTop 5 Next Item Recommendations: {recommendations}")


--- Recommendation for Customer ID: 17262 ---
Last items purchased: ['47566', '71053', '47566', '21896', '22076', '21925', '48138', '75049L', '23053', '23050']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step

Top 5 Next Item Recommendations: ['23052', '23053', '23051', '23050', '23049']
