# **Feature Engineering & Modeling**

## **1. Load Data and Train/Test Split**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the raw CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(filepath_or_buffer='../data/online_shoppers_intention.csv')
print(f"Dataset shape: {df.shape}")

Dataset shape: (12330, 18)


In [2]:
# Target is the 'Revenue' column; every other column is a candidate feature
y = df['Revenue']
X = df.drop(columns='Revenue')

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

## **2. Feature Engineering**

In [3]:
# Why: the sum of the three per-page-type durations is the total time on site,
# used here as a proxy for engagement level.
# The same in-place column assignment is applied to both splits.
for frame in (X_train, X_test):
    frame['total_duration'] = (
        frame['Administrative_Duration']
        .add(frame['Informational_Duration'])
        .add(frame['ProductRelated_Duration'])
    )

In [4]:
# Why: a 0/1 flag for "PageValues is positive" tackles the zero-dominant problem
# by separating sessions that visited valuable pages from those that did not.
for frame in (X_train, X_test):
    frame['has_pagevalue'] = frame['PageValues'].gt(0).astype(int)

In [5]:
# Why: share of the session's time spent on product pages; a high share
# suggests a serious shopper.
for frame in (X_train, X_test):
    # Adding 1 to the denominator avoids dividing by zero when total_duration is 0
    frame['product_focus'] = frame['ProductRelated_Duration'].div(
        frame['total_duration'].add(1)
    )

In [6]:
# Why: PageValues weighted by (1 - ExitRates), so high-value pages combined with
# a low exit rate produce a strong buy signal.
for frame in (X_train, X_test):
    stay_rate = frame['ExitRates'].rsub(1)  # 1 - ExitRates
    frame['pagevalue_exit_interaction'] = frame['PageValues'].mul(stay_rate)

In [7]:
# Why: average time per page viewed distinguishes fast clicking from careful browsing.
for frame in (X_train, X_test):
    # Page count across the three page types
    total_pages = (
        frame['Administrative']
        .add(frame['Informational'])
        .add(frame['ProductRelated'])
    )
    # Adding 1 to the denominator avoids dividing by zero when no pages were counted
    frame['engagement_rate'] = frame['total_duration'].div(total_pages.add(1))

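**Note:** 1 is added to each denominator (`product_focus`, `engagement_rate`) to avoid division by zero for sessions whose total duration or total page count is 0.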

## **3. Feature Selection**

In [8]:
# Numeric model inputs: three raw session metrics followed by the engineered features
numerical_features = [
    'PageValues', 'ExitRates', 'BounceRates',
    'total_duration', 'product_focus', 'engagement_rate',
    'has_pagevalue', 'pagevalue_exit_interaction',
]

# Categorical model inputs
categorical_features = ['Month']

# Column order of the selected frames: numeric columns first, then categorical
features_to_use = [*numerical_features, *categorical_features]

# Take independent copies so later steps do not touch X_train / X_test
X_train_selected = X_train.loc[:, features_to_use].copy()
X_test_selected = X_test.loc[:, features_to_use].copy()

## **4. Encoding**

In [9]:
# One-hot encode Month, dropping the first level as the baseline.
#
# The month levels are learned from the TRAINING split only and then applied to
# both splits. Calling pd.get_dummies independently on each split would yield
# mismatched columns (and a different dropped baseline) whenever a month is
# missing from one split, which breaks any model fitted on the training columns.
month_levels = sorted(X_train_selected['Month'].dropna().unique())

# A Categorical with explicit categories makes get_dummies emit one column per
# known level, even if that level does not occur in the frame being encoded.
# A month seen only in the test split becomes NaN and encodes as all zeros.
X_train_encoded = pd.get_dummies(
    X_train_selected.assign(
        Month=pd.Categorical(X_train_selected['Month'], categories=month_levels)
    ),
    columns=['Month'],
    drop_first=True,
)
X_test_encoded = pd.get_dummies(
    X_test_selected.assign(
        Month=pd.Categorical(X_test_selected['Month'], categories=month_levels)
    ),
    columns=['Month'],
    drop_first=True,
)

# Both splits must expose exactly the same columns, in the same order
assert list(X_train_encoded.columns) == list(X_test_encoded.columns), \
    "train/test dummy columns diverged"

## **5. Feature Scaling**

In [10]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it on the training split only, so no statistics
# from the test split leak into preprocessing
scaler = StandardScaler().fit(X_train_encoded[numerical_features])

# Scale a copy of the training frame; the encoded frame itself stays unscaled
X_train_scaled = X_train_encoded.copy()
X_train_scaled[numerical_features] = scaler.transform(X_train_encoded[numerical_features])

# The test split reuses the SAME fitted scaler (it is never refit on test data)
X_test_scaled = X_test_encoded.copy()
X_test_scaled[numerical_features] = scaler.transform(X_test_encoded[numerical_features])