# Data Exploration & Baseline Model for Prediction

This notebook explores the data with the goal of identifying features that help predict user behavior (like Click-Through Rate or Item Purchase) and builds a simple baseline model using R.

In [None]:
# 1. Environment Setup
# Resolve packages from the Posit Package Manager CRAN mirror.
options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/noble/latest"))

# Install/Load libraries
# requireNamespace() only checks availability (it does not attach anything),
# so missing packages are installed in one batch. library() then attaches each
# package and raises an error if an installation did not succeed.
packages <- c("jsonlite", "tidyverse", "lubridate", "rpart", "rpart.plot")
missing_packages <- packages[
  !vapply(packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
for (pkg in packages) {
  library(pkg, character.only = TRUE)
}

## 2. Load Data

We load a subset of the training and item data: the first 5,000 training records and the first 20,000 item records.

In [None]:
# Paths to data
train_path <- "../data/raw/train_dataset.jl"
item_path <- "../data/raw/item_data.jl"

# Read the first `n_lines` records of a JSON Lines file into a data frame.
#
# path:    path to a file with one JSON document per line.
# n_lines: maximum number of lines to read from the top of the file.
#
# Reads with readLines() instead of shelling out to `head`, so it does not
# depend on a Unix shell, and stops with a clear error if the file is missing.
read_jsonl_head <- function(path, n_lines) {
  if (!file.exists(path)) {
    stop("Data file not found: ", path, call. = FALSE)
  }
  lines <- readLines(path, n = n_lines, warn = FALSE, encoding = "UTF-8")
  # textConnection() returns an already-open connection, so it has to be
  # closed here explicitly.
  con <- textConnection(lines, encoding = "UTF-8")
  on.exit(close(con), add = TRUE)
  stream_in(con, verbose = FALSE)
}

# Load samples
# Using a smaller sample initially to ensure stability
message("Reading training data...")
train_raw <- read_jsonl_head(train_path, 5000)

message("Reading item data...")
item_data <- read_jsonl_head(item_path, 20000)

message("Loaded ", nrow(train_raw), " training records and ", nrow(item_data), " items.")

## 3. Feature Engineering

We extract core features from user history. We focus on predicting if the **last item viewed** is the one that was **bought**.

In [None]:
# Build one row of session-level features for a single training record.
#
# i:    row index into `data`.
# data: data frame of training records with a list-column `user_history`
#       (one table of events per record, with `event_type`, `event_timestamp`
#       and `event_info` columns) and an `item_bought` column. Defaults to the
#       global `train_raw`, so existing one-argument calls keep working.
#
# Returns a one-row tibble with the columns num_views, num_searches,
# session_duration_min, total_events, search_ratio and bought_last, or NULL
# when the record has no usable history.
extract_features_safe <- function(i, data = train_raw) {
  row <- data[i, ]
  history <- row$user_history[[1]]
  item_bought <- row$item_bought

  # Anything that is not a non-empty data frame (NULL, an empty list, ...) is
  # skipped; nrow() on such values is NULL and would break the comparison.
  if (!is.data.frame(history) || nrow(history) == 0) return(NULL)

  n_events <- nrow(history)

  # Basic counts
  num_views <- sum(history$event_type == "view", na.rm = TRUE)
  num_searches <- sum(history$event_type == "search", na.rm = TRUE)

  # Timestamp parsing: session length in minutes between the earliest and the
  # latest parseable timestamp; 0 when none can be parsed.
  ts <- ymd_hms(history$event_timestamp, quiet = TRUE)
  duration <- if (all(is.na(ts))) {
    0
  } else {
    ts_range <- range(ts, na.rm = TRUE)
    as.numeric(difftime(ts_range[2], ts_range[1], units = "mins"))
  }

  # Last event: isTRUE() treats a missing event type as "not a view" instead
  # of raising "missing value where TRUE/FALSE needed".
  last_event <- history[n_events, ]
  last_item_viewed <- if (isTRUE(last_event$event_type == "view")) {
    last_event$event_info
  } else {
    NA
  }

  # Target: TRUE when the last event is a view of the purchased item.
  # Compared as strings because the two columns may have different types.
  bought_last <- if (is.na(last_item_viewed)) {
    FALSE
  } else {
    as.character(last_item_viewed) == as.character(item_bought)
  }

  tibble(
    num_views = num_views,
    num_searches = num_searches,
    session_duration_min = duration,
    total_events = n_events,
    search_ratio = num_searches / n_events,
    bought_last = bought_last
  )
}

message("Extracting features...")
# seq_len() gives an empty index set for a zero-row input, whereas
# 1:nrow(train_raw) would iterate over c(1, 0).
train_df <- map_dfr(seq_len(nrow(train_raw)), extract_features_safe)

# Without any rows the feature columns do not exist, so fail with a clear
# message instead of an "object not found" error inside mutate().
if (nrow(train_df) == 0) {
  stop("No sessions with usable history were found; cannot build features.",
       call. = FALSE)
}

# The classification tree expects a categorical target.
train_df <- train_df %>%
  mutate(bought_last = as.factor(bought_last))

head(train_df)

## 4. Exploration

How does session activity relate to whether the last item viewed was the one bought?

In [None]:
# Plot Views vs Purchase
# num_views is a count and can be 0 (sessions with no "view" events). On a
# log10 axis those rows become -Inf and are dropped from the boxplot, so the
# count is shifted by one before the log scale is applied.
ggplot(train_df, aes(x = bought_last, y = num_views + 1, fill = bought_last)) +
  geom_boxplot() +
  scale_y_log10() +
  theme_minimal() +
  labs(
    title = "Views vs Purchase of Last Item Viewed (Log Scale)",
    y = "Number of Views + 1"
  )

## 5. Baseline Model & Important Features

We train an interpretable Decision Tree to predict the outcome.

In [None]:
# Fit the baseline classification tree, using every other column of train_df
# as a predictor of bought_last.
model <- rpart(
  bought_last ~ .,
  data = train_df,
  method = "class",
  control = rpart.control(cp = 0.001)
)

# Draw the tree only when the fitted frame holds more than a single row;
# otherwise report that no split was made.
if (nrow(model$frame) <= 1) {
  print("Tree did not split. Data might be too unbalanced or features not predictive yet.")
} else {
  rpart.plot(model, main = "Decision Tree for Buyer Intent")
}

# Feature importance as reported by the fitted tree; it is NULL when the
# model carries no importance values.
imp <- model$variable.importance
if (is.null(imp)) {
  print("No feature importance calculated (no splits).")
} else {
  # One row per feature, drawn as horizontal bars ordered by importance.
  importance_tbl <- tibble(Feature = names(imp), Importance = imp)
  ggplot(importance_tbl, aes(x = reorder(Feature, Importance), y = Importance)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    theme_minimal() +
    labs(title = "Feature Importance (Baseline)", x = "")
}