In [None]:
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [None]:
# Look up the schema entry for the "label" column and display its Spark data type.
label_field = labeled_df.schema["label"]
label_field.dataType

In [None]:
# Split the labelled rows 80/20 into training and test sets.
# A fixed seed is passed so the split does not change from run to run.
split_weights = [0.8, 0.2]
train_df, test_df = labeled_df.randomSplit(split_weights, seed=42)

In [None]:
# Index and one-hot encode the categorical ID columns.
#
# handleInvalid="keep": StringIndexer's default ("error") raises at transform
# time when it meets a value it did not see during fit (or a null). Because
# the indexers are fit on train_df only, a category that lands exclusively in
# test_df would abort scoring. "keep" routes such values to one extra
# "unknown" index instead.
aisle_indexer = StringIndexer(
    inputCol="aisle_id", outputCol="aisle_index", handleInvalid="keep"
)
dept_indexer = StringIndexer(
    inputCol="department_id", outputCol="dept_index", handleInvalid="keep"
)

# One-hot encode the indices produced above into sparse vector columns.
aisle_encoder = OneHotEncoder(inputCol="aisle_index", outputCol="aisle_vec")
dept_encoder = OneHotEncoder(inputCol="dept_index", outputCol="dept_vec")

In [None]:
# Model inputs: three plain columns followed by the two one-hot encoder outputs.
plain_cols = ["avg_cart_position", "total_orders", "total_reorders"]
encoded_cols = ["aisle_vec", "dept_vec"]
feature_cols = plain_cols + encoded_cols

# Concatenate the inputs, in the order listed, into one "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [None]:
# AUC evaluator: area under the ROC curve, computed from the "rawPrediction"
# column against the "label" column.
auc_params = {
    "labelCol": "label",
    "rawPredictionCol": "rawPrediction",
    "metricName": "areaUnderROC",
}
evaluator = BinaryClassificationEvaluator(**auc_params)

# F1 evaluator: compares the hard "prediction" column against "label".
f1_params = {
    "labelCol": "label",
    "predictionCol": "prediction",
    "metricName": "f1",
}
f1_evaluator = MulticlassClassificationEvaluator(**f1_params)

In [None]:
# Logistic regression estimator reading the assembled "features" vector.
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Stage order matters: index -> encode -> assemble -> classify.
lr_stages = [aisle_indexer, dept_indexer]
lr_stages += [aisle_encoder, dept_encoder]
lr_stages += [assembler, lr]

# End-to-end pipeline: preprocessing followed by logistic regression.
lr_pipeline = Pipeline(stages=lr_stages)

In [None]:
# Fit every pipeline stage (preprocessing + logistic regression) on the training split.
lr_model = lr_pipeline.fit(dataset=train_df)

# Score the held-out test split with the fitted pipeline.
lr_predictions = lr_model.transform(dataset=test_df)

In [None]:
# Score the logistic regression test predictions: AUC first, then F1.
lr_auc, lr_f1 = (
    metric.evaluate(lr_predictions) for metric in (evaluator, f1_evaluator)
)

# Report both metrics, one per line.
print(
    f"Logistic Regression AUC: {lr_auc:.4f}",
    f"Logistic Regression F1 Score: {lr_f1:.4f}",
    sep="\n",
)

In [None]:
# Define random forest classifier.
# Random forests sample rows and features at random while training, so the
# seed is pinned (to the same value used for the train/test split) to keep the
# reported metrics and feature importances reproducible after a kernel restart.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

In [None]:
# Same preprocessing stages as the logistic regression pipeline,
# ending with the random forest classifier instead.
rf_stages = [aisle_indexer, dept_indexer]
rf_stages += [aisle_encoder, dept_encoder]
rf_stages += [assembler, rf]

pipeline = Pipeline(stages=rf_stages)

In [None]:
# Fit all stages of the random forest pipeline on the training split.
model = pipeline.fit(dataset=train_df)

In [None]:
# Apply the fitted random forest pipeline to the held-out test split.
predictions = model.transform(dataset=test_df)

In [None]:
# The classes are imbalanced, so AUC and F1 are the metrics reported here.

# --- Random forest, test split ---
rf_auc = evaluator.evaluate(predictions)
print(f"AUC = {rf_auc:.4f}")

rf_f1 = f1_evaluator.evaluate(predictions)
print(f"Test F1 Score: {rf_f1:.4f}")

# --- Random forest, training split (for comparison with the test F1) ---
train_predictions = model.transform(train_df)
train_f1 = f1_evaluator.evaluate(train_predictions)
print(f"Train F1 Score: {train_f1:.4f}")


In [None]:
# Get the trained Random Forest model from the pipeline (it is the last stage)
rf_model = model.stages[-1]

# One importance value per slot of the assembled "features" vector
importances = rf_model.featureImportances.toArray()

# The one-hot columns expand into many slots, so the five assembler input
# names cannot simply be zipped against the importances: the labels would be
# misaligned and most slots silently dropped. Instead, read the per-slot names
# from the ML attribute metadata attached to the "features" column.
features_meta = predictions.schema["features"].metadata.get("ml_attr", {})
slot_names = {
    attr["idx"]: attr.get("name", f"feature_{attr['idx']}")
    for attrs in features_meta.get("attrs", {}).values()
    for attr in attrs
}

# Fall back to a positional name for any slot that carries no metadata
feature_names = [
    slot_names.get(i, f"feature_{i}") for i in range(len(importances))
]

# Rank slots by importance, highest first
ranked_features = sorted(
    zip(feature_names, importances), key=lambda pair: pair[1], reverse=True
)

# Print top features (limited so the output stays readable)
TOP_N = 20
print("Feature Importances:")
for name, imp in ranked_features[:TOP_N]:
    print(f"{name:<20} → {imp:.4f}")


In [None]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Confusion matrix: number of test rows for each (label, prediction) pair,
# sorted by label and then by prediction.
pair_counts = predictions.groupBy("label", "prediction").count()
conf_matrix = pair_counts.orderBy("label", "prediction")

conf_matrix.show()
