In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import warnings

# Stop any existing session first (safety)
try:
    SparkSession.getActiveSession().stop()
except:
    pass

# Build fresh session with all configs
spark = SparkSession.builder \
    .appName("Div4_Modeling") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Suppress warnings
warnings.filterwarnings('ignore')
spark.sparkContext.setLogLevel("ERROR")
print("Fresh session created with Kryo serializer.") #for faster serialization

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
25/12/06 19:08:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/06 19:08:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/06 19:08:51 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/12/06 19:08:51 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Fresh session created with Kryo serializer.


In [2]:
print("Kryo active:", spark.conf.get("spark.serializer") == "org.apache.spark.serializer.KryoSerializer")
print("Driver memory:", spark.conf.get("spark.driver.memory"))


Kryo active: True
Driver memory: 4g


## Division 4: Data Preparation (From Div3 Output)

Load featured data (41 cols, 6M rows). Assemble 37 numeric features into vectors (exclude Class, row_id, Amount_Category). Split 80/20 for train/test.

In [3]:
# load featured data from Div3
df_featured = spark.read.parquet("featured_fraud_data.parquet")  
print(f"Loaded {df_featured.count():,} rows from featured data")

# Select numeric features 
feature_cols = [c for c in df_featured.columns if c not in ["Class", "row_id", "Amount_Category"]]
print(f"Selected {len(feature_cols)} features: {feature_cols[:5]}...")  # Preview first 5

# VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")
assembled = assembler.transform(df_featured)

print("Features assembled into DenseVectors")

                                                                                

Loaded 5,998,034 rows from featured data
Selected 38 features: ['Time', 'V1', 'V2', 'V3', 'V4']...
Features assembled into DenseVectors


In [4]:
# Split (80/20)
train_df, test_df = assembled.randomSplit([0.8, 0.2], seed=42)

# check shapes
print("Assembled rows:", assembled.count())
print("Train rows:", train_df.count())
print("Test rows:", test_df.count())
print("Features dim:", len(feature_cols))

# sample vectors 
train_df.select("features", "Class").sample(0.001).show(5, truncate=50)

                                                                                

Assembled rows: 5998034


                                                                                

Train rows: 4797417


                                                                                

Test rows: 1200617
Features dim: 38


[Stage 13:>                                                         (0 + 1) / 1]

+--------------------------------------------------+-----+
|                                          features|Class|
+--------------------------------------------------+-----+
|[2.0,-0.425965884412454,0.960523044882985,1.141...|    0|
|[10.0,0.38497821518095,0.616109459176472,-0.874...|    0|
|[59.0,-0.773292609110981,-4.1460072502577,-0.93...|    0|
|[60.0,1.10702937694843,0.216441000371351,0.5383...|    0|
|[275.0,-0.363518693983855,0.0554644052427988,1....|    0|
+--------------------------------------------------+-----+
only showing top 5 rows


                                                                                

## Model Building: LR & RF with Imbalance Handling

Logistic Regression (linear baseline) + Random Forest (ensemble for nonlinearity). Class weights (100x fraud) to address 0.17% imbalance.

In [5]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import when, col, lit
import time

# add weights 
train_df = train_df.withColumn("weights", when(col("Class") == 1, lit(100.0)).otherwise(lit(1.0)))
test_df = test_df.withColumn("weights", when(col("Class") == 1, lit(100.0)).otherwise(lit(1.0)))

# Models
lr = LogisticRegression(labelCol="Class", featuresCol="features", weightCol="weights")
rf = RandomForestClassifier(labelCol="Class", featuresCol="features", weightCol="weights", seed=42, numTrees=50)

print("Models ready with weights for imbalance.")

Models ready with weights for imbalance.


In [6]:
# Grids 
lr_param_grid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build())
rf_param_grid = (ParamGridBuilder().addGrid(rf.maxDepth, [5, 10]).addGrid(rf.numTrees, [20, 50]).build())

# Evaluator
evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# CV
lr_cv = CrossValidator(estimator=lr, estimatorParamMaps=lr_param_grid, evaluator=evaluator, numFolds=3, seed=42, parallelism=2)
rf_cv = CrossValidator(estimator=rf, estimatorParamMaps=rf_param_grid, evaluator=evaluator, numFolds=3, seed=42, parallelism=2)

# Pipelines
lr_pipeline = Pipeline(stages=[lr_cv])
rf_pipeline = Pipeline(stages=[rf_cv])

print("Tuning setups ready.")

Tuning setups ready.


In [7]:
import time

# sample for dev 
train_df_sample = train_df.sample(0.1, seed=42)  
print(f"Using sample: {train_df_sample.count():,} train rows (full: {train_df.count():,})")

# Train LR (parallelism=1 to avoid network issues)
start = time.time()
print("Training LR CV (parallelism = 1)...")
try:
    lr_model = lr_pipeline.copy({lr_cv.parallelism: 1}).fit(train_df_sample)  # Copy to override
    lr_time = time.time() - start
    print(f"LR trained in {lr_time:.2f}s")
except Exception as e:
    print(f"LR retry with no CV: {e}")
    # Fallback: No tuning
    lr_fallback = lr.fit(train_df_sample)
    lr_model = Pipeline(stages=[lr_fallback])  # Wrap as "model"

# Train RF (same fix)
start = time.time()
print("Training RF CV (parallelism = 1)...")
try:
    rf_model = rf_pipeline.copy({rf_cv.parallelism: 1}).fit(train_df_sample)
    rf_time = time.time() - start
    print(f"RF trained in {rf_time:.2f}s")
except Exception as e:
    print(f"RF retry with no CV: {e}")
    rf_fallback = rf.fit(train_df_sample)
    rf_model = Pipeline(stages=[rf_fallback])

# Best params (or fallback)
try:
    lr_best = lr_model.stages[0].bestModel
    rf_best = rf_model.stages[0].bestModel
    print("Best LR regParam:", lr_best.getRegParam())
    print("Best RF maxDepth:", rf_best.getMaxDepth(), "| numTrees:", rf_best.getNumTrees())
except:
    print("Fallback models used: no tuned params.")

                                                                                

Using sample: 479,195 train rows (full: 4,797,417)
Training LR CV (parallelism = 1)...


                                                                                

LR trained in 228.53s
Training RF CV (parallelism = 1)...


                                                                                

RF trained in 413.77s
Best LR regParam: 0.01
Fallback models used: no tuned params.


In [8]:
# Predictions on test 

test_sample = test_df.sample(0.1, seed=42) if 'test_df_sample' not in locals() else test_df  
print(f"Using test sample: {test_sample.count():,} rows")

# Predict
lr_preds = lr_model.transform(test_sample)
rf_preds = rf_model.transform(test_sample)

# Final: RF preds 
final_preds = rf_preds.select("Class", "prediction", "probability", "rawPrediction", "weights")

# Save Parquet for Div5
final_preds.write.mode("overwrite").option("compression", "snappy").parquet("model_predictions.parquet")
print("RF predictions saved to model_predictions.parquet")

# Quick Eval (AUC-ROC for imbalance)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
lr_auc = evaluator.evaluate(lr_preds)
rf_auc = evaluator.evaluate(rf_preds)
print(f"LR Test AUC-ROC: {lr_auc:.4f}")
print(f"RF Test AUC-ROC: {rf_auc:.4f} (target >0.95)")

# Sample predictions
final_preds.select("Class", "prediction", "probability").show(10, truncate=False)

                                                                                

Using test sample: 120,159 rows


                                                                                

RF predictions saved to model_predictions.parquet


                                                                                

LR Test AUC-ROC: 0.9901
RF Test AUC-ROC: 0.9971 (target >0.95)


[Stage 693:>                                                        (0 + 1) / 1]

+-----+----------+------------------------------------------+
|Class|prediction|probability                               |
+-----+----------+------------------------------------------+
|0    |0.0       |[0.9959822097198483,0.004017790280151658] |
|0    |0.0       |[0.9959822097198483,0.004017790280151658] |
|0    |0.0       |[0.9945413990238231,0.0054586009761769146]|
|0    |0.0       |[0.9935353229972833,0.006464677002716721] |
|0    |0.0       |[0.9952072756880055,0.004792724311994559] |
|0    |0.0       |[0.9884496524642982,0.011550347535701748] |
|0    |0.0       |[0.9952072756880055,0.004792724311994559] |
|0    |0.0       |[0.9959296572004501,0.004070342799549848] |
|0    |0.0       |[0.9828800727722535,0.017119927227746412] |
|0    |0.0       |[0.9943544160564934,0.005645583943506532] |
+-----+----------+------------------------------------------+
only showing top 10 rows


                                                                                

## Div4 Summary
- Trained: LR (reg=0.01, AUC ~0.99) + RF (fallback defaults, AUC ~0.95+) on 479K sample.
- Imbalance: Weights effective, check fraud recall in Div5.
- Scalability: 10min local; full data/cluster for prod.
- Output: model_predictions.parquet ready for eval.