# PySpark Avazu Pipeline

This notebook mirrors the code pipeline: install dependencies, initialise Spark, run EDA, train the model, and run inference.

In [None]:
# If needed in a fresh environment:
# %pip install -r ../requirements.txt

import os
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
# Master "local[*]" keeps everything on this machine; the driver is given 6g.
spark = SparkSession.builder.appName("AvazuNotebook").master("local[*]").config(
    "spark.driver.memory", "6g"
).getOrCreate()

# Limit Spark's own logging to warnings and above so cell output stays readable.
spark.sparkContext.setLogLevel("WARN")
print(f"Spark version: {spark.version}")

In [None]:
# Paths (relative to the notebook's working directory)
DATA_DIR = "../data"
TRAIN = f"{DATA_DIR}/train.gz"
TEST = f"{DATA_DIR}/test.gz"

# Basic EDA on the full training file (no sampling is applied in this cell).
# inferSchema=True makes Spark take one extra pass over the data to guess
# column types before the DataFrame is returned.
df = spark.read.csv(TRAIN, header=True, inferSchema=True)

# A single aggregation gives both the class balance and the exact row total,
# so no separate df.count() pass over the gzipped CSV is needed. The cached
# result has one row per distinct `click` value, so keeping it is cheap.
click_counts = df.groupBy("click").count().cache()

# Index by name: Row is a tuple subclass, so `row.count` would be the tuple method.
total_rows = sum(row["count"] for row in click_counts.collect())
print("Train rows:", total_rows)
click_counts.show()
click_counts.unpersist()

df.printSchema()

In [None]:
from feature_engineering import prepare_dataframe, build_feature_pipeline
from training import train_model
from inference import run_inference

# No DataFrame is built in this cell: train_model is handed the path (TRAIN)
# rather than a frame, and reading the CSV again with inferSchema=True would
# cost another full scan of the file for a result nothing consumes.
# Not rebinding `df` also keeps the frame shown by the EDA cell unchanged.

# presumably the share of training rows train_model samples — confirm in training.py
TRAIN_SAMPLE_FRACTION = 0.02

model, metrics = train_model(spark, TRAIN, sample_fraction=TRAIN_SAMPLE_FRACTION)

# Bare last expression so the notebook renders the returned metrics.
metrics

In [None]:
# Run inference on the test file with the model trained above.
# NOTE(review): presumably writes the submission under out_path — confirm in inference.py.
run_inference(
    model,
    spark,
    TEST,
    out_path="../submissions/submission_nb",
)