# In [None]:
import logging
import warnings
from matplotlib import MatplotlibDeprecationWarning
from pyspark.sql import SparkSession
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings

# Emit INFO-level (and above) log records from this script and its libraries.
logging.basicConfig(level=logging.INFO)

# Where the CSV is read from and where the HTML report is written.
input_path = "/input/cleaned_titles.csv"
output_path = "cleaned_titles_report.html"

# Build (or reuse) a Spark session running on the local master, applying the
# memory settings one by one.
_builder = SparkSession.builder.appName("SparkProfiling").master("local[*]")
for _conf_key, _conf_value in (
    ("spark.executor.memory", "4g"),
    ("spark.driver.memory", "4g"),
):
    _builder = _builder.config(_conf_key, _conf_value)
spark_session = _builder.getOrCreate()

# Load the CSV: the first row is treated as the header and column types are
# inferred by Spark.
_csv_options = {"header": True, "inferSchema": True}
spark_df = spark_session.read.csv(input_path, **_csv_options)

# Profiling configuration: switch off dtype inference, the listed correlation
# measures, continuous interactions, every missing-value diagram, and the
# tail/random sample sections of the report.
cfg = Settings()
cfg.infer_dtypes = False
for _correlation in ("auto", "pearson", "spearman"):
    cfg.correlations[_correlation].calculate = False
cfg.interactions.continuous = False
for _diagram in ("bar", "heatmap", "matrix"):
    cfg.missing_diagrams[_diagram] = False
cfg.samples.tail = 0
cfg.samples.random = 0

# Suppress Matplotlib deprecation warnings before the report is built
# (presumably triggered by the plotting done during profiling).
warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)

# Profile the Spark DataFrame with the configuration above and write the
# result to the output path.
profile = ProfileReport(spark_df, config=cfg)
profile.to_file(output_path)

print(f"Profile report saved to {output_path}")
