
# Notebook: 00-Environment-Check

BRAUX Owen and CAMBIER Elliot

    Description: This notebook captures details of the execution environment (the OS, Python, Java, and Spark versions, plus the active Spark configuration) and generates the ENV.md file required by the project checklist.

    Run this notebook once at the beginning of the project or whenever the environment changes.


In [3]:
import sys
import platform
import subprocess
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) the Spark session for this notebook,
# running against the local master "local[*]".
spark = (
    SparkSession.builder
    .appName("BDA - Environment Check")
    .master("local[*]")
    .getOrCreate()
)

# Helper Function to Get Java Version
def get_java_version():
    """Return the version line reported by ``java -version``.

    ``java -version`` writes its banner to stderr, so stderr is merged into
    the captured output. When ``JAVA_TOOL_OPTIONS`` or ``_JAVA_OPTIONS`` is
    set, the JVM prints a "Picked up ..." notice before the banner; those
    notices are skipped so the real version line is returned.

    Returns:
        The first non-blank line of output that is not a "Picked up ..."
        notice, or a "Could not determine Java version: ..." message when
        Java cannot be executed or prints no usable line.
    """
    try:
        output = subprocess.check_output(
            ["java", "-version"], stderr=subprocess.STDOUT
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: the java executable is missing or not runnable.
        # SubprocessError: java exited with a non-zero status.
        return f"Could not determine Java version: {e}"

    # errors="replace" keeps a stray non-UTF-8 byte from aborting the check.
    text = output.decode("utf-8", errors="replace")
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line and not line.startswith("Picked up "):
            return line
    return "Could not determine Java version: no version line in output"

# Everything that needs the live Spark session runs inside try/finally so
# the session is stopped even if gathering the details or writing the
# file raises.
try:
    # Gather All Environment Information
    java_version_str = get_java_version()
    # Sorted so the configuration listing is stable between runs.
    spark_conf_items = sorted(spark.sparkContext.getConf().getAll())

    # Format the output for ENV.md
    env_lines = [
        "# BDA Final Project - Environment Summary",
        "",
        "This file documents the environment used to run the project pipeline, ensuring reproducibility.",
        "",
        "## Key Components",
        f"- **Operating System:** `{platform.platform()}`",
        f"- **Python Version:** `{sys.version.split()[0]}`",
        f"- **PySpark Version:** `{pyspark.__version__}`",
        f"- **Apache Spark Version:** `{spark.version}`",
        f"- **Java Version:** `{java_version_str}`",
        "",
        "## Spark Configuration",
        "The following Spark configuration was active during the execution:",
        "",
    ]

    # Add one bullet per Spark configuration property
    env_lines.extend(
        f"- `{key}`: `{value}`" for key, value in spark_conf_items
    )

    # Write to ENV.md file; the relative path is resolved against the
    # current working directory of the notebook kernel.
    env_file_path = Path("../ENV.md")
    env_file_path.write_text("\n".join(env_lines) + "\n", encoding='utf-8')

    print("Successfully generated environment file:)")
    print(f"File location: {env_file_path.resolve()}")
finally:
    spark.stop()

25/11/14 11:25:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Successfully generated environment file:)
File location: /home/owenb/big_data/Project_Big_Data_Analytics/ENV.md
