In [2]:
import requests
try:
    response = requests.get("http://minio:9000", timeout=5)
    print(f"✅ MinIO reachable (HTTP {response.status_code})")
except Exception as e:
    print(f"❌ MinIO connection failed: {e}")

✅ MinIO reachable (HTTP 403)


In [4]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("MinIO-Integration")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Performance and reliability settings:
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .config("spark.hadoop.fs.s3a.attempts.maximum", "3")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    # Use local JARs instead of downloading:
    .config("spark.jars", "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar")
    .config("spark.driver.extraClassPath", "/opt/spark/jars/*")
    .config("spark.executor.extraClassPath", "/opt/spark/jars/*")
    .getOrCreate())

# Verify configuration
print("Spark session created with MinIO config:")
for (k,v) in sorted(spark.sparkContext._conf.getAll()):
    if "s3a" in k.lower():
        print(f"{k}: {v}")

Spark session created with MinIO config:
spark.hadoop.fs.s3a.access.key: minioadmin
spark.hadoop.fs.s3a.attempts.maximum: 3
spark.hadoop.fs.s3a.connection.establish.timeout: 5000
spark.hadoop.fs.s3a.connection.ssl.enabled: false
spark.hadoop.fs.s3a.connection.timeout: 10000
spark.hadoop.fs.s3a.endpoint: http://minio:9000
spark.hadoop.fs.s3a.fast.upload: true
spark.hadoop.fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.path.style.access: true
spark.hadoop.fs.s3a.secret.key: minioadmin


In [5]:
# List files in your bucket
files = spark.sparkContext.wholeTextFiles("s3a://default/data/")
print(f"Found {files.count()} files in bucket:")
for path in files.collect()[:3]:  # Show first 3 files
    print(path[0])  # Prints S3 paths

Found 2 files in bucket:
s3a://default/data/smart_meter_data.json
s3a://default/data/weather_data.parquet


In [8]:
# 1. First let's see the raw JSON content
print("Inspecting first 5 lines of raw JSON file:")
raw_json = spark.read.text("s3a://default/data/smart_meter_data.json")
raw_json.show(18, truncate=False)

Inspecting first 5 lines of raw JSON file:
+-----------------------------------------------+
|value                                          |
+-----------------------------------------------+
|[                                              |
|  {                                            |
|    "meter_id": "WELLINGTON_626",              |
|    "timestamp": "2025-06-10T06:10:07.414848Z",|
|    "kwh_usage": 3.54,                         |
|    "voltage": 230,                            |
|    "customer_id": "CUST_7591",                |
|    "region": "Christchurch"                   |
|  },                                           |
|  {                                            |
|    "meter_id": "DUNEDIN_660",                 |
|    "timestamp": "2025-06-10T21:25:07.414960Z",|
|    "kwh_usage": 2.14,                         |
|    "voltage": 240,                            |
|    "customer_id": "CUST_1812",                |
|    "region": "Auckland"                       |
|  },  

In [6]:
from pyspark.sql.functions import avg

# Continue working with the weather data
weather_df = spark.read.parquet("s3a://default/data/weather_data.parquet")

print("\nWeather Data Analysis:")
print(f"Total records: {weather_df.count()}")
print("\nSummary statistics:")
weather_df.describe().show()

print("\nAverage rainfall by region:")
weather_df.groupBy("region").agg(
    avg("rainfall_mm").alias("avg_rainfall"),
    avg("max_temp_c").alias("avg_max_temp"),
    avg("min_temp_c").alias("avg_min_temp")
).show()


Weather Data Analysis:
Total records: 450

Summary statistics:
+-------+----------+----------+-----------------+-----------------+------------------+
|summary|      date|    region|       max_temp_c|       min_temp_c|       rainfall_mm|
+-------+----------+----------+-----------------+-----------------+------------------+
|  count|       450|       450|              450|              450|               450|
|   mean|      NULL|      NULL|17.53088888888889|7.988666666666666|10.167777777777786|
| stddev|      NULL|      NULL|4.261962241501345|4.512712111506372| 5.806240935709952|
|    min|2025-03-12|  Auckland|             10.0|              0.0|               0.0|
|    max|2025-06-09|Wellington|             24.9|             15.0|              20.0|
+-------+----------+----------+-----------------+-----------------+------------------+


Average rainfall by region:
+------------+------------------+------------------+-----------------+
|      region|      avg_rainfall|      avg_max_temp|