# Imports

In [1]:
import os
import sys
from glob import glob

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Required for Spark to find Python executable: point both the workers and the
# driver at the interpreter running this notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Required to use the correct Java version (Homebrew-installed OpenJDK 17).
# Only override JAVA_HOME when that installation actually exists; on machines
# without it, the previously configured Java is left untouched instead of
# pointing Spark at a directory that is not there.
HOMEBREW_JDK17_HOME = '/opt/homebrew/opt/openjdk@17'
if os.path.isdir(HOMEBREW_JDK17_HOME):
    os.environ['JAVA_HOME'] = HOMEBREW_JDK17_HOME

# Generate huge CSV

In [10]:
import csv
import random

def generate_test_csv(filename: str, num_rows: int = 1000000, seed=None):
    """Write a synthetic CSV file with a header row and ``num_rows`` data rows.

    Columns written: ``id`` (1..num_rows), ``name`` (``User_<id>``), ``amount``,
    ``category`` and ``date``.

    * ``amount`` is drawn from uniform(100, 1000) with probability 0.3 and from
      uniform(1, 150) otherwise, rounded to 2 decimals.
    * ``category`` is picked uniformly from five fixed labels.
    * ``date`` is a random day of July 2025 formatted as ``YYYY-MM-DD``.

    Args:
        filename: Destination path. Missing parent directories are created.
        num_rows: Number of data rows to write (the header is extra).
        seed: Optional seed. When given, a dedicated ``random.Random(seed)`` is
            used so the same seed always yields the same file. When ``None``
            (default), values come from the module-level ``random`` generator.

    Returns:
        None. The file is written as a side effect and progress is printed.
    """
    print(f"Generating {filename} with {num_rows:,} rows...")

    # A seeded, private generator makes the output reproducible without
    # touching global random state; without a seed keep using the module itself.
    rng = random if seed is None else random.Random(seed)

    # open() does not create directories, so make sure the target folder exists.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write header
        writer.writerow(['id', 'name', 'amount', 'category', 'date'])

        # Generate random data
        categories = ['Food', 'Entertainment', 'Shopping', 'Transportation', 'Utilities']

        for i in range(1, num_rows + 1):
            # Generate amounts with different distributions
            if rng.random() < 0.3:  # 30% chance of high amounts
                amount = round(rng.uniform(100, 1000), 2)
            else:  # 70% chance of lower amounts
                amount = round(rng.uniform(1, 150), 2)

            row = [
                i,
                f"User_{i}",
                amount,
                rng.choice(categories),
                f"2025-07-{rng.randint(1, 31):02d}"  # July has 31 days
            ]
            writer.writerow(row)

    print(f"Generated {filename} successfully!")

# Target location and row count of the synthetic dataset used below
size = 1_000_000
file_path = '../data/huge_csv.csv'
generate_test_csv(filename=file_path, num_rows=size)

Generating ../data/huge_csv.csv with 1,000,000 rows...
Generated ../data/huge_csv.csv successfully!


# Start Spark App

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("huge-csv")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/07 20:59:32 WARN Utils: Your hostname, Alissons-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.21 instead (on interface en0)
25/07/07 20:59:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/07 20:59:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read CSV file

In [11]:
# Load the generated CSV; the first line is treated as the header row
csv_df = spark.read.option('header', True).csv(file_path)

In [27]:
# Replace the amount and date columns with typed versions of themselves
csv_df = (
    csv_df
    .withColumn('amount', F.col('amount').cast('float'))
    .withColumn('date', F.col('date').cast('date'))
)

In [28]:
# Show the column names and types of csv_df
csv_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- category: string (nullable = true)
 |-- date: date (nullable = true)



# Filter rows

In [31]:
# Keep only the rows whose category is exactly 'Food'
is_food = F.col('category') == F.lit('Food')
csv_food = csv_df.filter(is_food)

# Aggregate filtered rows

In [46]:
# Per-category summary of the filtered rows: row count, smallest and largest
# amount, and the mean amount rounded to 2 decimals.
(
    csv_food
    .groupBy('category')
    .agg(
        F.count('*').alias('count'),
        F.min(F.col('amount')).alias('min_amount'),
        F.max(F.col('amount')).alias('max_amount'),
        F.round(F.avg(F.col('amount')), 2).alias('avg_amount'),
    )
    .show()
)

+--------+------+----------+----------+----------+
|category| count|min_amount|max_amount|avg_amount|
+--------+------+----------+----------+----------+
|    Food|199806|       1.0|    999.99|    217.67|
+--------+------+----------+----------+----------+

