In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, stddev, expr, count, sum as Fsum
from pyspark.sql.window import Window
import random
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.clustering import KMeans

# Build (or reuse) a session attached to the standalone master on localhost.
spark = (
    SparkSession.builder
    .appName("Advanced PySpark Test")
    .master("spark://localhost:7077")
    # Driver networking: advertise "localhost", listen on every interface,
    # and pin the driver RPC port.
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.port", "7078")
    # Requested executor sizing.
    # NOTE(review): spark.executor.instances may not be honored by a
    # standalone master -- confirm against the cluster's actual executor count.
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "4")
    # Scratch directory. Spark logs a warning that the cluster manager's own
    # setting (e.g. SPARK_LOCAL_DIRS) overrides this value.
    .config("spark.local.dir", "/tmp/spark-temp")
    .getOrCreate()
)

# Have Spark log at INFO verbosity
spark.sparkContext.setLogLevel("INFO")

# Report the Spark version as a quick check that the session is usable
print("Spark version:", spark.version)

# Build the synthetic rows on the driver. Each row is
# (integer in 1..100, float in [0, 1), one of the letters A/B/C).
num_samples = 1_000_000
data = [
    (
        random.randint(1, 100),
        random.random(),
        random.choice(['A', 'B', 'C']),
    )
    for _ in range(num_samples)
]

# Column names, listed in the same order as the tuple fields
columns = ["IntegerColumn", "FloatColumn", "Category"]

# Hand the local rows to Spark as a DataFrame
df = spark.createDataFrame(data, schema=columns)

# Preview the first five rows
print("Initial DataFrame:")
df.show(n=5)

# Perform transformations
# 1. Compute average and standard deviation of FloatColumn in one aggregation
stats = df.agg(
    avg("FloatColumn").alias("Average_Float"),
    stddev("FloatColumn").alias("StdDev_Float"),
)

print("Statistics of FloatColumn:")
stats.show()

# 2. Add a new column with normalized (z-score) values of FloatColumn.
# Fetch the single aggregate row ONCE and read both values from it: `stats`
# is not cached, so every separate collect()/first() call launches another
# Spark job that re-runs the aggregation over all of `df`.
stats_row = stats.first()
average_float = stats_row["Average_Float"]
stddev_float = stats_row["StdDev_Float"]

df = df.withColumn("NormalizedFloat", (col("FloatColumn") - average_float) / stddev_float)

# 3. Per-category summary: number of IntegerColumn values and their mean
per_category_aggregates = [
    count("IntegerColumn").alias("Count"),
    avg("IntegerColumn").alias("Average_Integer"),
]
grouped_stats = df.groupBy("Category").agg(*per_category_aggregates)

print("Grouped Statistics by Category:")
grouped_stats.show()

# 4. Derived column evaluated from a SQL expression string
complex_expression = expr("IntegerColumn * sin(FloatColumn) + cos(FloatColumn)")

# 5. Running total of IntegerColumn within each Category: the frame spans
# from the first row of the partition up to and including the current row.
# NOTE(review): the ordering key is IntegerColumn alone and many rows share a
# value, so the relative order of tied rows is presumably not fixed -- confirm
# whether a tie-breaker column is wanted.
window_spec = (
    Window.partitionBy("Category")
    .orderBy("IntegerColumn")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
running_total = Fsum("IntegerColumn").over(window_spec)

df = (
    df.withColumn("ComplexExpression", complex_expression)
    .withColumn("RunningTotal", running_total)
)

print("Transformed DataFrame with Running Total:")
df.show(5)

# 6. Machine Learning - Linear Regression
# Generate synthetic data for regression: Label = 10 * Feature + noise, with
# both Feature and the noise term drawn uniformly from [0, 1).
# The feature value is drawn once per row and reused when building the label.
# Calling random.random() separately for each position would make Label
# independent of Feature and leave the regression with nothing to learn.
ml_data = [
    (feature, feature * 10 + random.random())
    for feature in (random.random() for _ in range(num_samples))
]
ml_columns = ["Feature", "Label"]

ml_df = spark.createDataFrame(ml_data, ml_columns)

# Pack the single input column into the vector column the estimator reads
vector_assembler = VectorAssembler(
    inputCols=["Feature"],
    outputCol="features",
)
ml_df = vector_assembler.transform(ml_df)

# 80/20 train/test split; the fixed seed keeps the split repeatable
train_data, test_data = ml_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit the linear model on the training portion
lr = LinearRegression(
    featuresCol='features',
    labelCol='Label',
)
lr_model = lr.fit(train_data)

# Report the fitted parameters
print("Linear Regression Coefficients: ", lr_model.coefficients)
print("Linear Regression Intercept: ", lr_model.intercept)

# Score the held-out rows and preview a few predictions
predictions = lr_model.transform(test_data)
preview_columns = ["Feature", "Label", "prediction"]
predictions.select(*preview_columns).show(5)

# Measure test error as RMSE between "prediction" and "Label"
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Label",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# 7. Machine Learning - Clustering with K-Means
# Synthetic points: two independently drawn floats in [0, 1) per row
clustering_data = [
    (random.random(), random.random())
    for _ in range(num_samples)
]
clustering_columns = ["Feature1", "Feature2"]

clustering_df = spark.createDataFrame(clustering_data, schema=clustering_columns)

# Combine both coordinates into one "features" vector column
vector_assembler_clustering = VectorAssembler(
    inputCols=clustering_columns,
    outputCol="features",
)
clustering_df = vector_assembler_clustering.transform(clustering_df)

# Fit three clusters with a fixed seed. No features column is passed, so the
# estimator's default column name is used.
kmeans = KMeans(k=3, seed=1)
kmeans_model = kmeans.fit(clustering_df)

# Assign every point to a cluster and preview a few assignments
cluster_predictions = kmeans_model.transform(clustering_df)
cluster_predictions.select(*clustering_columns, "prediction").show(5)

# Shut the session down now that all work is finished
spark.stop()


24/06/28 22:12:28 INFO SparkContext: Running Spark version 3.5.1
24/06/28 22:12:28 INFO SparkContext: OS info Linux, 6.2.0-39-generic, amd64
24/06/28 22:12:28 INFO SparkContext: Java version 17.0.9
24/06/28 22:12:28 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/06/28 22:12:28 INFO ResourceUtils: No custom resources configured for spark.driver.
24/06/28 22:12:28 INFO SparkContext: Submitted application: Advanced PySpark Test
24/06/28 22:12:28 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 4, script: , vendor: , memory -> name: memory, amount: 4096, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
24/06/28 22:12:28 INFO ResourceProfile: Limiting resource is cpus at 4 tasks per executor
24/06/28 22:12:28 INFO Reso

Spark version: 3.5.1


24/06/28 22:12:29 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
24/06/28 22:12:29 INFO SharedState: Warehouse path is 'file:/root/Trustia/Cicada-binance/spark-warehouse'.
24/06/28 22:12:30 INFO StandaloneSchedulerBackend$StandaloneDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (127.0.0.1:41994) with ID 0,  ResourceProfileId 0
24/06/28 22:12:30 INFO StandaloneSchedulerBackend$StandaloneDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (127.0.0.1:42000) with ID 1,  ResourceProfileId 0
24/06/28 22:12:30 INFO BlockManagerMasterEndpoint: Registering block manager 127.0.0.1:36899 with 2.2 GiB RAM, BlockManagerId(0, 127.0.0.1, 36899, None)
24/06/28 22:12:30 INFO BlockManagerMasterEndpoint: Registering block manager 127.0.0.1:42841 with 2.2 GiB RAM, BlockManagerId(1, 127.0.0.1, 42841, None)


Initial DataFrame:


24/06/28 22:12:44 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
24/06/28 22:12:44 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
24/06/28 22:12:44 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
24/06/28 22:12:44 INFO DAGScheduler: Parents of final stage: List()
24/06/28 22:12:44 INFO DAGScheduler: Missing parents: List()
24/06/28 22:12:44 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
24/06/28 22:12:44 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 13.2 KiB, free 434.4 MiB)
24/06/28 22:12:44 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 6.8 KiB, free 434.4 MiB)
24/06/28 22:12:44 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:44785 (size: 6.8 KiB, free: 434.4 MiB)


+-------------+-------------------+--------+
|IntegerColumn|        FloatColumn|Category|
+-------------+-------------------+--------+
|           96|0.17959478724299838|       A|
|           72|0.26446945233007657|       B|
|           84|0.17098642346686066|       C|
|           40| 0.6116877547081923|       B|
|           32| 0.7479216165525343|       B|
+-------------+-------------------+--------+
only showing top 5 rows

Statistics of FloatColumn:


24/06/28 22:12:46 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 127.0.0.1:36899 (size: 10.3 KiB, free: 2.2 GiB)
24/06/28 22:12:46 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 2) in 486 ms on 127.0.0.1 (executor 1) (1/8)
24/06/28 22:12:46 INFO TaskSetManager: Finished task 5.0 in stage 1.0 (TID 6) in 482 ms on 127.0.0.1 (executor 1) (2/8)
24/06/28 22:12:46 INFO TaskSetManager: Finished task 3.0 in stage 1.0 (TID 4) in 485 ms on 127.0.0.1 (executor 1) (3/8)
24/06/28 22:12:46 INFO TaskSetManager: Finished task 7.0 in stage 1.0 (TID 8) in 481 ms on 127.0.0.1 (executor 1) (4/8)
24/06/28 22:12:47 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 3) in 1646 ms on 127.0.0.1 (executor 0) (5/8)
24/06/28 22:12:47 INFO TaskSetManager: Finished task 6.0 in stage 1.0 (TID 7) in 1642 ms on 127.0.0.1 (executor 0) (6/8)
24/06/28 22:12:47 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 1653 ms on 127.0.0.1 (executor 0) (7/8)
24/06/28 22:12:47 INFO TaskSet

+------------------+------------------+
|     Average_Float|      StdDev_Float|
+------------------+------------------+
|0.5004873761859209|0.2886989718775669|
+------------------+------------------+



24/06/28 22:12:47 INFO TaskSetManager: Finished task 1.0 in stage 4.0 (TID 11) in 271 ms on 127.0.0.1 (executor 1) (1/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 3.0 in stage 4.0 (TID 13) in 293 ms on 127.0.0.1 (executor 1) (2/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 7.0 in stage 4.0 (TID 17) in 297 ms on 127.0.0.1 (executor 1) (3/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 5.0 in stage 4.0 (TID 15) in 325 ms on 127.0.0.1 (executor 1) (4/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 2.0 in stage 4.0 (TID 12) in 333 ms on 127.0.0.1 (executor 0) (5/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 4.0 in stage 4.0 (TID 14) in 333 ms on 127.0.0.1 (executor 0) (6/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 6.0 in stage 4.0 (TID 16) in 339 ms on 127.0.0.1 (executor 0) (7/8)
24/06/28 22:12:48 INFO TaskSetManager: Finished task 0.0 in stage 4.0 (TID 10) in 344 ms on 127.0.0.1 (executor 0) (8/8)
24/06/28 22:12:48 INFO TaskSched

Grouped Statistics by Category:


24/06/28 22:12:48 INFO BlockManagerInfo: Removed broadcast_4_piece0 on localhost:44785 in memory (size: 7.9 KiB, free: 434.4 MiB)
24/06/28 22:12:48 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on 127.0.0.1:36899 (size: 19.9 KiB, free: 2.2 GiB)
24/06/28 22:12:48 INFO BlockManagerInfo: Removed broadcast_4_piece0 on 127.0.0.1:42841 in memory (size: 7.9 KiB, free: 2.2 GiB)
24/06/28 22:12:48 INFO BlockManagerInfo: Removed broadcast_3_piece0 on localhost:44785 in memory (size: 10.3 KiB, free: 434.4 MiB)
24/06/28 22:12:48 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 127.0.0.1:42841 in memory (size: 10.3 KiB, free: 2.2 GiB)
24/06/28 22:12:48 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 127.0.0.1:36899 in memory (size: 10.3 KiB, free: 2.2 GiB)
24/06/28 22:12:49 INFO TaskSetManager: Finished task 2.0 in stage 9.0 (TID 22) in 661 ms on 127.0.0.1 (executor 1) (1/8)
24/06/28 22:12:49 INFO TaskSetManager: Finished task 4.0 in stage 9.0 (TID 24) in 660 ms on 127.0.0.1 (ex

+--------+------+-----------------+
|Category| Count|  Average_Integer|
+--------+------+-----------------+
|       B|333196| 50.4520972640728|
|       C|332430|50.54151851517613|
|       A|334374|50.48330611829867|
+--------+------+-----------------+

Transformed DataFrame with Running Total:


24/06/28 22:12:49 INFO CodeGenerator: Code generated in 7.039687 ms
24/06/28 22:12:49 INFO DAGScheduler: Registering RDD 25 (showString at NativeMethodAccessorImpl.java:0) as input to shuffle 3
24/06/28 22:12:49 INFO DAGScheduler: Got map stage job 8 (showString at NativeMethodAccessorImpl.java:0) with 8 output partitions
24/06/28 22:12:49 INFO DAGScheduler: Final stage: ShuffleMapStage 12 (showString at NativeMethodAccessorImpl.java:0)
24/06/28 22:12:49 INFO DAGScheduler: Parents of final stage: List()
24/06/28 22:12:49 INFO DAGScheduler: Missing parents: List()
24/06/28 22:12:49 INFO DAGScheduler: Submitting ShuffleMapStage 12 (MapPartitionsRDD[25] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
24/06/28 22:12:49 INFO MemoryStore: Block broadcast_8 stored as values in memory (estimated size 17.6 KiB, free 434.3 MiB)
24/06/28 22:12:49 INFO MemoryStore: Block broadcast_8_piece0 stored as bytes in memory (estimated size 9.0 KiB, free 434.2 MiB)
24/06/28 2

+-------------+-------------------+--------+--------------------+------------------+------------+
|IntegerColumn|        FloatColumn|Category|     NormalizedFloat| ComplexExpression|RunningTotal|
+-------------+-------------------+--------+--------------------+------------------+------------+
|            1| 0.4138824354331524|       B|  -0.299983544068513|1.3177333169553478|           1|
|            1|0.04896025268316473|       B| -1.5640066903121579| 1.047742380769839|           2|
|            1| 0.4577844569536277|       B|-0.14791503743353437| 1.338995659788864|           3|
|            1|0.23914566324267317|       B|  -0.905239499966349| 1.208413388476698|           4|
|            1| 0.7820474784777913|       B|  0.9752722722243572| 1.414205623629453|           5|
+-------------+-------------------+--------+--------------------+------------------+------------+
only showing top 5 rows

