In [21]:
# Implement a PySpark script that runs Monte Carlo simulations in parallel.

from pyspark.sql import SparkSession
import random

# Obtain the Spark session for the Monte Carlo job via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("MonteCarloSimulation")
    .getOrCreate()
)

def monte_carlo_pi(num_samples):
    """Run one Monte Carlo batch and count the points inside the quarter circle.

    Each sample is a point (x, y) whose coordinates are both drawn with
    ``random.uniform(0, 1)``; it counts as a hit when ``x**2 + y**2 <= 1``.

    Args:
        num_samples: Number of points to draw (passed straight to ``range``).

    Returns:
        The number of hits, an int between 0 and ``num_samples``.
    """
    # x is drawn before y for every point, one point at a time.
    points = (
        (random.uniform(0, 1), random.uniform(0, 1))
        for _ in range(num_samples)
    )
    return sum(1 for px, py in points if px**2 + py**2 <= 1)

# Workload: num_simulations independent tasks, each drawing
# samples_per_simulation random points.
num_simulations = 100
samples_per_simulation = 10000

try:
    # One RDD element per simulation; each element is the number of samples
    # that simulation draws.
    rdd = spark.sparkContext.parallelize([samples_per_simulation] * num_simulations)

    # Map every element through monte_carlo_pi and collect the per-simulation
    # hit counts back to the driver.
    inside_circle_counts = rdd.map(monte_carlo_pi).collect()

    total_inside_circle = sum(inside_circle_counts)

    # The hit fraction approximates the quarter-circle area (pi / 4) inside
    # the unit square, so multiply by 4 to estimate pi.
    total_samples = num_simulations * samples_per_simulation
    estimated_pi = (total_inside_circle / total_samples) * 4

    print(f"Estimated value of pi after {num_simulations} simulations: {estimated_pi}")
finally:
    # Stop the session even if the job above raised, so it is not left running.
    spark.stop()


24/10/07 10:21:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Estimated value of pi after 100 simulations: 3.141664


In [22]:
# Demonstrate how to define and apply probability distributions to input parameters using
# PySpark.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import numpy as np

# Obtain the Spark session for the distribution demo via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("ProbabilityDistributions")
    .getOrCreate()
)

num_samples = 1000

try:
    # NOTE: both draws below use NumPy's global RNG and no seed is set here,
    # so the sampled values differ from run to run.

    # normal distribution (mean=0, std=1)
    normal_samples = np.random.normal(loc=0, scale=1, size=num_samples)

    # uniform distribution (low=0, high=1)
    uniform_samples = np.random.uniform(low=0, high=1, size=num_samples)

    # Pair the two arrays row by row. tolist() converts the NumPy scalars to
    # built-in floats before the rows are handed to Spark.
    data = list(zip(normal_samples.tolist(), uniform_samples.tolist()))

    columns = ["normal_sample", "uniform_sample"]
    df = spark.createDataFrame(data, columns)

    print("DataFrame with Samples:")
    df.show(5)

    # Example of applying the sampled inputs: add a column holding the
    # row-wise sum of the two draws.
    df = df.withColumn("sum_samples", col("normal_sample") + col("uniform_sample"))

    print("Updated DataFrame with Sum of Samples:")
    df.show(5)
finally:
    # Stop the session even if a step above raised, so it is not left running.
    spark.stop()


24/10/07 10:21:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


DataFrame with Samples:
+--------------------+------------------+
|       normal_sample|    uniform_sample|
+--------------------+------------------+
| -0.4771518973604032|0.6363857117498878|
|  0.3936321205016935|0.5390937722016786|
|-0.27895340583342626|0.5281089370622566|
| -0.7684754673532722|0.7907719390812169|
|-0.15098062140476629|0.5589047940559694|
+--------------------+------------------+
only showing top 5 rows

Updated DataFrame with Sum of Samples:
+--------------------+------------------+-------------------+
|       normal_sample|    uniform_sample|        sum_samples|
+--------------------+------------------+-------------------+
| -0.4771518973604032|0.6363857117498878|0.15923381438948458|
|  0.3936321205016935|0.5390937722016786| 0.9327258927033721|
|-0.27895340583342626|0.5281089370622566|0.24915553122883033|
| -0.7684754673532722|0.7907719390812169|0.02229647172794469|
|-0.15098062140476629|0.5589047940559694|0.40792417265120307|
+--------------------+----------------