# Spark Application

## Imports and environment initialization

In [1]:
import os
# Point this kernel at the local Java / Spark / Hadoop installs and at the
# Python interpreter PySpark should launch.
# NOTE(review): these are machine-specific absolute paths; adjust them before
# running the notebook on another machine.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"
os.environ["SPARK_HOME"] = "C:/Spark/spark-3.5.1-bin-hadoop3"
os.environ["HADOOP_HOME"] = "C:/Hadoop"
os.environ["PYSPARK_PYTHON"] = "C:/Users/chris/AppData/Local/Programs/Python/Python311/python.exe"

# Directories to put at the front of PATH. PATH entries must be directories,
# so the interpreter contributes its parent folder, not the python.exe file.
_tool_dirs = [
    os.path.join(os.environ["JAVA_HOME"], "bin"),
    os.path.join(os.environ["SPARK_HOME"], "bin"),
    os.path.join(os.environ["HADOOP_HOME"], "bin"),
    os.path.dirname(os.environ["PYSPARK_PYTHON"]),
]

# Remove earlier copies of these entries before prepending, so re-running this
# cell keeps the same priority order without growing PATH on every execution.
_other_dirs = [
    entry
    for entry in os.environ.get("PATH", "").split(os.pathsep)
    if entry and entry not in _tool_dirs
]
os.environ["PATH"] = os.pathsep.join(_tool_dirs + _other_dirs)

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

## Create & Configure Session

In [3]:
def create_session():
    """Start the Spark application for this lab and hand back its handles.

    Returns:
        tuple: ``(sc, spark)`` - the ``SparkContext`` constructed from the
        configuration below, and the ``SparkSession`` obtained from
        ``SparkSession.builder.getOrCreate()``.
    """
    settings = (
        SparkConf()
        .setAppName("DIS-lab-1")                   # name of the Spark application
        .setMaster("local[*]")                     # local master; [*] = all available cores
        .set("spark.driver.memory", "2G")          # memory for the driver process
        .set("spark.driver.maxResultSize", "2g")   # cap on results returned to the driver
        .set("spark.executor.memory", "1G")        # memory for each executor
    )

    # Create the context from this configuration, then obtain a session.
    context = SparkContext(conf=settings)
    session = SparkSession.builder.getOrCreate()

    return context, session

In [4]:
# Best-effort cleanup: stop whatever context/session an earlier run of this
# cell left in the notebook namespace. Failures are reported, not raised.
try:
    if globals().get("sc") is not None:
        sc.stop()
        print("--Stopped existing SparkContext")
    if isinstance(globals().get("spark"), SparkSession):
        spark.stop()
        print("--Stopped existing SparkSession")
except Exception as err:
    print(f"Error stopping existing Spark session or context: {err}")

# Start a fresh application; the bare `spark` on the last line makes the
# notebook display the session object.
sc, spark = create_session()
print("Spark session created successfully!")
spark

--Stopped existing SparkContext
--Stopped existing SparkSession
Spark session created successfully!


In [5]:
# Fetch the SparkConf of the context behind the active session.
conf = spark.sparkContext.getConf()

# Emit one "key: value" line per configuration entry.
for key, value in conf.getAll():
    print(key, value, sep=": ")

spark.driver.port: 60536
spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false
spark.driver.memory: 2G
spark.app.startTime: 1715814630947
spark.driver.host: Belmont.ho

### Example

#### Parallelize a Collection

In [6]:
# Sample records, one per line: (name, age, tag).
# NOTE(review): the third field looks like a role-prefixed identifier - confirm.
data = [
    ("Vahid", 26, "Phd-1234"),
    ("Yannis", 41, "Prof-2314"),
    ("Enas", 35, "Phd-2356"),
    ("Duygu", 27, "Phd-3517"),
    ("Tom", 25, "Phd-2359"),
    ("Ioana", 35, "Prof-1246"),
    ("Timo", 45, "Msc-5431"),
    ("John", 33, "Msc-7194"),
    ("Linda", 30, "Phd-7890"),
    ("Mike", 29, "Msc-2468"),
    ("Sara", 38, "Prof-9753"),
    ("Emma", 31, "Phd-1478"),
    ("Alex", 36, "Prof-3698"),
    ("Sophia", 32, "Msc-8521"),
    ("Daniel", 28, "Phd-6397"),
    ("Olivia", 34, "Prof-4268"),
    ("Robert", 42, "Phd-5192"),
    ("Emily", 27, "Msc-6152"),
    ("Max", 29, "Msc-9472"),
    ("Ava", 31, "Prof-2468"),
    ("William", 33, "Phd-1382"),
    ("Mia", 35, "Prof-7634"),
    ("James", 40, "Msc-9573"),
    ("Charlotte", 28, "Phd-4297"),
    ("Benjamin", 37, "Prof-7641"),
    ("Jack", 29, "Phd-7641"),
]

# Distribute the in-memory list through the Spark context and show the
# type of the resulting object.
rdd = sc.parallelize(data)
print(type(rdd))

<class 'pyspark.rdd.RDD'>


#### Task 1: Count the people above 30 and compute their average age

In [7]:
# Task 1: count the records whose age (tuple index 1) is above 30 and report
# their total and average age. Errors are reported with a Task-1 prefix
# instead of being raised.
try:
    # Three separate actions below (collect, count, sum) read this RDD, so it
    # is cached to avoid re-running the filter for each of them.
    above_30_RDD = rdd.filter(lambda x: x[1] > 30).cache()
    print("Filtered RDD for people above 30 created successfully.")
    print(f"Filtered RDD: {above_30_RDD.collect()}")

    # Count the number of people above 30
    count_above_30 = above_30_RDD.count()
    print(f"Count of people above 30: {count_above_30}")

    # Calculate the total age of people above 30
    total_age_above_30 = above_30_RDD.map(lambda x: x[1]).sum()
    print(f"Total age of people above 30: {total_age_above_30}")

    # Calculate the average age of people above 30. With no matching records
    # the average is undefined, so say so explicitly rather than dividing by
    # zero and surfacing it as a generic error.
    if count_above_30:
        avg_age_above_30 = total_age_above_30 / count_above_30
        print(f"Average age of people above 30: {avg_age_above_30:.2f}")
    else:
        avg_age_above_30 = None
        print("Average age of people above 30: n/a (no matching records)")
except Exception as e:
    print(f"Error in Task 1: {e}")

Filtered RDD for people above 30 created successfully.
Filtered RDD: [('Yannis', 41, 'Prof-2314'), ('Enas', 35, 'Phd-2356'), ('Ioana', 35, 'Prof-1246'), ('Timo', 45, 'Msc-5431'), ('John', 33, 'Msc-7194'), ('Sara', 38, 'Prof-9753'), ('Emma', 31, 'Phd-1478'), ('Alex', 36, 'Prof-3698'), ('Sophia', 32, 'Msc-8521'), ('Olivia', 34, 'Prof-4268'), ('Robert', 42, 'Phd-5192'), ('Ava', 31, 'Prof-2468'), ('William', 33, 'Phd-1382'), ('Mia', 35, 'Prof-7634'), ('James', 40, 'Msc-9573'), ('Benjamin', 37, 'Prof-7641')]
Count of people above 30: 16
Total age of people above 30: 578
Average age of people above 30: 36.12
