# PySpark Environment

In [1]:
# Set the pyspark environment
# These variables are consumed when the Spark session is started, so this cell
# has to run before the SparkSession is created.
import importlib.util
import os
import sys

# Fallback SPARK_HOME: the pyspark package inside the conda environment at
# /Users/rmontecino/anaconda3/envs/pyspark-env
_DEFAULT_SPARK_HOME = '/Users/rmontecino/anaconda3/envs/pyspark-env/lib/python3.12/site-packages/pyspark'

# Prefer the pyspark package this kernel will actually import, so SPARK_HOME and
# the imported Python package cannot come from two different installations.
# find_spec() only locates the package; it does not import it.
try:
    _pyspark_spec = importlib.util.find_spec('pyspark')
except (ImportError, ValueError):
    _pyspark_spec = None
_pyspark_dirs = list(_pyspark_spec.submodule_search_locations or []) if _pyspark_spec else []

# Only accept a directory that has the pip/conda layout (bin/spark-submit inside
# the package); otherwise keep the original hard-coded location.
os.environ['SPARK_HOME'] = next(
    (d for d in _pyspark_dirs if os.path.isfile(os.path.join(d, 'bin', 'spark-submit'))),
    _DEFAULT_SPARK_HOME,
)

# NOTE(review): this pair is the usual recipe for making the `pyspark` launcher
# open Jupyter; it is probably redundant inside an already running notebook.
# Kept unchanged to preserve the original configuration.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'

# Run the Python workers with the same interpreter as this kernel. A bare
# 'python3' is resolved through PATH and may be a different Python version,
# which Spark rejects when the first job runs.
os.environ['PYSPARK_PYTHON'] = sys.executable or 'python3'

In [2]:
# Import SparkSession, the entry point used to start Spark
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session named 'SparkSQL', or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)

24/08/07 21:22:57 WARN Utils: Your hostname, Ricardos-MacBook-Pro-CleverIT.local resolves to a loopback address: 127.0.0.1; using 192.168.1.86 instead (on interface en0)
24/08/07 21:22:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/07 21:22:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/07 21:22:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Creating RDDs

## Creating RDD from a list

In [4]:
# Local Python list holding the integers 1 through 5
data = list(range(1, 6))

# parallelize() turns the local list into an RDD (this is not an action)
distData = spark.sparkContext.parallelize(data)

# collect() is the action: it returns the RDD's elements as a Python list
distData.collect()


                                                                                

[1, 2, 3, 4, 5]

## Creating RDD from a list of tuples

In [5]:
# (name, age) pairs with ages between 20 and 49; 'Alice' appears twice on purpose
data = [
    ('Alice', 34),
    ('Bob', 45),
    ('Charlie', 23),
    ('David', 49),
    ('Alice', 28),
]
distData = spark.sparkContext.parallelize(data)

# collect() returns the pairs to the notebook so they can be displayed
distData.collect()

[('Alice', 34), ('Bob', 45), ('Charlie', 23), ('David', 49), ('Alice', 28)]

# RDD Transformations

## Map Transformation

In [6]:
# Map transformation: upper-case the name of every (name, age) pair, keep the age as is
mappedData = distData.map(lambda person: (person[0].upper(), person[1]))

# collect() returns the transformed pairs for display
mappedData.collect()

                                                                                

[('ALICE', 34), ('BOB', 45), ('CHARLIE', 23), ('DAVID', 49), ('ALICE', 28)]

## Filter Transformation

In [7]:
# Filter transformation: keep only the pairs whose age (second field) is above 30
filteredData = mappedData.filter(lambda person: person[1] > 30)

# collect() returns the remaining pairs for display
filteredData.collect()

[('ALICE', 34), ('BOB', 45), ('DAVID', 49)]

## ReduceByKey Transformation

In [8]:
# ReduceByKey: add up the ages that share the same name (the name is the key)
reducedData = filteredData.reduceByKey(lambda left, right: left + right)

# collect() returns one (name, total) pair per name; the order shown is not the input order
reducedData.collect()

[('DAVID', 49), ('ALICE', 34), ('BOB', 45)]

## SortBy Transformation

In [9]:
# SortBy transformation: order the pairs by age (second field), largest first
sortedData = reducedData.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

# collect() returns the sorted pairs for display
sortedData.collect()

[('DAVID', 49), ('BOB', 45), ('ALICE', 34)]