In [3]:
import os
import shutil
import sys



# Make a standalone Spark distribution importable (adjust paths as needed).
# SPARK_HOME is optional: when it is unset, python_path stays '' and sys.path is
# left untouched, instead of receiving '' and a relative 'lib/py4j-...zip'
# entry that point at nothing useful.
spark_home = os.environ.get('SPARK_HOME', None)  # Optional: set if using standalone Spark
python_path = os.path.join(spark_home, 'python') if spark_home else ''
if python_path:
    sys.path.insert(0, python_path)
    # The bundled Py4J archive name is version specific -- match your Py4J version.
    sys.path.insert(0, os.path.join(python_path, 'lib', 'py4j-0.10.9.7-src.zip'))

In [4]:
# Environment consumed by PySpark when the JVM and Python workers are launched.
# An already-exported SPARK_HOME is respected; the hard-coded install location
# is only a fallback for machines where nothing is configured.
os.environ.setdefault('SPARK_HOME', "/Applications/spark-3.5.5-bin-hadoop3")
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
# Use the interpreter running this kernel for the workers, so driver and workers
# cannot end up on different Python versions (a bare 'python' resolves to
# whatever happens to be first on PATH, or to nothing at all).
os.environ['PYSPARK_PYTHON'] = sys.executable or 'python'

In [5]:
from pyspark.sql import SparkSession

In [8]:
# Build the session used by every example below; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("RDD-DEMO")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/10 17:40:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/10 17:40:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/10 17:40:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# How to create RDDs

In [17]:
# Plain Python list holding the integers 1 through 5 (the upper bound 6 is exclusive).
numbers = [*range(1, 6)]

In [18]:
# Turn the local Python list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(numbers)

In [20]:
# Action: collect() returns every element of the RDD to the driver as a Python
# list. Left as the last expression so the notebook displays the result.
rdd.collect()

                                                                                

[1, 2, 3, 4, 5]

In [21]:
# create an RDD from a list of tuples

In [78]:
# Sample (name, number) pairs; "Ahmed" occurs twice, which gives the key-based
# examples later in the notebook something to combine.
data = [
    ("Driss", 25),
    ("Ahmed", 19),
    ("Yacine", 33),
    ("Ahmed", 27),
]
# Rebinds `rdd`: from here on it refers to the RDD of tuples, not the integers.
rdd = spark.sparkContext.parallelize(data)

In [79]:
# collect() brings every element back to the driver so it can be printed here.
print("All elements of the rdd ",rdd.collect())

All elements of the rdd  [('Driss', 25), ('Ahmed', 19), ('Yacine', 33), ('Ahmed', 27)]


# RDD Actions


In [80]:
# Action: count() returns the number of elements in the RDD.
count = rdd.count()

In [81]:
# Report the element count stored in `count`.
print("the number of elements are : ",count)

the number of elements are :  4


In [82]:
# Action: first() returns the first element of the RDD (here a single tuple).
first_element = rdd.first()
print("the first element of the rdd is: ", first_element)

the first element of the rdd is:  ('Driss', 25)


In [83]:
# Action: take(2) returns the first two elements of the RDD as a Python list.
taken_elements = rdd.take(2)
print("the first two elements of the rdd are: {}".format(taken_elements))

the first two elements of the rdd are: [('Driss', 25), ('Ahmed', 19)]


In [84]:
# Action: foreach() applies the function to every element and returns nothing.
# NOTE(review): the printing is done by whichever task processes each element,
# so the order of the printed lines is not guaranteed to match the input order.
rdd.foreach(print)

('Driss', 25)
('Yacine', 33)
('Ahmed', 19)
('Ahmed', 27)


# RDD Operations: Transformations

In [85]:
# Transformation: map() builds a new RDD by applying the function to each
# (name, number) pair -- the name is capitalized, the number is passed through.
mapped_rdd = rdd.map(lambda pair: (pair[0].capitalize(), pair[1]))

In [86]:
# collect() is the action that actually runs the map and returns its output as a list.
result = mapped_rdd.collect()

In [87]:
# The sample names already start with a capital letter, so capitalize() leaves
# them unchanged and the printed pairs look identical to the input data.
print(result)

[('Driss', 25), ('Ahmed', 19), ('Yacine', 33), ('Ahmed', 27)]


In [88]:
# Filter transformation

In [89]:
# Transformation: filter() keeps only the pairs whose number is strictly greater than 25.
filtred_rdd = rdd.filter(lambda pair: pair[1] > 25)
# Display the surviving pairs.
filtred_rdd.collect()

[('Yacine', 33), ('Ahmed', 27)]

In [93]:
# Transformation: reduceByKey() merges the values of pairs that share a key
# using the given function -- here the numbers for each name are added together.
reduced_rdd = rdd.reduceByKey(lambda total, value: total + value)
# Display one (name, sum) pair per distinct name.
reduced_rdd.collect()

[('Yacine', 33), ('Driss', 25), ('Ahmed', 46)]

# Save an RDD to a text file and read it back


In [94]:
# saveAsTextFile() writes a *directory* named "output.txt" containing one part
# file per partition, and it refuses to write if that path already exists --
# so re-running this cell (or Restart & Run All) would fail on the second run.
# Remove the output of any previous run first to keep the cell re-runnable.
# NOTE(review): assumes the relative path resolves on the local filesystem in
# the kernel's working directory (local-mode Spark) -- confirm for other setups.
shutil.rmtree("output.txt", ignore_errors=True)
rdd.saveAsTextFile("output.txt")

In [95]:
# Read the saved output back as an RDD of text lines. Each element is now a
# string such as "('Driss', 25)", not the original tuple.
rdd_text = spark.sparkContext.textFile("output.txt")

In [106]:
# Print every line that was read back from the saved output.
# NOTE(review): the printing is done by whichever task processes each line, so
# the order of the printed lines is not guaranteed.
rdd_text.foreach(print)

('Driss', 25)
('Yacine', 33)
('Ahmed', 19)
('Ahmed', 27)
