In [8]:
##1 Create an RDD from [5, 10, 15, 20, 25] and compute the average value.

from pyspark import SparkContext

# Initialize Spark context
sc = SparkContext.getOrCreate()

# Create an RDD from a Python list
numbers = [5, 10, 15, 20, 25]
rdd = sc.parallelize(numbers)  # parallelize converts list to RDD

# Compute the sum of all numbers using reduce
total_sum = rdd.reduce(lambda x, y: x + y)  # reduce applies the lambda cumulatively

# Count the total number of elements in the RDD
count = rdd.count() 

# Compute the average
average = total_sum / count

# Print the result
print("Average:", average)  # Output: Average: 15.0

Average: 15.0


In [9]:
##2 Create an RDD from [1, 2, 2, 3, 4, 4, 5] and count distinct numbers.


# Create an RDD from a Python list
numbers = [1, 2, 2, 3, 4, 4, 5]
rdd = sc.parallelize(numbers)  # Convert list to RDD

# Get distinct values and count them
distinct_count = rdd.distinct().count()

# distinct() removes duplicates
# count() returns total unique elements

# Print the result
print("Number of distinct values:", distinct_count)

Number of distinct values: 5


In [10]:
##3 Create an RDD from [3, 8, 2, 10, 6] and find the maximum number.


# Create an RDD from a Python list
numbers = [3, 8, 2, 10, 6]
rdd = sc.parallelize(numbers)  # Convert list to RDD

# Find the maximum value using reduce
max_num = rdd.reduce(lambda a, b: a if a > b else b)  
# Compare elements pairwise, keep the larger one

# Print the maximum value
print("Maximum number:", max_num)

Maximum number: 10


In [11]:
##4 Load covid-dataset/covid-data.csv, filter for Afghanistan (iso_code = 'AFG'), 
    # and compute total new_cases.


# Load CSV file into an RDD (each line is a string)
rdd = sc.textFile("covid-dataset/covid-data.csv")  

# Get the header row
header = rdd.first()  

# Filter out header and keep only Afghanistan data (iso_code == 'AFG')
afg_data = rdd.filter(lambda row: row != header and row.startswith("AFG"))  

# Extract 'new_cases' column (6th column) and convert to float
new_cases = afg_data.map(lambda row: float(row.split(',')[5] or 0.0))  

# Compute total new cases
total_cases = new_cases.sum()  

# Display total new cases
print(f"Total new cases for Afghanistan: {total_cases}")

Total new cases for Afghanistan: 235214.0


In [12]:
##5 Create an RDD from [1, 2, 3, 4, 5], compute factorials, and sum them.

import math

# Create an RDD from a Python list
numbers = [1, 2, 3, 4, 5]
rdd = sc.parallelize(numbers)  # parallelize converts list to RDD

# Compute factorial of each number and sum them
factorial_sum = (
    rdd
    .map(lambda x: math.factorial(x))  # map: compute factorial of each element
    .reduce(lambda a, b: a + b)       # reduce: sum all factorials
)

# Print the result
print("Sum of factorials:", factorial_sum)


Sum of factorials: 153


In [13]:
##6 Create an RDD from [1, 2, 3, 4, 5, 6, 7], filter odd numbers, cube them, and compute the sum.


numbers = [1, 2, 3, 4, 5, 6, 7]
rdd = sc.parallelize(numbers)

# Keep only odd numbers, cube them, and sum the results
result = (
    rdd
    .filter(lambda x: x % 2 != 0)   # Odd numbers only
    .map(lambda x: x**3)            # Cube each
    .reduce(lambda a, b: a + b)     # Sum all cubes
)

print("Sum of cubes of odd numbers:", result) 


Sum of cubes of odd numbers: 496


In [14]:
##7 Load covid-dataset/covid-data.csv, group by continent, and count records per continent.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()  # Get header
counts = (rdd
          .filter(lambda row: row != header)              # Skip header
          .map(lambda row: (row.split(',')[1], 1))        # (continent, 1)
          .reduceByKey(lambda a, b: a + b))               # Sum per continent

print(counts.collect())

[('', 26525), ('Europe', 91031), ('Africa', 95419), ('North America', 68638), ('South America', 23440), ('Asia', 84199), ('Oceania', 40183)]


In [17]:
##8 Create RDDs: [('S1', 'Alice'), ('S2', 'Bob'), ('S3', 'Charlie')] and [('S1', 85), ('S2', 90),
    #('S4', 95)]. Join on student ID and collect results.


students = sc.parallelize([('S1','Alice'), ('S2','Bob'), ('S3','Charlie')])  # RDD of student IDs & names
grades = sc.parallelize([('S1',85), ('S2',90), ('S4',95)])                 # RDD of student IDs & grades

joined_rdd = students.join(grades)  # Join on student ID (key)

print(joined_rdd.collect())  # Collect joined RDD

[('S2', ('Bob', 90)), ('S1', ('Alice', 85))]


In [21]:
##9 Load covid-dataset/covid-data.csv, filter for Brazil (iso_code = 'BRA'), cache, 
    # and compute average new_deaths.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()                               # Get header
bra_data = rdd.filter(lambda row: row != header and row.startswith("BRA")).cache()  # Filter Brazil, cache

new_deaths = bra_data.map(lambda row: float(row.split(',')[6] or 0.0))  # Extract new_deaths column
count = new_deaths.count()                                             # Count rows
total = new_deaths.sum()                                               # Sum of new_deaths
average = total / count if count > 0 else 0.0                          # Compute average

print(f"Average new deaths for Brazil: {average}")

Average new deaths for Brazil: 22408.555053763415


In [23]:
##10 Create an RDD from [('apple', 5), ('banana', 2), ('orange', 8), ('apple', 3)].
    # Sum values by key and sort by value descending.

data = [('apple',5), ('banana',2), ('orange',8), ('apple',3)]
rdd = sc.parallelize(data)                        # Create RDD of key-value pairs

summed_rdd = rdd.reduceByKey(lambda a, b: a + b)  # Sum values by key
sorted_rdd = summed_rdd.sortBy(lambda x: x[1], ascending=False)  # Sort by value descending

print(sorted_rdd.collect())  # Collect sorted RDD

[('apple', 8), ('orange', 8), ('banana', 2)]
