In [8]:
##1 Create an RDD from [5, 10, 15, 20, 25] and compute the average value.

from pyspark import SparkContext

# Initialize Spark context
sc = SparkContext.getOrCreate()

# Create an RDD from a Python list
numbers = [5, 10, 15, 20, 25]
rdd = sc.parallelize(numbers)  # parallelize converts list to RDD

# Compute the sum of all numbers using reduce
total_sum = rdd.reduce(lambda x, y: x + y)  # reduce applies the lambda cumulatively

# Count the total number of elements in the RDD
count = rdd.count() 

# Compute the average
average = total_sum / count

# Print the result
print("Average:", average)  # Output: Average: 15.0

Average: 15.0


In [9]:
##2 Create an RDD from [1, 2, 2, 3, 4, 4, 5] and count distinct numbers.


# Create an RDD from a Python list
numbers = [1, 2, 2, 3, 4, 4, 5]
rdd = sc.parallelize(numbers)  # Convert list to RDD

# Get distinct values and count them
distinct_count = rdd.distinct().count()

# distinct() removes duplicates
# count() returns total unique elements

# Print the result
print("Number of distinct values:", distinct_count)

Number of distinct values: 5


In [10]:
##3 Create an RDD from [3, 8, 2, 10, 6] and find the maximum number.


# Create an RDD from a Python list
numbers = [3, 8, 2, 10, 6]
rdd = sc.parallelize(numbers)  # Convert list to RDD

# Find the maximum value using reduce
max_num = rdd.reduce(lambda a, b: a if a > b else b)  
# Compare elements pairwise, keep the larger one

# Print the maximum value
print("Maximum number:", max_num)

Maximum number: 10


In [11]:
##4 Load covid-dataset/covid-data.csv, filter for Afghanistan (iso_code = 'AFG'), 
    # and compute total new_cases.


# Load CSV file into an RDD (each line is a string)
rdd = sc.textFile("covid-dataset/covid-data.csv")  

# Get the header row
header = rdd.first()  

# Filter out header and keep only Afghanistan data (iso_code == 'AFG')
afg_data = rdd.filter(lambda row: row != header and row.startswith("AFG"))  

# Extract 'new_cases' column (6th column) and convert to float
new_cases = afg_data.map(lambda row: float(row.split(',')[5] or 0.0))  

# Compute total new cases
total_cases = new_cases.sum()  

# Display total new cases
print(f"Total new cases for Afghanistan: {total_cases}")

Total new cases for Afghanistan: 235214.0


In [12]:
##5 Create an RDD from [1, 2, 3, 4, 5], compute factorials, and sum them.

import math

# Create an RDD from a Python list
numbers = [1, 2, 3, 4, 5]
rdd = sc.parallelize(numbers)  # parallelize converts list to RDD

# Compute factorial of each number and sum them
factorial_sum = (
    rdd
    .map(lambda x: math.factorial(x))  # map: compute factorial of each element
    .reduce(lambda a, b: a + b)       # reduce: sum all factorials
)

# Print the result
print("Sum of factorials:", factorial_sum)


Sum of factorials: 153


In [13]:
##6 Create an RDD from [1, 2, 3, 4, 5, 6, 7], filter odd numbers, cube them, and compute the sum.


numbers = [1, 2, 3, 4, 5, 6, 7]
rdd = sc.parallelize(numbers)

# Keep only odd numbers, cube them, and sum the results
result = (
    rdd
    .filter(lambda x: x % 2 != 0)   # Odd numbers only
    .map(lambda x: x**3)            # Cube each
    .reduce(lambda a, b: a + b)     # Sum all cubes
)

print("Sum of cubes of odd numbers:", result) 


Sum of cubes of odd numbers: 496


In [14]:
##7 Load covid-dataset/covid-data.csv, group by continent, and count records per continent.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()  # Get header
counts = (rdd
          .filter(lambda row: row != header)              # Skip header
          .map(lambda row: (row.split(',')[1], 1))        # (continent, 1)
          .reduceByKey(lambda a, b: a + b))               # Sum per continent

print(counts.collect())

[('', 26525), ('Europe', 91031), ('Africa', 95419), ('North America', 68638), ('South America', 23440), ('Asia', 84199), ('Oceania', 40183)]


In [17]:
##8 Create RDDs: [('S1', 'Alice'), ('S2', 'Bob'), ('S3', 'Charlie')] and [('S1', 85), ('S2', 90),
    #('S4', 95)]. Join on student ID and collect results.


students = sc.parallelize([('S1','Alice'), ('S2','Bob'), ('S3','Charlie')])  # RDD of student IDs & names
grades = sc.parallelize([('S1',85), ('S2',90), ('S4',95)])                 # RDD of student IDs & grades

joined_rdd = students.join(grades)  # Join on student ID (key)

print(joined_rdd.collect())  # Collect joined RDD

[('S2', ('Bob', 90)), ('S1', ('Alice', 85))]


In [21]:
##9 Load covid-dataset/covid-data.csv, filter for Brazil (iso_code = 'BRA'), cache, 
    # and compute average new_deaths.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()                               # Get header
bra_data = rdd.filter(lambda row: row != header and row.startswith("BRA")).cache()  # Filter Brazil, cache

new_deaths = bra_data.map(lambda row: float(row.split(',')[6] or 0.0))  # Extract new_deaths column
count = new_deaths.count()                                             # Count rows
total = new_deaths.sum()                                               # Sum of new_deaths
average = total / count if count > 0 else 0.0                          # Compute average

print(f"Average new deaths for Brazil: {average}")

Average new deaths for Brazil: 22408.555053763415


In [23]:
##10 Create an RDD from [('apple', 5), ('banana', 2), ('orange', 8), ('apple', 3)].
    # Sum values by key and sort by value descending.

data = [('apple',5), ('banana',2), ('orange',8), ('apple',3)]
rdd = sc.parallelize(data)                        # Create RDD of key-value pairs

summed_rdd = rdd.reduceByKey(lambda a, b: a + b)  # Sum values by key
sorted_rdd = summed_rdd.sortBy(lambda x: x[1], ascending=False)  # Sort by value descending

print(sorted_rdd.collect())  # Collect sorted RDD

[('apple', 8), ('orange', 8), ('banana', 2)]


In [25]:
##11 Load covid-dataset/covid-data.csv, filter for 2020, and count records.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()                               # Get header
data_2020 = rdd.filter(lambda row: row != header and row.split(',')[3].startswith('2020'))  # Filter 2020

count = data_2020.count()  # Count rows

print(f"Number of records in 2020: {count}")

Number of records in 2020: 90982


In [29]:
##12 Load covid-dataset/covid-data.csv, find max total_cases per country, and get top 5.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()                               # Get header
cases = rdd.filter(lambda row: row != header)     \
           .map(lambda row: (row.split(',')[2], float(row.split(',')[4] or 0.0)))  # (country, total_cases)

max_cases = cases.reduceByKey(lambda a, b: a if a > b else b)  # Max cases per country
top_5 = max_cases.top(5, key=lambda x: x[1])                   # Top 5 countries by cases

print(top_5)

[('World', 775866783.0), ('High-income countries', 429044049.0), ('Asia', 301499099.0), ('Europe', 252916868.0), ('Upper-middle-income countries', 251753518.0)]


In [30]:
##13 Create RDDs from 'spark is awesome and spark is fast' and count word frequencies.


text1 = sc.parallelize("spark is awesome".split())  # RDD from first text
text2 = sc.parallelize("spark is fast".split())     # RDD from second text

combined_rdd = text1.union(text2)                   # Combine both RDDs
word_counts = combined_rdd.map(lambda w: (w, 1))   \
                          .reduceByKey(lambda a, b: a + b)  # Count occurrences

print(word_counts.collect())

[('is', 2), ('spark', 2), ('fast', 1), ('awesome', 1)]


In [38]:
##14 Load covid-dataset/covid-data.csv, count rows with empty or non-numeric new_cases.


rdd = sc.textFile("covid-dataset/covid-data.csv")  # Load CSV
header = rdd.first()                               # Get header

# Filter rows with empty or non-numeric 'new_cases' (6th column)
invalid_cases = rdd.filter(lambda row: row != header) \
                   .filter(lambda row: not row.split(',')[5] or not row.split(',')[5].replace('.', '').replace('-', '').isdigit())

count = invalid_cases.count()  # Count invalid rows

print(f"Number of rows with missing or non-numeric new_cases: {count}")

Number of rows with missing or non-numeric new_cases: 19276


In [41]:
##15 Create an RDD from ["Spark is fun", "RDDs are powerful", "Spark uses RDDs"], get distinct words.

sentences = ["Spark is fun", "RDDs are powerful", "Spark uses RDDs"]
rdd = sc.parallelize(sentences)              # Create RDD from list of sentences
words = rdd.flatMap(lambda s: s.split()).distinct()    # Split sentences into words and get unique words

print(words.collect())

['is', 'are', 'Spark', 'uses', 'powerful', 'fun', 'RDDs']


In [42]:
##16 Create an RDD from [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], compute sum and count.


numbers = [1,2,3,4,5,6,7,8,9,10]
rdd = sc.parallelize(numbers)  

# Aggregate to compute sum and count in one pass
result = rdd.aggregate(
    (0,0),                                           # Initial value (sum, count)
    lambda acc, v: (acc[0]+v, acc[1]+1),            # SeqOp: update sum & count per partition
    lambda acc1, acc2: (acc1[0]+acc2[0], acc1[1]+acc2[1])  # CombOp: merge partitions
)

print(f"Sum: {result[0]}, Count: {result[1]}")

Sum: 55, Count: 10


In [43]:
##17 Create RDDs: [('a', 1), ('b', 2), ('a', 3)] and [('a', 'x'), ('b', 'y'), ('c', 'z')], use cogroup.


rdd1 = sc.parallelize([('a',1), ('b',2), ('a',3)])   # First RDD
rdd2 = sc.parallelize([('a','x'), ('b','y'), ('c','z')])  # Second RDD

cogrouped = rdd1.cogroup(rdd2)                       # Group values by key from both RDDs
result = cogrouped.mapValues(lambda x: (list(x[0]), list(x[1])))  # Convert iterables to lists

print(result.collect())

[('b', ([2], ['y'])), ('c', ([], ['z'])), ('a', ([1, 3], ['x']))]


In [45]:
##18 Load covid-dataset/covid-data.csv, take 1% sample with replacement, count records.



rdd = sc.textFile("covid-dataset/covid-data.csv")            # Load CSV
header = rdd.first()                             # Get header
data = rdd.filter(lambda row: row != header)    # Skip header

sampled = data.sample(True, 0.01)               # Take 1% sample with replacement
count = sampled.count()                          # Count sampled rows

print(f"Sampled records count: {count}")

Sampled records count: 4217


In [46]:
##19 Create RDDs: [1, 2, 3] and ['a', 'b'], compute Cartesian product.


rdd1 = sc.parallelize([1, 2, 3])      # First RDD
rdd2 = sc.parallelize(['a', 'b'])    # Second RDD

cartesian_rdd = rdd1.cartesian(rdd2)  # Compute Cartesian product of two RDDs

print(cartesian_rdd.collect())

[(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b'), (3, 'a'), (3, 'b')]


In [47]:
##20 Create an RDD from [('a', '1 2 3'), ('b', '4 5')], split values with flatMapValues.


data = [('a','1 2 3'), ('b','4 5')]
rdd = sc.parallelize(data)                             # Create RDD of key-value pairs

split_rdd = rdd.flatMapValues(lambda v: v.split())     # Split values into multiple key-value pairs

print(split_rdd.collect())

[('a', '1'), ('a', '2'), ('a', '3'), ('b', '4'), ('b', '5')]
