In [1]:
from pyspark.sql import SparkSession

In [2]:
# Attach to the active Spark session if one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Distribute the integers 1..6 as an RDD.
rdd = spark.sparkContext.parallelize(list(range(1, 7)))

In [4]:
# Fetch every element of the RDD as a Python list and display it.
rdd.collect()

[1, 2, 3, 4, 5, 6]

In [5]:
# map: apply the function to every element and keep each result (here, the square).
rdd1 = rdd.map(lambda value: value * value)
rdd1.collect()

[1, 4, 9, 16, 25, 36]

In [6]:
# filter: build a new RDD holding only the elements for which the predicate
# is true (here, the squares above 10).
rdd2 = rdd1.filter(lambda square: square > 10)
rdd2.collect()

[16, 25, 36]

In [7]:
# Square every element, then keep the squares above 10, as one chained pipeline.
rdd3 = (
    rdd
    .map(lambda value: value * value)
    .filter(lambda square: square > 10)
)
rdd3.collect()

[16, 25, 36]

In [8]:
# Rebind `rdd` to an RDD of full-name strings.
rdd = spark.sparkContext.parallelize(
    [
        'Chaitesh Sharp',
        'Manasa Sharp',
    ]
)
rdd.collect()


['Chaitesh Sharp', 'Manasa Sharp']

In [9]:
# Print the collected elements, one per line.
for name in rdd.collect():
    print(name)

Chaitesh Sharp
Manasa Sharp


In [10]:
# map yields exactly one output per input, so each name becomes its own
# list of words.
rdd1 = rdd.map(lambda full_name: full_name.split(' '))
for parts in rdd1.collect():
    print(parts)

['Chaitesh', 'Sharp']
['Manasa', 'Sharp']


In [11]:
# flatMap splits each name and flattens the pieces into a single RDD of words.
rdd2 = rdd.flatMap(lambda full_name: full_name.split(' '))

# Collect once and reuse the list: each collect() call is a separate Spark
# action, and rdd2 is not cached, so calling it twice would recompute the
# same pipeline twice.
flat_words = rdd2.collect()
for word in flat_words:
    print(word)
flat_words

Chaitesh
Sharp
Manasa
Sharp


['Chaitesh', 'Sharp', 'Manasa', 'Sharp']

In [12]:
# An RDD whose elements are themselves lists of uneven length.
cd = spark.sparkContext.parallelize([
    [1, 2],
    [3, 4],
    [5],
])
cd.collect()

[[1, 2], [3, 4], [5]]

In [13]:
# The identity function hands each inner list to flatMap, which emits its
# items one by one, so the nested lists become a single-level RDD.
rdd1 = cd.flatMap(lambda inner: inner)
rdd1.collect()

[1, 2, 3, 4, 5]

In [14]:
import random

In [15]:
# Draw 50 distinct integers from 1..99.
# A dedicated generator with a fixed seed makes the sample, and everything
# derived from it further down, identical on every Restart & Run All, without
# touching the global `random` state.
SEED = 42
data = random.Random(SEED).sample(range(1, 100), 50)
data[0:5]

[43, 31, 60, 65, 59]

In [16]:
# Distribute the sampled numbers, then peek at the first five only.
rdd = spark.sparkContext.parallelize(data)
rdd.take(5)

[43, 31, 60, 65, 59]

In [17]:
# Shift every number up by 2.
tranform = rdd.map(lambda number: number + 2)
tranform.take(5)

[45, 33, 62, 67, 61]

In [18]:
# Keep only the numbers strictly greater than 50.
filtered = rdd.filter(lambda number: number > 50)
filtered.take(5)

[60, 65, 59, 96, 77]

In [19]:
# Number of elements that passed the filter.
count = filtered.count()

In [20]:
# Display how many elements `filtered` contains.
count

22

In [21]:
# reduce combines the elements pairwise with the given function;
# with addition the outcome is their total.
sum1 = filtered.reduce(lambda left, right: left + right)
sum1

1675

In [22]:
# Mean of the filtered numbers: their total divided by how many there are.
average = sum1 / count
average

76.13636363636364

In [23]:
# A pair RDD: every element is a (key, value) tuple.
rdd = spark.sparkContext.parallelize([
    ('apple', 1),
    ('banana', 2),
    ('orange', 3),
])
rdd.collect()

[('apple', 1), ('banana', 2), ('orange', 3)]

In [24]:
# mapValues applies the function to the value of each pair; keys are unchanged.
rdd1 = rdd.mapValues(lambda qty: qty + 10)
rdd1.collect()

[('apple', 11), ('banana', 12), ('orange', 13)]

In [25]:
# flatMapValues: the function returns several values per input value, and
# each of them is emitted as its own pair under the original key.
rdd_flatmap = rdd.flatMapValues(lambda qty: [qty, qty + 10])
rdd_flatmap.collect()

[('apple', 1),
 ('apple', 11),
 ('banana', 2),
 ('banana', 12),
 ('orange', 3),
 ('orange', 13)]

In [26]:
# With mapValues the returned list is not flattened: each key keeps a single
# list as its value.
rdd_map = rdd.mapValues(lambda qty: [qty] * 2)
rdd_map.collect()

[('apple', [1, 1]), ('banana', [2, 2]), ('orange', [3, 3])]

In [27]:
# Pair RDD in which keys repeat, used for the per-key aggregation below.
rdd = spark.sparkContext.parallelize([
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 1),
])

In [28]:
# Display all (key, value) pairs currently held by `rdd`.
rdd.collect()

[('apple', 1), ('banana', 2), ('apple', 3), ('banana', 1)]

In [29]:
# reduceByKey merges all values that share a key using the given function
# (here, a sum).
rdd_red = rdd.reduceByKey(lambda left, right: left + right)
rdd_red.collect()

[('apple', 4), ('banana', 3)]

In [30]:
# Same pairs as before plus one more ("apple", 1).
# parallelize itself does not shuffle anything; the shuffle happens in the
# per-key operations applied to this RDD afterwards (reduceByKey, groupByKey),
# which move the values of each key together.
rdd = spark.sparkContext.parallelize([
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 1),
    ("apple", 1),
])

In [31]:
# Sum the values of each key in the current `rdd`.
rdd_red = rdd.reduceByKey(lambda left, right: left + right)
rdd_red.collect()

[('apple', 5), ('banana', 3)]

In [32]:
# groupByKey gathers all values of a key into one iterable. Collecting it
# directly only shows the ResultIterable objects, not the values inside them.
gro = rdd.groupByKey()
gro.collect()

[('apple', <pyspark.resultiterable.ResultIterable at 0x2e4d5ee95d0>),
 ('banana', <pyspark.resultiterable.ResultIterable at 0x2e4d5f28790>)]

In [33]:
# Turn each group's iterable into a plain list so its values are visible.
result = gro.mapValues(list)
result.collect()

[('apple', [1, 3, 1]), ('banana', [2, 1])]

In [34]:
# Pair RDD with four distinct keys, used to demonstrate sorting by key.
rdd = spark.sparkContext.parallelize([
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 1),
    ("apple", 1),
    ("orange", 1),
    ('papaya', 2),
])

In [35]:
# Display the pairs in their original (unsorted) order.
rdd.collect()

[('apple', 1),
 ('banana', 2),
 ('apple', 3),
 ('banana', 1),
 ('apple', 1),
 ('orange', 1),
 ('papaya', 2)]

In [36]:
# Sort the pairs by key. Ascending is the default; it is spelled out here
# for contrast with ascending=False, which gives descending order.
result = rdd.sortByKey(ascending=True)
result.collect()

[('apple', 1),
 ('apple', 3),
 ('apple', 1),
 ('banana', 2),
 ('banana', 1),
 ('orange', 1),
 ('papaya', 2)]

In [37]:
# ascending=False sorts the keys in descending order.
result = rdd.sortByKey(ascending=False)
result.collect()

[('papaya', 2),
 ('orange', 1),
 ('banana', 2),
 ('banana', 1),
 ('apple', 1),
 ('apple', 3),
 ('apple', 1)]

In [38]:
# Ten pairs over two keys, used as input for the combineByKey examples.
rdd = spark.sparkContext.parallelize([
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 1),
    ("apple", 2),
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 1),
    ("apple", 2),
])
rdd.collect()

[('apple', 1),
 ('banana', 2),
 ('apple', 3),
 ('banana', 1),
 ('apple', 2),
 ('apple', 1),
 ('banana', 2),
 ('apple', 3),
 ('banana', 1),
 ('apple', 2)]

In [39]:
# combineByKey is driven by three functions:
#   createCombiner - builds the starting combiner from a key's first value
#   mergeValue     - folds one more value into an existing combiner
#   mergeCombiners - merges two combiners of the same key
# All three simply add here, so the result is the per-key total.
combine = rdd.combineByKey(
    createCombiner=lambda first: first,
    mergeValue=lambda total, value: total + value,
    mergeCombiners=lambda total_a, total_b: total_a + total_b,
)
combine.collect()

[('apple', 12), ('banana', 6)]

In [40]:
# Track a (running total, number of values) pair per key so that a mean can
# be derived from it afterwards.
result = rdd.combineByKey(
    createCombiner=lambda value: (value, 1),
    mergeValue=lambda acc, value: (acc[0] + value, acc[1] + 1),
    mergeCombiners=lambda acc_a, acc_b: (acc_a[0] + acc_b[0], acc_a[1] + acc_b[1]),
)
result.collect()

[('apple', (12, 6)), ('banana', (6, 4))]

In [41]:
# Divide each key's total by its count to get the per-key mean.
final_result = result.mapValues(lambda total_count: total_count[0] / total_count[1])
final_result.collect()

[('apple', 2.0), ('banana', 1.5)]

In [43]:
# (user id, action) events in the order they were recorded.
activity = [
    ("U1", "Logged in"),
    ("U2", "Viewed product"),
    ("U1", "Added product to cart"),
    ("U3", "Logged in"),
    ("U2", "Checked out"),
    ("U1", "Made a purchase"),
]

rdd = spark.sparkContext.parallelize(activity)
rdd.collect()

[('U1', 'Logged in'),
 ('U2', 'Viewed product'),
 ('U1', 'Added product to cart'),
 ('U3', 'Logged in'),
 ('U2', 'Checked out'),
 ('U1', 'Made a purchase')]

In [45]:
# Build one comma-separated string of actions per user: start from the
# user's first action, then append each further action (or partial string).
result = rdd.combineByKey(
    createCombiner=lambda action: action,
    mergeValue=lambda joined, action: ','.join((joined, action)),
    mergeCombiners=lambda joined_a, joined_b: ','.join((joined_a, joined_b)),
)
result.collect()

[('U3', 'Logged in'),
 ('U1', 'Logged in,Added product to cart,Made a purchase'),
 ('U2', 'Viewed product,Checked out')]

In [46]:
# first() returns only the first element of the RDD.
rdd = spark.sparkContext.parallelize(list(range(1, 7)))
rdd.first()

1