In [None]:
sc

In [None]:
spark


#### 1. Create new spark session

In [None]:
sc.stop()

In [None]:
from pyspark import SparkConf, SparkContext
#setMaster() = Set Spark Content Manager which is local[cpu cores]
config = SparkConf().setMaster('local[2]').setAppName("RDDSession")
sc = SparkContext(conf = config)

In [None]:
sc

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SQLSession').getOrCreate()

#### 2. Create RDD
     * Create RDD using sc.parellelize() using collections like numpy array ,lists, tuples etc.

In [None]:
rdd1 = sc.parallelize([10,20,30,40,50,60,70,80,90])

In [None]:
type(rdd1)

In [None]:
print(rdd1.collect())

In [None]:
print((rdd1.take(5)))    #take(num of values as per the input count)

In [None]:
print("Total Count of RDD :", rdd1.count())       #Returns total count of values present in the RDD

In [None]:
rdd1.getNumPartitions()      # Number of RDD Partitions = Number of Workers

#### 3. Narrow Transformation

In [None]:
#Map Transformation - Applies to all individual values
rdd2 = rdd1.map(lambda val : val * 3)

In [None]:
#call action() method - collect() or take()
rdd2.collect()

In [None]:
rdd3 = rdd1.map(lambda val : str(val) + "Number")
rdd3.collect()

In [None]:
#Filter method - Narow Transformation which applies condition to each element of RDD
#All transformation method like map(), filter() expect a function as an Input
rdd4 = rdd1.filter(lambda x : x <= 70) 
rdd4.collect()

In [None]:
#Create an RDD using range()
rdd5 = sc.parallelize(range(1,30))
print(rdd5.take(10))

In [None]:
rdd6 = rdd5.filter(lambda x: x % 2 == 0).map(lambda x: str(x) + 'Even')
print(rdd6.collect())

In [None]:
city_rdd = sc.parallelize(["Delhi,Kolkata, Kochi,Vizag,Varkala,Chennai,Banglore,Pune",
                         "Dubai, New York,Berlin,Noida,Banglore,Vizag",
                         "Venice,Dehradun,Munnar,Mumbai,Kochi,Kottayam"])
city_rdd.collect()

In [None]:
city_names = city_rdd.map(lambda val : val.split(','))
print(city_names.collect())

In [None]:
city_names2 = city_names.flatMap(lambda val : val)
city_names2.collect()

In [None]:
# Union - Union of Two RDDs
rdd7  = rdd1.filter(lambda val : val > 20 and val <= 70)
uniRDD = rdd1.union(rdd7)
uniRDD.collect()

#### Task : Create RDD using city_names and generate City_name starting with 'B' or 'K'

In [None]:
city_names_3 = city_names2.filter(lambda val : val[0].lower() == 'k' or val[0].lower() == 'b')
city_names_3.collect()

In [None]:
city_names3 = city_names2.filter(lambda val : val.startswith('K') or val.startswith('B'))
city_names3.collect()

In [None]:
#distinct = returns new RDD with distinct elements
distinct_city = city_names2.distinct()
print(distinct_city.collect())

In [None]:
#simple (withReplacement ,fraction, seed)
#samples a fraction of data from a RDD with replacement
sample_rdd = city_names2.sample(True, 0.5, seed=11)     #withReplacement
sample_rdd.collect()

In [None]:
sample_rdd = city_names2.sample(False, 0.5, seed=11)      #withoutReplacement
sample_rdd.collect()

#### 4. Wide Transformation

In [None]:
city_names_3.getNumPartitions()

In [128]:
pairs = [('Trivandrum',24), ('Chennai',30),('Mumbai',21),('Kochi',12),('Chennai',20),('Trivandrum',15),('Delhi',25)
        ,('Delhi',12),('Kottayam',60)]
pairs_rdd = sc.parallelize(pairs)
pairs_rdd.collect()

[('Trivandrum', 24),
 ('Chennai', 30),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Chennai', 20),
 ('Trivandrum', 15),
 ('Delhi', 25),
 ('Delhi', 12),
 ('Kottayam', 60)]

In [None]:
pairs_rdd.getNumPartitions()

In [None]:
#repartition() - To create more number of partition
new_pair_rdd = pairs_rdd.repartition(4)
new_pair_rdd.getNumPartitions()

In [127]:
#coalesce - Reduces the number of RDD Partirtions
new_pair_rdd1 = new_pair_rdd.coalesce(2)
new_pair_rdd1.getNumPartitions()

2

In [None]:
#ReduceByKey - Aggrregate each value of RDD by keys . It does involves shuffling of values to ensure that all values
#
pairs_rdd2 = pairs_rdd.reduceByKey(lambda x,_:x + _ )
pairs_rdd2.collect()

In [129]:
orders = [('Fries',2),('Nuggets',3),('PaniPuri',5),('Chole Bhature',2),('Dosa',1)
         ,('Dosa',1),('PaniPuri',5),('Vada',2),('PaniPuri',5),('Fries',2),('Chole Bhature',2)]
orders_rdd = sc.parallelize(orders)
orders_rdd.reduceByKey(lambda x,y : x + y).collect()

[('Nuggets', 3),
 ('Chole Bhature', 4),
 ('Dosa', 2),
 ('Vada', 2),
 ('Fries', 4),
 ('PaniPuri', 15)]

In [None]:
#Intersection : Method to scan all rdd and find common among all partitions of RDD
rdd8 = sc.parallelize(['Delhi','Kolkata','Banglore','Vizag','Varkala','Chennai'])
rdd9 = sc.parallelize(['Delhi','Pune','Trivandrum','Banglore','Vizag','Mumbai'])
rdd8.intersection(rdd9).collect()

In [None]:
#sc.parallelize(collections, number of partitions)
rdd10 = sc.parallelize(range(1,50),4)
rdd10.getNumPartitions()

In [None]:
print(rdd10.collect())

In [None]:
def partition_func(iterator):
    return [sum(iterator)]

In [None]:
partitioned_rdd = rdd10.mapPartitions(partition_func)
partitioned_rdd.collect()

In [None]:
pairs = [('Trivandrum',24), ('Chennai',30),('Mumbai',21),('Kochi',12),('Chennai',20),('Trivandrum',15),('Delhi',25)
        ,('Delhi',12),('Kottayam',60)]
pairs_rdd = sc.parallelize(pairs)

In [None]:
pairs_rdd.groupByKey().collect()

In [None]:
#groupByKey - Groups data by key , which involves shuffling all values for each key.
grouped_rdd = pairs_rdd.groupByKey().mapValues(list)
grouped_rdd.collect()

In [None]:
pairs_rdd.groupByKey().mapValues(len).collect()


In [None]:
#Join() - Joins two RDD based on their keys which requires shuffling tghe data to ensure all keys are correctly paired.
order1 = sc.parallelize([('Fries',2),('Nuggets',3),('PaniPuri',5),('Chole Bhature',2),('Dosa',1)])
order2 = sc.parallelize([('Dosa',1),('PaniPuri',5),('Vada',2),('Juice',5),('Fries',2),('Chole Bhature',2)])

In [None]:
#join will scan all partitioned rdd to find common keys
joined_rdd = order1.join(order2)
joined_rdd.collect()

In [None]:
order1.leftOuterJoin(order2).collect()

In [None]:
order1.rightOuterJoin(order2).collect()

In [None]:
order1.fullOuterJoin(order2).collect()

In [None]:
#Groups data from two rdd by key, resulting ionto new RDD with keys and values grouped from both RDD.
results = order1.cogroup(order2).collect()
for key, (values1,values2) in results:
    print(f"{key}, {list(values1)}, {list(values2)}")

#### RDD using FileName()

In [None]:
# load localFS use 'file://path_to_filename'
emp_RDD = sc.textFile("file:///home/hadoop/Downloads/Employee_Advance.csv")
emp_RDD.take(6)

In [None]:
emp_RDD1 = emp_RDD.map(lambda row: row.split(','))
emp_RDD1.collect()

#### 1)Show all employees working in the Department = "Business Development"

In [151]:
emp_BD = emp_RDD1.filter(lambda val: val[5] == "Business Development")
#Returns first 5 rows of List of Employees working in Business Development
emp_BD.take(5)

[['3',
  'Siward',
  'Struijs',
  'sstruijs2@wikia.com',
  'Male',
  'Business Development',
  '432657',
  'Armenian',
  'London',
  'United States'],
 ['6',
  'Tova',
  'Manclark',
  'tmanclark5@army.mil',
  'Female',
  'Business Development',
  '659039',
  'Hiri Motu',
  'Le Mans',
  'France'],
 ['9',
  'Warde',
  'Stenett',
  'wstenett8@altervista.org',
  'Genderqueer',
  'Business Development',
  '807769',
  'Moldovan',
  'Bastia',
  'France'],
 ['17',
  'Bern',
  'Lafond',
  'blafondg@amazon.de',
  'Non-binary',
  'Business Development',
  '933737',
  'Greek',
  'San Antonio',
  'United States'],
 ['19',
  'Hillery',
  'Costall',
  'hcostalli@rakuten.co.jp',
  'Genderfluid',
  'Business Development',
  '736463',
  'Norwegian',
  'Clermont-Ferrand',
  'France']]

#### 2)Count total number of Employees in Organization

In [162]:
emp_RDD1.count()


1000

#### 3)Find Average Income of each Department

In [168]:
emp_Dep = emp_RDD1.map(lambda val: (val[5],val[6]))
emp_total = emp_Dep.groupByKey().mapValues(len)
emp_sum = emp_Dep.reduceByKey(lambda a1,a2: int(a1)+int(a2)).join(emp_total)
emp_sum.map(lambda x : (x[0],x[1][0]/x[1][1])).collect()

[('Accounting', 814474.3333333334),
 ('Human Resources', 772824.3829787234),
 ('Legal', 754482.0),
 ('Services', 765440.4444444445),
 ('Sales', 745501.5921052631),
 ('Product Management', 696304.2985074627),
 ('Support', 825695.0947368421),
 ('Business Development', 783081.2710280374),
 ('Research and Development', 789849.1486486486),
 ('Training', 757752.380952381),
 ('Marketing', 744170.7831325302),
 ('Engineering', 757094.5466666666)]

In [171]:
#Alternative
emp_Dep = emp_RDD1.map(lambda val: (val[5],int(val[6])))
emp_total = emp_Dep.groupByKey().mapValues(list).collect()
for i in emp_total:
    print(i[0], sum(i[1])/len(i[1]))

Accounting 814474.3333333334
Human Resources 772824.3829787234
Legal 754482.0
Services 765440.4444444445
Sales 745501.5921052631
Product Management 696304.2985074627
Support 825695.0947368421
Business Development 783081.2710280374
Research and Development 789849.1486486486
Training 757752.380952381
Marketing 744170.7831325302
Engineering 757094.5466666666


#### 4)Count the number of Employees working in each department

In [175]:
emp_Dep = emp_RDD1.map(lambda val: (val[5],1)).groupByKey().mapValues(len)
emp_Dep.collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [133]:
#Alternative
emp_RDD1.groupBy(lambda row : row[5]).mapValues(len).collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

#### 5) List of all unique titles

In [172]:
#distinct - wide transformation
emp_RDD1.map(lambda row : row[5]).distinct().collect()

['Accounting',
 'Human Resources',
 'Legal',
 'Services',
 'Sales',
 'Product Management',
 'Support',
 'Business Development',
 'Research and Development',
 'Training',
 'Marketing',
 'Engineering']

#### 6) Count all the employees whose first letter starts with "A"

In [174]:
emp_RDD1.filter(lambda x: x[1].startswith('A')).count()

81

#### 7) WordCount Example using Spark RDD

In [176]:
!hdfs dfs -ls /

Found 3 items
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:25 /hbase
drwxrwxrwx   - hadoop supergroup          0 2022-11-21 15:12 /tmp
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:11 /user


In [177]:
!hdfs dfs -mkdir /wordcount

In [186]:
!hdfs dfs -put /home/hadoop/Downloads/Harry_Potter_and_the_Deathly_Hallows.txt /wordcount/

In [188]:
rdd = sc.textFile('/wordcount/Harry_Potter_and_the_Deathly_Hallows.txt')
rdd_word1 = rdd.flatMap(lambda line : line.split(" "))
rdd_word2 = rdd_word1.map(lambda word : (word.lower(),1))
rdd_word3 = rdd_word2.reduceByKey(lambda x, y: x + y)
rdd_word4 = rdd_word3.filter(lambda val : val[1] > 10).sortBy(lambda val : val[1], ascending = False)
rdd_word4.collect()

[('the', 10280),
 ('and', 5283),
 ('to', 4843),
 ('of', 4121),
 ('he', 3845),
 ('a', 3523),
 ('was', 2681),
 ('his', 2602),
 ('in', 2159),
 ('had', 1990),
 ('harry', 1867),
 ('it', 1859),
 ('that', 1813),
 ('said', 1676),
 ('you', 1485),
 ('as', 1409),
 ('at', 1401),
 ('i', 1275),
 ('with', 1127),
 ('they', 1091),
 ('not', 1083),
 ('on', 984),
 ('for', 980),
 ('but', 975),
 ('she', 918),
 ('her', 864),
 ('—', 841),
 ('from', 836),
 ('have', 784),
 ('…', 756),
 ('were', 738),
 ('be', 737),
 ('him', 666),
 ('into', 655),
 ('hermione', 653),
 ('out', 650),
 ('could', 627),
 ('all', 583),
 ('ron', 578),
 ('what', 571),
 ('been', 567),
 ('“i', 535),
 ('up', 510),
 ('we', 498),
 ('there', 493),
 ('who', 490),
 ('is', 484),
 ('—”', 472),
 ('their', 465),
 ('did', 451),
 ('would', 443),
 ('them', 442),
 ('like', 424),
 ('if', 424),
 ('looked', 419),
 ('by', 419),
 ('harry,', 414),
 ('back', 409),
 ('so', 403),
 ('over', 383),
 ('this', 382),
 ('an', 381),
 ('then', 365),
 ('one', 364),
 ('wand

#### 8) Count the number of Employees working in each city and Sort City by Employees Count in Desc Order.

In [195]:
city_RDD = emp_RDD1.map(lambda x : (x[-2],1))
count = city_RDD.groupByKey().mapValues(len).sortBy(lambda val : val[1], ascending = False).collect()
count

[('Washington', 13),
 ('Berlin', 13),
 ('Lyon', 13),
 ('Cincinnati', 9),
 ('Dallas', 9),
 ('Cergy-Pontoise', 9),
 ('Sacramento', 8),
 ('Strasbourg', 8),
 ('Lille', 7),
 ('Dijon', 7),
 ('Los Angeles', 7),
 ('Pittsburgh', 7),
 ('Bordeaux', 7),
 ('San Jose', 7),
 ('Denver', 7),
 ('El Paso', 7),
 ('Montpellier', 7),
 ('Chicago', 6),
 ('Tulsa', 6),
 ('Quimper', 6),
 ('Marseille', 6),
 ('New York City', 6),
 ('Pau', 6),
 ('Poitiers', 6),
 ('Montgomery', 6),
 ('Le Mans', 5),
 ('Avignon', 5),
 ('Seattle', 5),
 ('Marne-la-Vallée', 5),
 ('Limoges', 5),
 ('Oakland', 5),
 ('Fort Worth', 5),
 ('Houston', 5),
 ('San Diego', 5),
 ('Philadelphia', 5),
 ('Paris La Défense', 5),
 ('Rungis', 5),
 ('Perpignan', 5),
 ('Rouen', 5),
 ('Caen', 5),
 ('Minneapolis', 5),
 ('Nantes', 5),
 ('Orléans', 5),
 ('Nürnberg', 4),
 ('Mobile', 4),
 ('Aix-en-Provence', 4),
 ('Angoulême', 4),
 ('Charlotte', 4),
 ('Düsseldorf', 4),
 ('Topeka', 4),
 ('Kansas City', 4),
 ('Roissy Charles-de-Gaulle', 4),
 ('Saint Petersburg', 4)

#### Saving Data using RDD

In [196]:
rdd_word4.getNumPartitions()

2

In [197]:
#Saving RDD to local filesystem
rdd_word4.saveAsTextFile('file:///home/hadoop/Downloads/output/')

In [198]:
#Saving RDD to local filesystem
rdd_word4.saveAsTextFile('wordcount/output/')

#### RDD - Statistical Function

In [202]:
rdd_city = emp_RDD1.map(lambda row :(row[5],int(row[6]))) 
rdd_city.min(lambda x : x[1])

('Training', 12727)

In [203]:
rdd_city.max(lambda x : x[1])

('Support', 1496924)

In [208]:
rdd_11 = sc.parallelize(range(1,10)).sample(False, 0.3, seed = 1)
rdd_11.min()

6

In [209]:
rdd_11.max()

9

In [212]:
#Measures dIstribution of Samples data, and sample deviation from mean value
rdd_city.map(lambda x: x[1]).stdev()

424405.95187845733

In [213]:
rdd_city.map(lambda x: x[1]).variance()

180120411989.85944

In [214]:
rdd_city.map(lambda x: x[1]).mean()

770051.4190000002

In [215]:
#Total addition of Sample
rdd_city.map(lambda x: x[1]).sum()

770051419

In [216]:
rdd_city.map(lambda x: x[1]).stats()

(count: 1000, mean: 770051.4190000002, stdev: 424405.95187845733, max: 1496924, min: 12727)

####  RDD Persistance and Storage Levels

In [217]:
emp_RDD.cache()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[231] at textFile at NativeMethodAccessorImpl.java:0

In [219]:
from pyspark.storagelevel import StorageLevel
#persist the RDD in memory with default storage level(MEMORY ONLY)
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_ONLY)

In [222]:
emp_RDD1 = emp_RDD.map(lambda row : row.split(','))
emp_Map = emp_RDD1.map(lambda row: (row[5],1))
emp_Map.reduceByKey(lambda x, y : x + y).collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [223]:
emp_RDD.unpersist()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[231] at textFile at NativeMethodAccessorImpl.java:0

In [224]:
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_AND_DISK_SER)