In [1]:
sc


In [2]:
spark

In [3]:
# Load the HR employee CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
hr_employee = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/hadoop/Downloads/HR_Employee.csv")
)

In [4]:
hr_employee.printSchema()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- Hourlyrate: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Salaryhike: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Workex: integer (nullable = true)
 |-- YearsSinceLastPromotion: integer (nullable = true)
 |-- EmpSatisfaction: string (nullable = true)
 |-- TrainingTimesLastYear: integer (nullable = true)
 |-- WorkLifeBalance: string (nullable = true)
 |-- Performance_Rating: string (nul

#### 1.BigData File Types
    * Parquet File Format - Records are stored in a columnar format. Converting a structured dataset such as a .csv file to Parquet compresses it, and the columnar layout makes Parquet well suited to query-type workloads.
    * AVRO File Format - Row-based, with fast reads and writes. Widely used for serialization; it stores the schema in JSON format. Compression is optional (block-level codecs such as deflate or snappy) rather than a built-in property of the layout.
    * ORC File Format

In [6]:
hr_employee.rdd.getNumPartitions()

1

In [5]:
# Write the DataFrame in Parquet format.
# mode("overwrite") replaces output left by an earlier run; with the default
# save mode the write fails when the path already exists, so the cell could
# not be re-run.
hr_employee.write.mode("overwrite").parquet("file:///home/hadoop/Downloads/HR_Parquet")

In [7]:
# Write the DataFrame in ORC format.
# mode("overwrite") replaces output left by an earlier run; with the default
# save mode the write fails when the path already exists, so the cell could
# not be re-run.
hr_employee.write.mode("overwrite").orc("/HR_orc")

In [9]:
spark.read.orc("/HR_orc").show(5)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

#### Optimization Techniques
    * Optimizing Spark jobs can significantly improve the performance of Spark queries and jobs.

#### 2.Partitioning
    * Partitioning divides data into smaller chunks, which can be processed in parallel.

In [10]:
hr_employee.rdd.getNumPartitions()

1

In [12]:
# Redistribute the rows of hr_employee across 3 partitions so that later stages
# (such as the write below) can work on them in parallel.
partitioned_df = hr_employee.repartition(3)

In [13]:
# Write the repartitioned DataFrame in Parquet format.
# mode("overwrite") replaces output left by an earlier run; with the default
# save mode the write fails when the path already exists, so the cell could
# not be re-run.
partitioned_df.write.mode("overwrite").parquet("/HR_Partition")

#### 3.Caching & Persistence
    * Managing the different storage levels (memory, disk, or both) at which data is kept for reuse.

In [14]:
# Mark hr_employee for caching at Spark's default storage level.
# NOTE(review): cache() is lazy - nothing is stored until an action runs on the
# DataFrame; confirm the default level for the Spark version in use.
hr_employee.cache()

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [16]:
# Persist the DataFrame at an explicitly chosen storage level
# (for example memory only, memory and disk, or disk only).
from pyspark import StorageLevel

# Spark ignores a persist() request for data that is already cached (it only
# logs a warning), so release any existing cache first; otherwise the storage
# level requested here would never be applied. unpersist() is harmless when
# nothing is cached.
hr_employee.unpersist()
hr_employee1 = hr_employee.persist(StorageLevel.MEMORY_AND_DISK)

In [17]:
# Switch the DataFrame to a memory-only storage level.
# persist() on already cached data is ignored, so drop the existing cache first.
# MEMORY_ONLY is used instead of MEMORY_ONLY_SER: in PySpark the *_SER constants
# were only deprecated aliases of the plain levels and are not available in
# newer PySpark releases.
hr_employee.unpersist()
hr_employee2 = hr_employee.persist(StorageLevel.MEMORY_ONLY)

#### 4. Serialization
    * Efficient serialization reduces the time needed to read/write data and to transfer it over the network.
    Java serialization is the default; Kryo serialization is a popular alternative that gives better performance than the default Java serialization.

a) Java Serialization:
* It is the default serialization method. It is easy to use, but the drawback is that it slows down reads and writes.

In [18]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [19]:
# Return the active SparkSession, or create one with this application name if
# none is running.
spark = SparkSession.builder.appName("Java Serialization").getOrCreate()

In [20]:
# Stop the current session so that the next builder call starts a fresh one.
# NOTE(review): presumably done so that serializer settings passed to the
# builder afterwards can take effect - confirm.
spark.stop()

In [32]:
# Build (or fetch) a session configured to use Spark's Java serializer.
spark = (
    SparkSession.builder
    .appName("JPySpark Serialization")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .getOrCreate()
)

In [33]:
spark

b) Kryo Serialization: faster and more compact than Java serialization

In [24]:
# Build a session that uses the Kryo serializer.
# The serializer class and the configuration keys are spelled "Kryo" / "kryo";
# the previous "Kyro" spelling named a class that does not exist and keys that
# Spark does not recognise.
# NOTE(review): spark.serializer is only read when a new SparkContext starts, so
# stop any running session (spark.stop()) before this cell for it to apply.
# NOTE(review): with registrationRequired=true every serialized class must be
# registered, and the class listed in classesToRegister must exist on the
# classpath - confirm org.apache.spark.example.Person is a real class here.
spark = (
    SparkSession.builder
    .appName("Kryo Serialization")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrationRequired", "true")
    .config("spark.kryo.classesToRegister", "org.apache.spark.example.Person")
    .getOrCreate()
)

#### 5.Broadcast Join
    * Broadcasting small datasets improves join performance.

In [40]:
# Load the two CSV inputs for the join demo; both have a header row and let
# Spark infer the column types.
small_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/hadoop/Downloads/airports.csv")
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/hadoop/Downloads/raw_flight_data.csv")
)

In [41]:
from pyspark.sql.functions import broadcast
# Attach a broadcast hint to small_df so that joins against it can be executed
# as broadcast joins.
broadcast_df = broadcast(small_df)

In [1]:
# Mark both join inputs for caching so that repeated actions can reuse them.
# NOTE(review): the recorded run of this cell (execution count 1) raised
# NameError, apparently because it ran in a fresh kernel before broadcast_df
# was defined; run the notebook top to bottom.
broadcast_df = broadcast_df.cache()
df = df.cache()

NameError: name 'broadcast_df' is not defined

In [46]:
#Broadcast join
# Join each row of df to the broadcast-hinted table where OriginAirportID equals
# airport_id. Both key columns are kept in the result because the join uses a
# column expression rather than a shared column name.
airport_df = df.join(broadcast_df, df.OriginAirportID == broadcast_df.airport_id)

In [49]:
airport_df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|          city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|       Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|      Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|     St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -1

#### 6.Level of Parallelism

In [51]:
# Adjust the level of parallelism based upon the cluster size.
# spark.sql.shuffle.partitions is the setting that can be changed on a running
# session; it controls how many partitions DataFrame shuffles (joins,
# aggregations) produce.
# spark.default.parallelism (the RDD equivalent) is read from the SparkConf when
# the SparkContext starts, so it must be supplied up front, e.g.
# SparkSession.builder.config("spark.default.parallelism", "100"); setting it
# through spark.conf.set afterwards does not change RDD parallelism.
spark.conf.set("spark.sql.shuffle.partitions", 100)

#### 7. Avoid GroupByKey
    * Use reduceByKey() or aggregateByKey() instead of groupByKey() to reduce the amount of data that is shuffled.

In [53]:
# Small (item, quantity) pair RDD used to compare the aggregation approaches.
rdd = spark.sparkContext.parallelize(
    [
        ('dosa', 2),
        ('idly', 3),
        ('vada', 5),
        ('rice', 1),
        ('coffee', 5),
        ('idly', 3),
        ('vada', 3),
    ]
)
# groupByKey gathers all values of a key first; Python's built-in sum then
# totals each group.
rdd.groupByKey().mapValues(sum).collect()

[('dosa', 2), ('idly', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [54]:
# Same per-key totals as the groupByKey version, obtained by adding the values
# of each key pairwise.
rdd.reduceByKey(lambda x,y : x+y).collect()

[('dosa', 2), ('idly', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [57]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import sum`: that form rebinds the name `sum` for
# the whole kernel, shadowing Python's built-in sum() and breaking code such as
# rdd.groupByKey().mapValues(sum) when it is re-run.
from pyspark.sql import functions as F

# The same orders as a two-column DataFrame (plus one extra 'sweets' row).
df = spark.createDataFrame(
    [
        ('dosa', 2),
        ('idly', 3),
        ('vada', 5),
        ('rice', 1),
        ('coffee', 5),
        ('idly', 3),
        ('vada', 3),
        ('sweets', 3),
    ],
    ["order", "value"],
)
# Total the "value" column per "order" and display the result.
df.groupBy("order").agg(F.sum("value").alias("total_value")).show()

+------+-----------+
| order|total_value|
+------+-----------+
|sweets|          3|
|  vada|          8|
|  dosa|          2|
|  idly|          6|
|  rice|          1|
|coffee|          5|
+------+-----------+



In [58]:
# Drop down to the DataFrame's underlying RDD: each two-column Row acts as a
# (key, value) pair, so this adds up "value" per "order".
df.rdd.reduceByKey(lambda x,y : x+y).collect()

[('dosa', 2),
 ('idly', 6),
 ('vada', 8),
 ('rice', 1),
 ('coffee', 5),
 ('sweets', 3)]

#### 8. Reduce Shuffle
    * Reduce the number of shuffles by optimizing transformations.
    * Use reduceByKey() over groupByKey()
    * Use map() and reduce() over groupby()

#### 9.Repartition() & Coalesce()

#### 10.Accumulators
    * Use accumulators to aggregate information such as count(), sum(), corr(), max(), std(), var() etc.
    across all executors running tasks in parallel on multiple worker nodes.
    * Accumulators are used for counting and summing.

In [82]:
# Create a shared accumulator with an initial value of 0; tasks add to it and
# the driver reads the total through acc.value.
acc = spark.sparkContext.accumulator(0)

In [66]:
type(acc)

pyspark.accumulators.Accumulator

In [78]:
# RDD holding the integers 1 through 9.
rdd = spark.sparkContext.parallelize(list(range(1, 10)))

In [83]:
# Plain Python function (not a Spark SQL UDF); it is passed to rdd.foreach so
# that it is called once per element.
def add(x):
    """Add ``x`` to the module-level accumulator ``acc``; returns nothing."""
    acc.add(x)

In [84]:
rdd.foreach(add)

In [85]:
print(acc.value)

45


In [74]:
def counter(x):
    """Count one processed element and pass it through unchanged.

    Adds 1 to the module-level accumulator ``acc`` and returns ``x`` as-is, so
    the function can be applied with ``rdd.map`` without altering the data.
    """
    # No ``global`` declaration is needed: ``acc`` is only looked up and mutated
    # through a method call, never rebound (this matches how ``add`` uses it).
    acc.add(1)
    return x

In [86]:
# Apply counter to every element; count() is the action that makes the map run.
# NOTE(review): acc is not reset before this cell, so these increments are added
# on top of whatever value acc already holds.
rdd.map(counter).count()

9

#### 11.Bucketing
     * Use bucketing to split large datasets into buckets for efficient queries and joins.