In [7]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("WindowFunction").master("local[*]").getOrCreate()
spark

In [16]:
df=spark.read.format("csv").option("header",True).option("inferschema",True).load("input/emp.csv")

In [17]:
df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: timestamp (nullable = true)



In [None]:
#you can see that the datatypes are inferred the datatype correctly

In [18]:
df.show(5) # 5 means number of rows to be printed

+-----------+-------------+----------+---+------+------+-------------------+
|employee_id|department_id|      name|age|gender|salary|          hire_date|
+-----------+-------------+----------+---+------+------+-------------------+
|          1|          101|  John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102| Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102| Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103| Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
+-----------+-------------+----------+---+------+------+-------------------+
only showing top 5 rows



In [42]:
emp_schema="employee_id int, department_id int,name string, age int,gender string,salary double, hire_date date"
df_using_schema=spark.read.format("csv").option("header",True).schema(emp_schema).load("input/emp.csv")
df_using_schema.printSchema()
df_using_schema.show()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)

+-----------+-------------+-------------+---+------+-------+----------+
|employee_id|department_id|         name|age|gender| salary| hire_date|
+-----------+-------------+-------------+---+------+-------+----------+
|          1|          101|     John Doe| 30|  Male|50000.0|2015-01-01|
|          2|          101|   Jane Smith| 25|Female|45000.0|2016-02-15|
|          3|          102|    Bob Brown| 35|  Male|55000.0|2014-05-01|
|          4|          102|    Alice Lee| 28|Female|48000.0|2017-09-30|
|          5|          103|    Jack Chan| 40|  Male|60000.0|2013-04-01|
|          6|          103|    Jill Wong| 32|Female|52000.0|2018-07-01|
|          7|          101|James Johnson| 42|  Male|70000.0|2012-03-1

In [37]:
%%markdown

### how to handle bad records
```
1.Permissive Mode(Default)
2.DropMalformed -> will remove the corrupted Column
3.Failfast -> will not allow any corrupted record
```


### how to handle bad records
```
1.Permissive Mode(Default)
2.DropMalformed -> will remove the corrupted Column
3.Failfast -> will not allow any corrupted record
```


In [43]:

emp_schema="employee_id int, department_id int,name string, age int,gender string,salary double, hire_date date,bad_record string"
df_corrupt=spark.read.format("csv").option("header",True).option("columnNameOfCorruptRecord","bad_record").schema(emp_schema).load("input/emp.csv")
df_corrupt.printSchema()
df_corrupt.show()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- bad_record: string (nullable = true)

+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|employee_id|department_id|         name|age|gender| salary| hire_date|          bad_record|
+-----------+-------------+-------------+---+------+-------+----------+--------------------+
|          1|          101|     John Doe| 30|  Male|50000.0|2015-01-01|                null|
|          2|          101|   Jane Smith| 25|Female|45000.0|2016-02-15|                null|
|          3|          102|    Bob Brown| 35|  Male|55000.0|2014-05-01|                null|
|          4|          102|    Alice Lee| 28|Female|48000.0|2017-09-30|                null|
|          5|          1

In [44]:
emp_schema="employee_id int, department_id int,name string, age int,gender string,salary double, hire_date date"
df_mode_dropmalformed=spark.read.format("csv").option("header",True).option("mode","DROPMALFORMED").schema(emp_schema).load("input/emp.csv")
df_mode_dropmalformed.printSchema()
df_mode_dropmalformed.show()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)

+-----------+-------------+-------------+---+------+-------+----------+
|employee_id|department_id|         name|age|gender| salary| hire_date|
+-----------+-------------+-------------+---+------+-------+----------+
|          1|          101|     John Doe| 30|  Male|50000.0|2015-01-01|
|          2|          101|   Jane Smith| 25|Female|45000.0|2016-02-15|
|          3|          102|    Bob Brown| 35|  Male|55000.0|2014-05-01|
|          4|          102|    Alice Lee| 28|Female|48000.0|2017-09-30|
|          5|          103|    Jack Chan| 40|  Male|60000.0|2013-04-01|
|          6|          103|    Jill Wong| 32|Female|52000.0|2018-07-01|
|          7|          101|James Johnson| 42|  Male|70000.0|2012-03-1

In [34]:
%%markdown

### failfast


### failfast


In [None]:
emp_schema="employee_id int, department_id int,name string, age int,gender string,salary double, hire_date date"
df_mode_dropmalformed=spark.read.format("csv").option("header",True).option("mode","FAILFAST").schema(emp_schema).load("input/emp.csv")
df_mode_dropmalformed.printSchema()
df_mode_dropmalformed.show()

In [46]:
_options={
    "header":"true",
    "inferSchema":"true",
    "mode":"PERMISSIVE"
}

emp_schema="employee_id int, department_id int,name string, age int,gender string,salary double, hire_date date"
df_mode_dropmalformed=spark.read.format("csv").options(**_options).load("input/emp.csv")
df_mode_dropmalformed.printSchema()
df_mode_dropmalformed.show()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: timestamp (nullable = true)

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00

In [47]:
%%markdown

### Reading many csv file from a folder


### Reading many csv file from a folder


In [54]:
df_from_many=spark.read.option("header",True).csv(["input/manycsv/emp1.csv","input/manycsv/emp2.csv","input/manycsv/emp3.csv"])
df_from_many.printSchema()

df_from_many.show()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)

+-----------+-------------+------------+---+------+------+----------+
|employee_id|department_id|        name|age|gender|salary| hire_date|
+-----------+-------------+------------+---+------+------+----------+
|         11|          103| Chris Green| 27|     M| 52000|2021-03-15|
|         12|          103|  Tina White| 36|     F| 63000|2019-09-08|
|         13|          103| Daniel Cruz| 40|     M| 71000|2017-05-29|
|         14|          103|  Nina Gomez| 31|     F| 60000|2020-11-20|
|         15|          103|  Brian Hall| 35|     M| 67000|2018-01-04|
|          6|          102|  Mary Zhang| 30|     F| 59000|2020-10-01|
|          7|          102|   James Kim| 45|     M| 76000|2016-04-12|
|          8|     

In [74]:
from pyspark.sql.functions import col
df_from_many=spark.read.option("header",True).option("inferschema",True).csv("input/manycsv").orderBy(col("employee_id"))
df_from_many.printSchema()
print(df_from_many.rdd.getNumPartitions())
df_from_many.show()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: timestamp (nullable = true)

1
+-----------+-------------+------------+---+------+------+-------------------+
|employee_id|department_id|        name|age|gender|salary|          hire_date|
+-----------+-------------+------------+---+------+------+-------------------+
|          1|          101|    John Doe| 29|     M| 55000|2020-01-15 00:00:00|
|          2|          101| Alice Smith| 34|     F| 62000|2019-03-22 00:00:00|
|          3|          101|   Raj Patel| 41|     M| 72000|2018-11-05 00:00:00|
|          4|          101|   Susan Lee| 25|     F| 50000|2021-06-10 00:00:00|
|          5|          101|    Mark Liu| 38|     M| 68000|2017-08-14 00:00:00|
|          6|          102|  Mary Zhang| 30|     F| 59000|2020-10-01 00:00:00

In [75]:
df_from_many.repartition(3).write.mode("overwrite").save("input/parquetinputs")

In [63]:
df_from_many.write.mode("overwrite").save("input/employee.orc")

In [64]:
%%markdown
 #### reading parquet file

 #### reading parquet file


df_parquet=spark.read.format("parquet").load("input/employee.parquet/employee.parquet")
df_parquet.printSchema()
df_parquet.show()

In [84]:
import time

def get_time_of_execution(func):
    def inner_get_time()->str:
        start_time=time.time()
        func()
        end_time=time.time()
        duration=(end_time-start_time)*1000
        return(f"ExectuionTime:{duration} ms")
    print(inner_get_time())

In [71]:
@get_time_of_execution
def parquet_file_exe():
    df_parquet=spark.read.format("parquet").load("input/employee.parquet/employee.parquet")
    df_parquet.count()
    

ExectuionTime:435.78243255615234 ms


In [73]:
@get_time_of_execution
def csv_file_exe():
    df_parquet=spark.read.format("csv").load("input/manycsv/")
    df_parquet.count()

ExectuionTime:573.9097595214844 ms


In [80]:
create_df_from_many_parquets=spark.read.format("parquet").option("recursiveFileLookup",True).load("input/parquetinputs")
create_df_from_many_parquet


+-----------+-------------+------------+---+------+------+-------------------+
|employee_id|department_id|        name|age|gender|salary|          hire_date|
+-----------+-------------+------------+---+------+------+-------------------+
|          7|          102|   James Kim| 45|     M| 76000|2016-04-12 00:00:00|
|         12|          103|  Tina White| 36|     F| 63000|2019-09-08 00:00:00|
|         15|          103|  Brian Hall| 35|     M| 67000|2018-01-04 00:00:00|
|          9|          102|Robert Brown| 39|     M| 70000|2018-07-25 00:00:00|
|          2|          101| Alice Smith| 34|     F| 62000|2019-03-22 00:00:00|
|         14|          103|  Nina Gomez| 31|     F| 60000|2020-11-20 00:00:00|
|          4|          101|   Susan Lee| 25|     F| 50000|2021-06-10 00:00:00|
|          1|          101|    John Doe| 29|     M| 55000|2020-01-15 00:00:00|
|          8|          102|  Linda Wong| 28|     F| 54000|2022-02-19 00:00:00|
|         11|          103| Chris Green| 27|     M| 

In [83]:
a=10
b=20
print(f"{a}+{b}={a+b}")
print(a,"+",b,"=",(a+b))

10+20=30
10 + 20 = 30


In [85]:
def get_time_of_execution(func):
    def myfunction()->str:
        print("Before function")
        func()
        print("After function")
        return "function executed"
    print(myfunction())

In [90]:
@get_time_of_execution
def demo():
    print("I am from demo")

Before function
I am from demo
After function
function executed


In [89]:
demo()

I am from demo


In [96]:
df.describe

<bound method DataFrame.describe of DataFrame[employee_id: int, department_id: int, name: string, age: int, gender: string, salary: int, hire_date: timestamp]>

In [98]:
df.show(5)

+-----------+-------------+----------+---+------+------+-------------------+
|employee_id|department_id|      name|age|gender|salary|          hire_date|
+-----------+-------------+----------+---+------+------+-------------------+
|          1|          101|  John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102| Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102| Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103| Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
+-----------+-------------+----------+---+------+------+-------------------+
only showing top 5 rows



In [100]:
from pyspark.sql.functions import when,col
df.withColumn("grade",when(col("salary")>50000,"A").when(((col("salary")>45000) & (col("salary")<50000)),"B").otherwise("d")).show()

+-----------+-------------+-------------+---+------+------+-------------------+-----+
|employee_id|department_id|         name|age|gender|salary|          hire_date|grade|
+-----------+-------------+-------------+---+------+------+-------------------+-----+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|    d|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|    d|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|    A|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|    B|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|    A|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|    A|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|    A|
|          8|          102|     Kate Kim| 29|Female|  null|2019-10-01 00:00:00|    d|
|          9|          103|      Tom Tan| 33|  Male| 5