In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Sample employee records in the order: id, name, age, salary, address, nominee.
#
# Salary is written as a string in every row so that schema inference produces
# one predictable type (string) regardless of the PySpark version. Mixing int
# and str values in the same column relies on version-specific type merging,
# and some PySpark releases reject it with a TypeError ("Can not merge type").
# The type-casting example later in this notebook expects salary to start out
# as a string.
#
# NOTE(review): the address 'null' in the last row is the literal text "null",
# not a missing value -- switch to None if a real NULL is intended.
emp_df = [
    (1, 'Manish', 26, '75000', 'bihar', 'nominee1'),
    (2, 'Nikita', 23, '100000', 'uttarpradesh', 'nominee2'),
    (3, 'Pritam', 22, '150000', 'Bangalore', 'India'),
    (4, 'Prantosh', 17, '200000', 'Kolkata', 'India'),
    (5, 'Vikash', 31, '300000', 'null', 'nominee5'),
]

# Column names only; the column types are inferred from the data above.
emp_schema = ['id', 'name', 'age', 'salary', 'address', 'nominee']

employee_df = spark.createDataFrame(data=emp_df, schema=emp_schema)

# Register the frame as a temp view so it can also be queried with Spark SQL.
employee_df.createOrReplaceTempView("employee_tbl")
display(employee_df)

id,name,age,salary,address,nominee
1,Manish,26,75000,bihar,nominee1
2,Nikita,23,100000,uttarpradesh,nominee2
3,Pritam,22,150000,Bangalore,India
4,Prantosh,17,200000,Kolkata,India
5,Vikash,31,300000,,nominee5


ALIASING USING COL

In [0]:
# Project three columns, exposing "id" under the name "employee_id".
# alias() is a Column method, so "id" is wrapped in col(); the other two
# columns are passed as plain name strings.
display(
    employee_df.select(
        col("id").alias("employee_id"),
        "name",
        "salary",
    )
)

employee_id,name,salary
1,Manish,75000
2,Nikita,100000
3,Pritam,150000
4,Prantosh,200000
5,Vikash,300000


FILTERING DATA

In [0]:
# Single condition built with col().
print("SINGLE FILTER")
display(
    employee_df.filter(col("salary") > 150000)
)

# Several conditions combined with &.
# Each comparison needs its own pair of parentheses because Python's &
# operator binds more tightly than > and <.
print("MULTIPLE FILTERS")
display(
    employee_df.filter(
        (col("salary") > 150000)
        & (col("age") < 18)
    )
)

SINGLE FILTER


id,name,age,salary,address,nominee
4,Prantosh,17,200000,Kolkata,India
5,Vikash,31,300000,,nominee5


MULTIPLE FILTERS


id,name,age,salary,address,nominee
4,Prantosh,17,200000,Kolkata,India


In [0]:
# Literals: lit() builds a column holding the same constant in every row,
# which makes it a convenient way to supply a default-value column.
# Here every existing column is kept and a constant "Last_name" is appended.
display(
    employee_df.select(
        "*",
        lit("Pandey").alias("Last_name"),
    )
)

id,name,age,salary,address,nominee,Last_name
1,Manish,26,75000,bihar,nominee1,Pandey
2,Nikita,23,100000,uttarpradesh,nominee2,Pandey
3,Pritam,22,150000,Bangalore,India,Pandey
4,Prantosh,17,200000,Kolkata,India,Pandey
5,Vikash,31,300000,,nominee5,Pandey


ADDING A COLUMN USING WITHCOLUMN()

In [0]:
# withColumn() returns all existing columns plus the one named in the call,
# so it can be used to add a new column to the output.
# Here a constant "Surname" column is added to every row.
display(
    employee_df.withColumn(
        "Surname",
        lit("Singh"),
    )
)

id,name,age,salary,address,nominee,Surname
1,Manish,26,75000,bihar,nominee1,Singh
2,Nikita,23,100000,uttarpradesh,nominee2,Singh
3,Pritam,22,150000,Bangalore,India,Singh
4,Prantosh,17,200000,Kolkata,India,Singh
5,Vikash,31,300000,,nominee5,Singh


RENAMING COLUMN USING withColumnRenamed() function

In [0]:
# Rename the "id" column to "emp_id"; every other column is left unchanged.
display(
    employee_df.withColumnRenamed(
        "id",
        "emp_id",
    )
)

emp_id,name,age,salary,address,nominee
1,Manish,26,75000,bihar,nominee1
2,Nikita,23,100000,uttarpradesh,nominee2
3,Pritam,22,150000,Bangalore,India
4,Prantosh,17,200000,Kolkata,India
5,Vikash,31,300000,,nominee5


In [0]:
# Type casting with withColumn(): reusing an existing column name replaces
# that column, so the salary column is overwritten by its cast version.
print("Before casting schema")
employee_df.printSchema()

# "long" is Spark's 64-bit integer type.
print("After casting salary from string to integer\n")
(
    employee_df
    .withColumn("salary", col("salary").cast("long"))
    .printSchema()
)

Before casting schema
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: string (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)

After casting salary from string to integer

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Removing multiple columns.
# Both columns are given as plain name strings: when several columns are
# dropped in one call, PySpark releases before 3.4 accept only strings and
# raise a TypeError if a Column object is mixed in. Name strings behave the
# same on every version.
display(employee_df.drop("nominee", "address"))

id,name,age,salary
1,Manish,26,75000
2,Nikita,23,100000
3,Pritam,22,150000
4,Prantosh,17,200000
5,Vikash,31,300000
