In [0]:
# Sample rows and explicit schema for the demo DataFrame.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# One tuple per student: (Name, Attendence, Subject_1, Subject_2, Total).
# None marks a missing subject score.
details = [
    ("Virat Kholi", 97, 84, 77, 161),
    ("Sachin Tendulkar", 65, None, 65, 65),
    ("Ms Dhoni", 60, 84, None, 84),
    ("Rishab Pant", 88, 90, 74, 164),
    ("Suresh Raina", 70, 50, 70, 120),
    ("Ravindra Jadeja", 50, None, None, 0),
]

# Only Attendence is declared as an integer; the subject scores and the total
# are declared as strings even though the rows above hold Python ints.
columns = (
    StructType()
    .add("Name", StringType())
    .add("Attendence", IntegerType())
    .add("Subject_1", StringType())
    .add("Subject_2", StringType())
    .add("Total", StringType())
)

In [0]:
# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data=details, schema=columns)

In [0]:
# Print each column of df with its data type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Attendence: integer (nullable = true)
 |-- Subject_1: string (nullable = true)
 |-- Subject_2: string (nullable = true)
 |-- Total: string (nullable = true)



In [0]:
# Render df as a table.
# NOTE(review): display() is not defined in this notebook; presumably it is
# supplied by the notebook runtime (e.g. Databricks) — confirm.
df.display()

Name,Attendence,Subject_1,Subject_2,Total
Virat Kholi,97,84.0,77.0,161
Sachin Tendulkar,65,,65.0,65
Ms Dhoni,60,84.0,,84
Rishab Pant,88,90.0,74.0,164
Suresh Raina,70,50.0,70.0,120
Ravindra Jadeja,50,,,0


In [0]:
# Replace NULL values with the string "Absent".
# The replacement is a string, so only string-typed columns are filled.
df1 = df.na.fill("Absent")
df1.display()

Name,Attendence,Subject_1,Subject_2,Total
Virat Kholi,97,84,77,161
Sachin Tendulkar,65,Absent,65,65
Ms Dhoni,60,84,Absent,84
Rishab Pant,88,90,74,164
Suresh Raina,70,50,70,120
Ravindra Jadeja,50,Absent,Absent,0


### *CASE STATEMENT ( WHEN() OTHERWISE() )*

PySpark when/otherwise – `when()` is a SQL function that returns a Column, and `otherwise()` is a method on that Column. If `otherwise()` is not used, rows that match no condition get the None/NULL
value.   
The conditions are checked in order: for the first condition that matches, the value given in that `when()` is used; if none match, the value given in `otherwise()` is used. This is the same concept as if/else and switch statements in programming languages.          

In [0]:
from pyspark.sql.functions import when

# Total is a string column in the schema, so comparing it with an integer
# literal relies on Spark's implicit cast.
is_pass = df1["Total"] >= 100
is_fail = df1["Total"] < 100

# No otherwise() here: a row matching neither condition would get NULL.
status_col = when(is_pass, "Pass").when(is_fail, "Fail")

df1.withColumn("status", status_col).display()

Name,Attendence,Subject_1,Subject_2,Total,status
Virat Kholi,97,84,77,161,Pass
Sachin Tendulkar,65,Absent,65,65,Fail
Ms Dhoni,60,84,Absent,84,Fail
Rishab Pant,88,90,74,164,Pass
Suresh Raina,70,50,70,120,Pass
Ravindra Jadeja,50,Absent,Absent,0,Fail


In [0]:
# More than one condition can be checked in a single when() by combining
# them with & (AND) and | (OR). Branches are tried top to bottom and the
# first matching one decides the value.

attendance = df1["Attendence"]
total = df1["Total"]

grade_col = (
    when((attendance > 90) & (total > 150), "Distinction")
    .when((attendance > 80) & (total > 130), "Good")
    .when((attendance < 70) & (total < 100) & (total > 0), "Okay")
    .when(total == 0, "Absent")
    .otherwise("Average")
)

df1.withColumn("Grade", grade_col).display()

Name,Attendence,Subject_1,Subject_2,Total,Grade
Virat Kholi,97,84,77,161,Distinction
Sachin Tendulkar,65,Absent,65,65,Okay
Ms Dhoni,60,84,Absent,84,Okay
Rishab Pant,88,90,74,164,Good
Suresh Raina,70,50,70,120,Average
Ravindra Jadeja,50,Absent,Absent,0,Absent


#### *SPLIT()*
The split() function splits the string in a column into parts around a delimiter. For example, if a Name column holds both the first name and the last name in the same value,
we can separate them using split().

In [0]:
from pyspark.sql.functions import split

# Split Name once, at the first space only (limit=2): element 0 is the first
# name and element 1 is everything after it. Without the limit, a name with
# more than two words would lose every word after the second.
name_parts = split(df1["Name"], " ", limit=2)

data = (
    df1.withColumn("First_Name", name_parts.getItem(0))
    .withColumn("Last_Name", name_parts.getItem(1))
)
data.display()

Name,Attendence,Subject_1,Subject_2,Total,First_Name,Last_Name
Virat Kholi,97,84,77,161,Virat,Kholi
Sachin Tendulkar,65,Absent,65,65,Sachin,Tendulkar
Ms Dhoni,60,84,Absent,84,Ms,Dhoni
Rishab Pant,88,90,74,164,Rishab,Pant
Suresh Raina,70,50,70,120,Suresh,Raina
Ravindra Jadeja,50,Absent,Absent,0,Ravindra,Jadeja


#### *CONCAT()*

The concat() function of PySpark SQL concatenates multiple DataFrame columns into a single column. It works on string,
binary, and compatible array columns.

In [0]:
from pyspark.sql.functions import concat

# Join first and last name directly, with no separator between them.
full_name = concat(data["First_Name"], data["Last_Name"]).alias("Full_Name")
data.select("*", full_name).display()

Name,Attendence,Subject_1,Subject_2,Total,First_Name,Last_Name,Full_Name
Virat Kholi,97,84,77,161,Virat,Kholi,ViratKholi
Sachin Tendulkar,65,Absent,65,65,Sachin,Tendulkar,SachinTendulkar
Ms Dhoni,60,84,Absent,84,Ms,Dhoni,MsDhoni
Rishab Pant,88,90,74,164,Rishab,Pant,RishabPant
Suresh Raina,70,50,70,120,Suresh,Raina,SureshRaina
Ravindra Jadeja,50,Absent,Absent,0,Ravindra,Jadeja,RavindraJadeja


#### *CONCAT_WS()*

The concat_ws() function of PySpark concatenates multiple string columns into a single column, placing the given separator (delimiter) between the values.

In [0]:
from pyspark.sql.functions import concat_ws

# Join first and last name with a single space as the separator.
full_name_spaced = concat_ws(" ", data["First_Name"], data["Last_Name"]).alias(
    "Full_Name"
)
data.select("*", full_name_spaced).display()

Name,Attendence,Subject_1,Subject_2,Total,First_Name,Last_Name,Full_Name
Virat Kholi,97,84,77,161,Virat,Kholi,Virat Kholi
Sachin Tendulkar,65,Absent,65,65,Sachin,Tendulkar,Sachin Tendulkar
Ms Dhoni,60,84,Absent,84,Ms,Dhoni,Ms Dhoni
Rishab Pant,88,90,74,164,Rishab,Pant,Rishab Pant
Suresh Raina,70,50,70,120,Suresh,Raina,Suresh Raina
Ravindra Jadeja,50,Absent,Absent,0,Ravindra,Jadeja,Ravindra Jadeja
