In [0]:
#Q:  Write a Pyspark code to find the output table as given below-
#   employeeid, default_number, total_entry, total_login, total_logout, latest_login, latest_logout. 

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [0]:
checkin_df = spark.createDataFrame(
    [
(1000, 'login', '2023-06-16 01:00:15.34'),
 (1000, 'login', '2023-06-16 02:00:15.34'),
 (1000, 'login', '2023-06-16 03:00:15.34'),
 (1000, 'logout', '2023-06-16 12:00:15.34'),
 (1001, 'login', '2023-06-16 01:00:15.34'),
 (1001, 'login', '2023-06-16 02:00:15.34'),
 (1001, 'login', '2023-06-16 03:00:15.34'),
 (1001, 'logout', '2023-06-16 12:00:15.34')
 ],
 ["employeeid", "entry_details", "timestamp_details"]
 )

detail_df = spark.createDataFrame([(1001, 9999, 'false'),
 (1001, 1111, 'false'),
 (1001, 2222, 'true'),
 (1003, 3333, 'false')],
 ["id", "phone_number", "isdefault"])

In [0]:
checkin_df.show(truncate=0)

+----------+-------------+----------------------+
|employeeid|entry_details|timestamp_details     |
+----------+-------------+----------------------+
|1000      |login        |2023-06-16 01:00:15.34|
|1000      |login        |2023-06-16 02:00:15.34|
|1000      |login        |2023-06-16 03:00:15.34|
|1000      |logout       |2023-06-16 12:00:15.34|
|1001      |login        |2023-06-16 01:00:15.34|
|1001      |login        |2023-06-16 02:00:15.34|
|1001      |login        |2023-06-16 03:00:15.34|
|1001      |logout       |2023-06-16 12:00:15.34|
+----------+-------------+----------------------+



In [0]:
display(detail_df)

id,phone_number,isdefault
1001,9999,False
1001,1111,False
1001,2222,True
1003,3333,False


In [0]:
joined_df = checkin_df.join(detail_df,checkin_df['employeeid']==detail_df['id'],'left')



Out[6]: 0

In [0]:
joined_df.show(truncate=0)

+----------+-------------+----------------------+----+------------+---------+
|employeeid|entry_details|timestamp_details     |id  |phone_number|isdefault|
+----------+-------------+----------------------+----+------------+---------+
|1000      |login        |2023-06-16 01:00:15.34|null|null        |null     |
|1000      |login        |2023-06-16 02:00:15.34|null|null        |null     |
|1000      |login        |2023-06-16 03:00:15.34|null|null        |null     |
|1000      |logout       |2023-06-16 12:00:15.34|null|null        |null     |
|1001      |login        |2023-06-16 01:00:15.34|1001|2222        |true     |
|1001      |login        |2023-06-16 01:00:15.34|1001|1111        |false    |
|1001      |login        |2023-06-16 01:00:15.34|1001|9999        |false    |
|1001      |login        |2023-06-16 02:00:15.34|1001|2222        |true     |
|1001      |login        |2023-06-16 02:00:15.34|1001|1111        |false    |
|1001      |login        |2023-06-16 02:00:15.34|1001|9999      

In [0]:
#Fetch only  when isdefault is true 
joined_df=joined_df.filter(col("isdefault")=="true")

In [0]:
display(joined_df)

employeeid,entry_details,timestamp_details,id,phone_number,isdefault
1001,login,2023-06-16 01:00:15.34,1001,2222,True
1001,login,2023-06-16 02:00:15.34,1001,2222,True
1001,login,2023-06-16 03:00:15.34,1001,2222,True
1001,logout,2023-06-16 12:00:15.34,1001,2222,True


In [0]:
#calculate the total_entry 
total_entry=joined_df.groupBy("employeeid").agg(count('*').alias("Total_Entry"))

In [0]:
display(total_entry)

employeeid,Total_Entry
1001,4


In [0]:
#total_login calculation
total_login=joined_df.filter(col("entry_details")=="login").groupBy("employeeid").agg(count('*').alias("Total_Login_Entry"))

In [0]:
#total_login calculation
total_logout=joined_df.filter(col("entry_details")=="logout").groupBy("employeeid").agg(count('*').alias("Total_Logout_Entry"))

In [0]:
display(total_logout)

employeeid,Total_Logout_Entry
1001,1


In [0]:
#Latest login details
latest_login=joined_df.filter(col("entry_details")=="login").groupBy("employeeid").agg(min("timestamp_details").alias("Latest_Login"))

#Latest Logout Details
latest_logout=joined_df.filter(col("entry_details")=="logout").groupBy("employeeid").agg(max("timestamp_details").alias("Latest_Logout"))

In [0]:
latest_logout.show(truncate=0)

+----------+----------------------+
|employeeid|Latest_Logout         |
+----------+----------------------+
|1001      |2023-06-16 12:00:15.34|
+----------+----------------------+



In [0]:
#Joining all the information to make a single data frame
final_df=total_entry.join(total_login,total_entry["employeeid"]==total_login["employeeid"],"inner").select(total_entry["employeeid"],"Total_Entry","Total_Login_Entry")
final_df=final_df.join(total_logout,final_df["employeeid"]==total_logout["employeeid"],"inner").select(final_df["employeeid"],"Total_Entry","Total_Login_Entry","Total_Logout_Entry")
final_df=final_df.join(latest_login,final_df["employeeid"]==latest_login["employeeid"],"inner").select(final_df["employeeid"],"Total_Entry","Total_Login_Entry","Total_Logout_Entry","Latest_Login")
final_df=final_df.join(latest_logout,final_df["employeeid"]==latest_logout["employeeid"],"inner").select(final_df["employeeid"],"Total_Entry","Total_Login_Entry","Total_Logout_Entry","Latest_Login","Latest_Logout")




In [0]:
final_df.show()

+----------+-----------+-----------------+------------------+--------------------+--------------------+
|employeeid|Total_Entry|Total_Login_Entry|Total_Logout_Entry|        Latest_Login|       Latest_Logout|
+----------+-----------+-----------------+------------------+--------------------+--------------------+
|      1001|          4|                3|                 1|2023-06-16 01:00:...|2023-06-16 12:00:...|
+----------+-----------+-----------------+------------------+--------------------+--------------------+

