A company records its employees' movements in and out of the office in a table. Please note the following points about the data:

 

1- First entry for each employee is “in”
2- Every “in” is succeeded by an “out”
3- Employee can work across days
Write a SQL query to find the number of employees inside the office at "2019-04-01 19:05:00".

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

# Obtain the Spark session (getOrCreate returns an existing one if present)
spark = SparkSession.builder.appName("example").getOrCreate()

# Sample in/out events as (emp_id, action, created_at) string triples
data = [
    ("1", "in", "2019-04-01 12:00:00"),
    ("1", "out", "2019-04-01 15:00:00"),
    ("1", "in", "2019-04-01 17:00:00"),
    ("1", "out", "2019-04-01 21:00:00"),
    ("2", "in", "2019-04-01 10:00:00"),
    ("2", "out", "2019-04-01 16:00:00"),
    ("3", "in", "2019-04-01 19:00:00"),
    ("3", "out", "2019-04-02 05:00:00"),
    ("4", "in", "2019-04-01 10:00:00"),
    ("4", "out", "2019-04-01 20:00:00"),
]

# Column names, in the same order as the tuple fields above
schema = ["emp_id", "action", "created_at"]

# Build the DataFrame from the string tuples and cast emp_id / created_at
# to integer / timestamp in a single projection (column order is unchanged)
df = spark.createDataFrame(data, schema).select(
    col("emp_id").cast("integer").alias("emp_id"),
    col("action"),
    col("created_at").cast("timestamp").alias("created_at"),
)


In [0]:
#Approach-1
# Count employees whose most recent event before the check time is an "in".
#
# Taking min/max of created_at per employee would merge all of an employee's
# sessions into one first-in/last-out span, so someone who left and came back
# later would be counted as inside during the gap. Instead, look only at events
# at or before the check time and keep each employee's latest one: they are
# inside exactly when that latest event is an "in".
check_time = "2019-04-01 19:05:00"

df_approach_1 = (
    df.filter(col("created_at") <= check_time)
    .groupBy(col("emp_id"))
    # Structs order by their fields left to right, so max() picks the row with
    # the greatest created_at and carries its action along with it.
    .agg(max(struct(col("created_at"), col("action"))).alias("last_event"))
    .filter(
        (col("last_event.action") == "in")
        # Strict "<" keeps the boundary rule of the earlier version of this
        # cell: an "in" stamped exactly at the check time is not counted.
        & (col("last_event.created_at") < check_time)
    )
    .agg(count(col("emp_id")).alias("no_of_emp"))
)

In [0]:
#Approach-2
# Order each employee's events chronologically so that lead() can look ahead
window_spec = Window.partitionBy("emp_id").orderBy("created_at")

# Attach to every event the timestamp of the same employee's next event
# (null for an employee's last event)
df_app_2 = df.select(
    "*",
    lead("created_at").over(window_spec).alias("next_created_at"),
)

# Keep "in" events whose [created_at, next_created_at] interval contains the
# check time, then count them. Rows with a null next_created_at fail the last
# comparison and are therefore not counted.
result = (
    df_app_2
    .where(col("action") == "in")
    .where(col("created_at") <= "2019-04-01 19:05:00")
    .where(col("next_created_at") >= "2019-04-01 19:05:00")
    .agg(count("*").alias("no_of_emp_inside"))
)

result.show()


+----------------+
|no_of_emp_inside|
+----------------+
|               3|
+----------------+



In [0]:
# Show the per-event rows together with the look-ahead column.
# next_created_at exists only on df_app_2 (df has just the three base columns).
display(df_app_2)

emp_id,action,created_at,next_created_at
1,in,2019-04-01T12:00:00.000+0000,2019-04-01T15:00:00.000+0000
1,out,2019-04-01T15:00:00.000+0000,2019-04-01T17:00:00.000+0000
1,in,2019-04-01T17:00:00.000+0000,2019-04-01T21:00:00.000+0000
1,out,2019-04-01T21:00:00.000+0000,
2,in,2019-04-01T10:00:00.000+0000,2019-04-01T16:00:00.000+0000
2,out,2019-04-01T16:00:00.000+0000,
3,in,2019-04-01T19:00:00.000+0000,2019-04-02T05:00:00.000+0000
3,out,2019-04-02T05:00:00.000+0000,
4,in,2019-04-01T10:00:00.000+0000,2019-04-01T20:00:00.000+0000
4,out,2019-04-01T20:00:00.000+0000,
