# In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, when
from pyspark.sql.window import Window
from datetime import datetime

# Obtain the Spark session for this script under the application name
# "BillingCalculation"; getOrCreate() hands back an existing session if there is one.
spark = (
    SparkSession.builder
    .appName("BillingCalculation")
    .getOrCreate()
)

# Sample data, written with "YYYY-MM-DD" date strings that are parsed further down.
# Billing rows line up with the billings schema: (emp_name, bill_date, bill_rate).
billings_data = [
    ('Sachin', '1990-01-01', 25),
    ('Sehwag', '1989-01-01', 15),
    ('Dhoni', '1989-01-01', 20),
    ('Sachin', '1991-02-05', 30),
]

# Hours rows line up with the hours schema: (emp_name, work_date, bill_hrs).
hours_worked_data = [
    ('Sachin', '1990-07-01', 3),
    ('Sachin', '1990-08-01', 5),
    ('Sehwag', '1990-07-01', 2),
    ('Sachin', '1991-07-01', 4),
]


def _parse_date_field(rows):
    """Return a new list of 3-tuples with the middle item parsed into a date.

    Each input row is ``(name, "YYYY-MM-DD", value)``; the first and last
    items are passed through unchanged.
    """
    return [
        (name, datetime.strptime(day_text, "%Y-%m-%d").date(), value)
        for name, day_text, value in rows
    ]


# Swap the string dates for datetime.date objects so the rows match the
# DateType fields declared in the schemas.
billings_data = _parse_date_field(billings_data)
hours_worked_data = _parse_date_field(hours_worked_data)

# Define Schema & Create DataFrames
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Schema for the billing rows: emp_name (string), bill_date (date),
# bill_rate (integer). Every field is declared nullable.
billings_schema = (
    StructType()
    .add("emp_name", StringType(), True)
    .add("bill_date", DateType(), True)
    .add("bill_rate", IntegerType(), True)
)

# Schema for the hours-worked rows: emp_name (string), work_date (date),
# bill_hrs (integer). Every field is declared nullable.
hours_schema = (
    StructType()
    .add("emp_name", StringType(), True)
    .add("work_date", DateType(), True)
    .add("bill_hrs", IntegerType(), True)
)

# Build each DataFrame from its rows and explicit schema, then register it as a
# temp view so it can be queried by name ("Billings" / "Hours") through Spark SQL.
billings_df = spark.createDataFrame(data=billings_data, schema=billings_schema)
billings_df.createOrReplaceTempView("Billings")

hours_df = spark.createDataFrame(data=hours_worked_data, schema=hours_schema)
hours_df.createOrReplaceTempView("Hours")

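# NOTE(review): everything below is a disabled draft and is intentionally left
# commented out. Two things to resolve before re-enabling it:
#   - max("bill_rate") over the window yields a running maximum rate, which is
#     not necessarily the most recent rate on or before the work date;
#   - the join keeps one row per billing record with bill_date <= work_date, so
#     a work date preceded by several rate records is emitted more than once.
# # Perform Join and Find the Latest Bill Rate Before Work Date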
# window_spec = Window.partitionBy("emp_name").orderBy(col("bill_date").desc())

# billings_df = billings_df.withColumn("latest_bill_rate", max("bill_rate").over(window_spec))

# # Join to Get Billing Rate Before Work Date
# joined_df = hours_df.join(billings_df, "emp_name")\
#     .filter(col("bill_date") <= col("work_date"))\
#     .withColumn("latest_rate", max("bill_rate").over(Window.partitionBy("emp_name", "work_date").orderBy(col("bill_date").desc())))\
#     .select("emp_name", "work_date", "bill_hrs", "latest_rate")

# # Calculate Billing Amount
# final_df = joined_df.withColumn("bill_amount", col("bill_hrs") * col("latest_rate"))

# # Show Result
# final_df.show()
