In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime

# Initialize Spark Session
spark = SparkSession.builder.appName("TicketsAndHolidays").getOrCreate()

# -----------------------
# Create Tickets Table
# -----------------------

# Data with datetime
tickets_data = [
    ("1", datetime.strptime('2022-08-01', '%Y-%m-%d'), datetime.strptime('2022-08-03', '%Y-%m-%d')),
    ("2", datetime.strptime('2022-08-01', '%Y-%m-%d'), datetime.strptime('2022-08-12', '%Y-%m-%d')),
    ("3", datetime.strptime('2022-08-01', '%Y-%m-%d'), datetime.strptime('2022-08-16', '%Y-%m-%d'))
]

# Schema
tickets_schema = StructType([
    StructField("ticket_id", StringType(), True),
    StructField("create_date", TimestampType(), True),
    StructField("resolved_date", TimestampType(), True)
])

# Create DataFrame
tickets = spark.createDataFrame(tickets_data, schema=tickets_schema)

# Create or replace a temp view
tickets.createOrReplaceTempView("tickets")

# -----------------------
# Create Holidays Table
# -----------------------

# Data with datetime
holidays_data = [
    (datetime.strptime('2022-08-11', '%Y-%m-%d'), 'Rakhi'),
    (datetime.strptime('2022-08-15', '%Y-%m-%d'), 'Independence day')
]

# Schema
holidays_schema = StructType([
    StructField("holiday_date", TimestampType(), True),
    StructField("reason", StringType(), True)
])

# Create DataFrame
holidays = spark.createDataFrame(holidays_data, schema=holidays_schema)

# Create or replace a temp view
holidays.createOrReplaceTempView("holidays")

# -----------------------
# Check your tables!
# -----------------------

# Sample queries
spark.sql("SELECT * FROM tickets").show(truncate=False)
spark.sql("SELECT * FROM holidays").show(truncate=False)


+---------+-------------------+-------------------+
|ticket_id|create_date        |resolved_date      |
+---------+-------------------+-------------------+
|1        |2022-08-01 00:00:00|2022-08-03 00:00:00|
|2        |2022-08-01 00:00:00|2022-08-12 00:00:00|
|3        |2022-08-01 00:00:00|2022-08-16 00:00:00|
+---------+-------------------+-------------------+

+-------------------+----------------+
|holiday_date       |reason          |
+-------------------+----------------+
|2022-08-11 00:00:00|Rakhi           |
|2022-08-15 00:00:00|Independence day|
+-------------------+----------------+



In [None]:
    -- datediff(day, create_date, resolved_date) - 2* datediff(week, create_date, resolved_date) as actualdays


In [9]:
spark.sql("""
    select *,
    datediff(day, create_date, resolved_date) - 2* datediff(week, create_date, resolved_date) - no_of_holidays as actualdays
    from (
    select ticket_id, create_date, resolved_date, count(holiday_date) as no_of_holidays
    from tickets left join holidays on holiday_date between create_date and resolved_date
    group by ticket_id, create_date, resolved_date)
""").show()


+---------+-------------------+-------------------+--------------+----------+
|ticket_id|        create_date|      resolved_date|no_of_holidays|actualdays|
+---------+-------------------+-------------------+--------------+----------+
|        1|2022-08-01 00:00:00|2022-08-03 00:00:00|             0|         2|
|        2|2022-08-01 00:00:00|2022-08-12 00:00:00|             1|         8|
|        3|2022-08-01 00:00:00|2022-08-16 00:00:00|             2|         9|
+---------+-------------------+-------------------+--------------+----------+



In [13]:
import pyspark.sql.functions as F
joined_df = (
    tickets.alias("t")
    .join(
        holidays.alias("h"),
        (F.col("h.holiday_date").between(F.col("t.create_date"), F.col("t.resolved_date"))),
        how="left"
    )
)

# Aggregate to count number of holidays
aggregated_df = (
    joined_df
    .groupBy("ticket_id", "create_date", "resolved_date")
    .agg(F.count("h.holiday_date").alias("no_of_holidays"))
)

# Calculate actualdays
result_df = (
    aggregated_df
    .withColumn(
        "actualdays",
        F.datediff(F.col("resolved_date"), F.col("create_date"))
        - 2 * F.datediff(F.weekofyear(F.col("resolved_date")), F.weekofyear(F.col("create_date")))
        - F.col("no_of_holidays")
    )
)

# Show the result
result_df.show()

AnalysisException: cannot resolve 'datediff(weekofyear(CAST(t.resolved_date AS DATE)), weekofyear(CAST(t.create_date AS DATE)))' due to data type mismatch: argument 1 requires date type, however, 'weekofyear(CAST(t.resolved_date AS DATE))' is of int type. argument 2 requires date type, however, 'weekofyear(CAST(t.create_date AS DATE))' is of int type.;
'Project [ticket_id#226, create_date#227, resolved_date#228, no_of_holidays#279L, ((datediff(cast(resolved_date#228 as date), cast(create_date#227 as date)) - (datediff(weekofyear(cast(resolved_date#228 as date)), weekofyear(cast(create_date#227 as date))) * 2)) - no_of_holidays#279L) AS actualdays#284]
+- Aggregate [ticket_id#226, create_date#227, resolved_date#228], [ticket_id#226, create_date#227, resolved_date#228, count(holiday_date#232) AS no_of_holidays#279L]
   +- Join LeftOuter, ((holiday_date#232 >= create_date#227) AND (holiday_date#232 <= resolved_date#228))
      :- SubqueryAlias t
      :  +- LogicalRDD [ticket_id#226, create_date#227, resolved_date#228], false
      +- SubqueryAlias h
         +- LogicalRDD [holiday_date#232, reason#233], false
