<a href="https://colab.research.google.com/github/ARBINDA765/PysparkIntreviewSeries/blob/main/Income_Tax_Returns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark
import findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime
from pyspark.sql.functions import *

def _iso_date(text):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date`` (what ``DateType`` expects)."""
    return datetime.strptime(text, "%Y-%m-%d").date()


# Reuse the running session (or create one). A single call is enough for both
# DataFrames built in this cell.
spark = SparkSession.builder.appName("Create DataFrame").getOrCreate()

# --- Returns filed by users: one row per (user, financial year) that was filed ---
users_data_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("financial_year", StringType(), True),
    StructField("return_file_date", DateType(), True),
])

users_data = [
    (user_id, financial_year, _iso_date(filed_on))
    for user_id, financial_year, filed_on in [
        (1, "FY20", "2020-05-10"),
        (1, "FY21", "2021-10-10"),
        (1, "FY23", "2023-08-20"),
        (2, "FY20", "2020-05-15"),
        (2, "FY21", "2021-09-10"),
        (2, "FY22", "2022-08-20"),
        (2, "FY23", "2023-10-10"),
    ]
]

users_df = spark.createDataFrame(users_data, users_data_schema)
users_df.show()

# --- Filing window per financial year: start date and due date ---
income_tax_dates_schema = StructType([
    StructField("financial_year", StringType(), True),
    StructField("file_start_date", DateType(), True),
    StructField("file_due_date", DateType(), True),
])

income_tax_dates_data = [
    (financial_year, _iso_date(start_on), _iso_date(due_on))
    for financial_year, start_on, due_on in [
        ("FY20", "2020-05-01", "2020-08-31"),
        ("FY21", "2021-06-01", "2021-09-30"),
        ("FY22", "2022-05-05", "2022-08-29"),
        ("FY23", "2023-05-05", "2023-08-31"),
    ]
]

income_tax_dates_df = spark.createDataFrame(income_tax_dates_data, income_tax_dates_schema)
income_tax_dates_df.show()



+-------+--------------+----------------+
|user_id|financial_year|return_file_date|
+-------+--------------+----------------+
|      1|          FY20|      2020-05-10|
|      1|          FY21|      2021-10-10|
|      1|          FY23|      2023-08-20|
|      2|          FY20|      2020-05-15|
|      2|          FY21|      2021-09-10|
|      2|          FY22|      2022-08-20|
|      2|          FY23|      2023-10-10|
+-------+--------------+----------------+

+--------------+---------------+-------------+
|financial_year|file_start_date|file_due_date|
+--------------+---------------+-------------+
|          FY20|     2020-05-01|   2020-08-31|
|          FY21|     2021-06-01|   2021-09-30|
|          FY22|     2022-05-05|   2022-08-29|
|          FY23|     2023-05-05|   2023-08-31|
+--------------+---------------+-------------+



In [16]:
# Tag every row of both tables with the same constant "key" value so that a join
# on "key" pairs each user row with each financial-year row (a cross join).
constant_key = lit(1)
users_df = users_df.withColumn("key", constant_key)
income_tax_dates_df = income_tax_dates_df.withColumn("key", constant_key)


In [47]:
# Goal: for every user and every financial year, report returns that were filed
# after the due date ("late") or not filed at all ("missed").

# Every (user, financial year) combination with that year's due date.
# De-duplicate users *before* the Cartesian product, and use crossJoin directly
# rather than emulating it with an outer join on a constant helper column.
distinct_users_df = users_df.select("user_id").distinct()
all_users_year = distinct_users_df.crossJoin(
    income_tax_dates_df.select("financial_year", "file_due_date")
)

# Attach the actual filing date; it stays NULL when the user never filed for that year.
filed_returns_df = users_df.select("user_id", "financial_year", "return_file_date")
expected_vs_filed_df = all_users_year.join(
    filed_returns_df, on=["user_id", "financial_year"], how="left"
)

is_late = col("return_file_date") > col("file_due_date")
is_missed = col("return_file_date").isNull()

ans_df = (
    expected_vs_filed_df
    # Keep only the problem cases: filed after the due date, or never filed.
    .filter(is_late | is_missed)
    # After the filter every remaining row is either missed or late.
    .withColumn("comment", when(is_missed, "missed").otherwise("late"))
    .select("user_id", "financial_year", "comment")
    # Sort last so the displayed order is the order of the final result.
    .orderBy("user_id", "financial_year")
)
ans_df.show()



+-------+--------------+-------+
|user_id|financial_year|comment|
+-------+--------------+-------+
|      1|          FY21|   late|
|      1|          FY22| missed|
|      2|          FY23|   late|
+-------+--------------+-------+

