Creating spark session

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .appName("Retail Transactions") \
                    .getOrCreate()
spark

#### Data Ingestion & Schema Handling


1. Load the CSV using inferred schema.


In [5]:
from google.colab import drive

drive.mount('/content/drive')
employee_df = spark.read.format('csv').option("header", True) \
                                      .option("inferSchema", True) \
                                      .load('/content/drive/MyDrive/Assignment/Employee_Timesheet.csv')

employee_df.printSchema()

Mounted at /content/drive
root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)



2. Load the same file with schema explicitly defined.


In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

employee_schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True)
])

employee_custom_df = spark.read.format('csv').option("header", True) \
                                      .schema(employee_schema) \
                                      .load('/content/drive/MyDrive/Assignment/Employee_Timesheet.csv')

employee_custom_df.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)



3. Add a new column Weekday extracted from WorkDate

In [7]:
from pyspark.sql.functions import date_format, col

employee_custom_df = employee_custom_df.withColumn("WeekDay", date_format(col("WorkDate"), "EEEE"))
employee_custom_df.select(
                            'EmployeeID',
                            'Name',
                            'Department',
                            'Project',
                            'WorkDate',
                            'WeekDay'
).show()

+----------+-----+----------+-------+----------+---------+
|EmployeeID| Name|Department|Project|  WorkDate|  WeekDay|
+----------+-----+----------+-------+----------+---------+
|      E101|Anita|        IT|  Alpha|2024-05-01|Wednesday|
|      E102|  Raj|        HR|   Beta|2024-05-01|Wednesday|
|      E103| John|   Finance|  Alpha|2024-05-02| Thursday|
|      E101|Anita|        IT|  Alpha|2024-05-03|   Friday|
|      E104|Meena|        IT|  Gamma|2024-05-03|   Friday|
|      E102|  Raj|        HR|   Beta|2024-05-04| Saturday|
+----------+-----+----------+-------+----------+---------+



####  Aggregations & Grouping

4. Calculate total work hours by employee.


In [11]:
from pyspark.sql.functions import sum
total_hours_employee = employee_custom_df.groupBy('EmployeeID') \
                                        .agg(sum('WorkHours').alias('TotalWorkHours')) \
                                        .orderBy('TotalWorkHours', ascending=False)
total_hours_employee.show()


+----------+--------------+
|EmployeeID|TotalWorkHours|
+----------+--------------+
|      E101|            17|
|      E102|            15|
|      E104|             6|
|      E103|             5|
+----------+--------------+



 5. Calculate average work hours per department.

In [12]:
from pyspark.sql.functions import avg, round
employee_custom_df.groupBy('Department') \
                  .agg(round(avg('WorkHours'),2).alias('AverageWorkHours')) \
                  .show()

+----------+----------------+
|Department|AverageWorkHours|
+----------+----------------+
|        HR|             7.5|
|   Finance|             5.0|
|        IT|            7.67|
+----------+----------------+



 6. Get top 2 employees by total hours using window function.


In [13]:
from pyspark.sql.functions import row_number, desc, col
from pyspark.sql.window import Window

window_spec = Window.orderBy(desc('TotalWorkHours'))

total_hours_employee.withColumn('Rank', row_number().over(window_spec)) \
                                      .filter(col('Rank') <= 2) \
                                      .show()

+----------+--------------+----+
|EmployeeID|TotalWorkHours|Rank|
+----------+--------------+----+
|      E101|            17|   1|
|      E102|            15|   2|
+----------+--------------+----+



#### Date Operations

7. Filter entries where WorkDate falls on a weekend

In [14]:
from pyspark.sql.functions import dayofweek
employee_custom_df.filter(dayofweek(col('WorkDate')).isin(1, 7))   \
                  .select(
                            'EmployeeID',
                            'Name',
                            'WorkDate',
                            'WeekDay'
                          ).show()

+----------+----+----------+--------+
|EmployeeID|Name|  WorkDate| WeekDay|
+----------+----+----------+--------+
|      E102| Raj|2024-05-04|Saturday|
+----------+----+----------+--------+



8. Calculate running total of hours per employee using window.


In [15]:
from pyspark.sql.functions import sum
window_emp = Window.partitionBy('EmployeeID').orderBy('WorkDate') \
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow)


employee_custom_df.withColumn('RunningTotal', sum('WorkHours').over(window_emp)) \
                  .select(
                            'EmployeeID',
                            'Name',
                            'WorkDate',
                            'WorkHours',
                            'RunningTotal') \
                  .show()

+----------+-----+----------+---------+------------+
|EmployeeID| Name|  WorkDate|WorkHours|RunningTotal|
+----------+-----+----------+---------+------------+
|      E101|Anita|2024-05-01|        8|           8|
|      E101|Anita|2024-05-03|        9|          17|
|      E102|  Raj|2024-05-01|        7|           7|
|      E102|  Raj|2024-05-04|        8|          15|
|      E103| John|2024-05-02|        5|           5|
|      E104|Meena|2024-05-03|        6|           6|
+----------+-----+----------+---------+------------+



####  Joining DataFrames


9. Create department_location.csv

In [16]:
dept_data = [
              ('IT', 'Anand'),
              ('HR', 'Shruti'),
              ('Finance', 'Kamal')
          ]

dept_columns = ['Department', 'DeptHead']
dept_df = spark.createDataFrame(dept_data, dept_columns)
dept_df.show()

+----------+--------+
|Department|DeptHead|
+----------+--------+
|        IT|   Anand|
|        HR|  Shruti|
|   Finance|   Kamal|
+----------+--------+



10. Join with timesheet data and list all employees with their DeptHead.


In [24]:
from pyspark.sql.functions import first

employee_with_head = employee_custom_df.join(dept_df, on='Department', how='left')

employee_with_head.groupBy('EmployeeID').agg(
                                              first('Name').alias('Name'),
                                              first('Department').alias('Department'),
                                              first('DeptHead').alias('DeptHead')
                                             ).show()

+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E104|Meena|        IT|   Anand|
+----------+-----+----------+--------+



####  Pivot & Unpivot

11. Pivot table: total hours per employee per project.


In [27]:
employee_custom_df.groupBy('EmployeeID') \
                  .pivot('Project') \
                  .agg(sum('WorkHours')) \
                  .show()


+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



12. Unpivot example: Convert mode-specific hours into rows.


In [30]:
from pyspark.sql.functions import explode, array, struct, lit, coalesce

mode_hours_df = employee_custom_df.groupBy('EmployeeID', 'Mode') \
                                  .agg(sum('WorkHours') \
                                  .alias('ModeHours'))

mode_pivot_df = mode_hours_df.groupBy('EmployeeID') \
                             .pivot('Mode') \
                             .sum('ModeHours')

unpivot_expr = explode(array(
                              struct(lit('Onsite').alias('Mode'), col('Onsite').alias('ModeHours')),
                              struct(lit('Remote').alias('Mode'), col('Remote').alias('ModeHours'))
))

mode_pivot_df.select('EmployeeID', unpivot_expr.alias('kv')) \
             .select('EmployeeID',
                     col('kv.Mode'),
                     coalesce(col('kv.ModeHours'), lit(0)).alias('ModeHours')) \
             .show()



+----------+------+---------+
|EmployeeID|  Mode|ModeHours|
+----------+------+---------+
|      E103|Onsite|        0|
|      E103|Remote|        5|
|      E104|Onsite|        6|
|      E104|Remote|        0|
|      E101|Onsite|        0|
|      E101|Remote|       17|
|      E102|Onsite|        7|
|      E102|Remote|        8|
+----------+------+---------+



####  UDF & Conditional Logic

13. Create a UDF to classify work hours:

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def workload_tag(hours):
    if hours >= 8:
        return "Full"
    elif hours >= 4:
        return "Partial"
    else:
        return "Light"

workload_udf = udf(workload_tag, StringType())


+----------+-----+----------+---------+----------------+
|EmployeeID| Name|  WorkDate|WorkHours|WorkloadCategory|
+----------+-----+----------+---------+----------------+
|      E102|  Raj|2024-05-01|        7|         Partial|
|      E103| John|2024-05-02|        5|         Partial|
|      E101|Anita|2024-05-01|        8|            Full|
|      E102|  Raj|2024-05-04|        8|            Full|
|      E101|Anita|2024-05-03|        9|            Full|
|      E104|Meena|2024-05-03|        6|         Partial|
+----------+-----+----------+---------+----------------+



14. Add a column
WorkloadCategory using this UDF.

In [32]:
employee_custom_df = employee_custom_df.withColumn("WorkloadCategory", workload_udf(col("WorkHours")))

employee_custom_df.select(
                            'EmployeeID',
                            'Name',
                            'WorkDate',
                            'WorkHours',
                            'WorkloadCategory'
).show()

+----------+-----+----------+---------+----------------+
|EmployeeID| Name|  WorkDate|WorkHours|WorkloadCategory|
+----------+-----+----------+---------+----------------+
|      E102|  Raj|2024-05-01|        7|         Partial|
|      E103| John|2024-05-02|        5|         Partial|
|      E101|Anita|2024-05-01|        8|            Full|
|      E102|  Raj|2024-05-04|        8|            Full|
|      E101|Anita|2024-05-03|        9|            Full|
|      E104|Meena|2024-05-03|        6|         Partial|
+----------+-----+----------+---------+----------------+



####  Nulls and Cleanup

15. Introduce some nulls in Mode column.


In [34]:
from pyspark.sql.functions import when
employee_custom_df = employee_custom_df.withColumn("Mode",
                          when(col("EmployeeID") == "E101", None).otherwise(col("Mode")))

employee_custom_df.select('EmployeeID', 'Mode').show()

+----------+------+
|EmployeeID|  Mode|
+----------+------+
|      E102|Onsite|
|      E103|Remote|
|      E101|  NULL|
|      E102|Remote|
|      E101|  NULL|
|      E104|Onsite|
+----------+------+



16. Fill nulls with "Not Provided".


In [36]:
employee_custom_df = employee_custom_df.fillna({"Mode": "Not Provided"})

employee_custom_df.select('EmployeeID', 'Mode').show()


+----------+------------+
|EmployeeID|        Mode|
+----------+------------+
|      E102|      Onsite|
|      E103|      Remote|
|      E101|Not Provided|
|      E102|      Remote|
|      E101|Not Provided|
|      E104|      Onsite|
+----------+------------+



17. Drop rows where WorkHours < 4.

In [37]:
employee_custom_df = employee_custom_df.filter(col("WorkHours") >= 4)

employee_custom_df.select('EmployeeID', 'WorkHours').show()

+----------+---------+
|EmployeeID|WorkHours|
+----------+---------+
|      E102|        7|
|      E103|        5|
|      E101|        8|
|      E102|        8|
|      E101|        9|
|      E104|        6|
+----------+---------+



####  Advanced Conditions

18. Use when-otherwise to mark employees as "Remote Worker" if >80% entries are Remote.

In [41]:
from pyspark.sql.functions import count, when

remote_ratio_df = employee_custom_df.groupBy('EmployeeID') \
                                    .agg(count("*").alias('TotalEntries'),
                                         count(when(col('Mode') == 'Remote', True)).alias('RemoteCount'))\
                                    .withColumn('RemoteRatio', col('RemoteCount') / col('TotalEntries')) \
                                    .withColumn('WorkType', when(col('RemoteRatio') > 0.8, 'Remote Worker') \
                                                            .otherwise('Hybrid/Office'))

remote_ratio_df.select(
                       'EmployeeID',
                       'RemoteRatio',
                       'WorkType'
                       ).show()

+----------+-----------+-------------+
|EmployeeID|RemoteRatio|     WorkType|
+----------+-----------+-------------+
|      E103|        1.0|Remote Worker|
|      E104|        0.0|Hybrid/Office|
|      E101|        0.0|Hybrid/Office|
|      E102|        0.5|Hybrid/Office|
+----------+-----------+-------------+



19. Add a new column
ExtraHours where hours > 8.

In [42]:
employee_custom_df = employee_custom_df.withColumn('ExtraHours',
                                                   when(col('WorkHours') > 8, col('WorkHours') - 8) \
                                                   .otherwise(0))

employee_custom_df.select(
                          'EmployeeID',
                          'WorkHours',
                          'ExtraHours'
                          ).show()

+----------+---------+----------+
|EmployeeID|WorkHours|ExtraHours|
+----------+---------+----------+
|      E102|        7|         0|
|      E103|        5|         0|
|      E101|        8|         0|
|      E102|        8|         0|
|      E101|        9|         1|
|      E104|        6|         0|
+----------+---------+----------+



#### Union + Duplicate Handling

20. Append a dummy timesheet for new interns using unionByName()


In [47]:
from pyspark.sql.functions import to_date, col

sample_data = [
                ("I001", "Intern1", "IT", "ProjX", 6, "2024-06-10", "Office", "Office"),
                ("I002", "Intern2", "HR", "ProjY", 5, "2024-06-11", "Remote", "Remote")
              ]

sample_columns = ['EmployeeID', 'Name', 'Department', 'Project', 'WorkHours', 'WorkDate', 'WorkMode', 'ActualWorkMode']

sample_df = spark.createDataFrame(sample_data, sample_columns) \
                 .withColumn("WorkDate", to_date(col("WorkDate")))

combined_df = employee_custom_df.unionByName(sample_df, allowMissingColumns=True) # Setting Nulls for missing columns

combined_df.select(
                    'EmployeeID',
                    'Name',
                    'Department',
                    'WorkHours',
                    'WorkDate'
                   ).show()


+----------+-------+----------+---------+----------+
|EmployeeID|   Name|Department|WorkHours|  WorkDate|
+----------+-------+----------+---------+----------+
|      E102|    Raj|        HR|        7|2024-05-01|
|      E103|   John|   Finance|        5|2024-05-02|
|      E101|  Anita|        IT|        8|2024-05-01|
|      E102|    Raj|        HR|        8|2024-05-04|
|      E101|  Anita|        IT|        9|2024-05-03|
|      E104|  Meena|        IT|        6|2024-05-03|
|      I001|Intern1|        IT|        6|2024-06-10|
|      I002|Intern2|        HR|        5|2024-06-11|
+----------+-------+----------+---------+----------+



21. Remove duplicate rows based on all columns.

In [52]:
combined_df = combined_df.unionAll(combined_df)
combined_df.show()

combined_df = combined_df.dropDuplicates()
combined_df.show()

+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+----------+--------+--------------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  WeekDay|WorkloadCategory|ExtraHours|WorkMode|ActualWorkMode|
+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+----------+--------+--------------+
|      E104|  Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|         Partial|         0|    NULL|          NULL|
|      E102|    Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|         0|    NULL|          NULL|
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|         0|    NULL|          NULL|
|      E101|  Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Not Provided|   Friday|            Full|         1|   

Saving to parquet file

In [53]:
combined_df.write.mode("overwrite") \
                 .parquet("/content/drive/MyDrive/Assignment/Transformed_Employee_Timesheet")