In [1]:
## PYSPARK INTERVIEW QUESTIONS - ANSH LAMBA

In [2]:
import findspark
findspark.init()

In [3]:
import os
import sys

# Replace this with your actual Python path
python_path = sys.executable  # This is safe — it auto-detects your current Python path

os.environ["PYSPARK_PYTHON"] = python_path
os.environ["PYSPARK_DRIVER_PYTHON"] = python_path

In [4]:
from pyspark.sql.functions import * 
from pyspark.sql.types import *

from pyspark import SparkContext, SparkConf 
from pyspark.conf import SparkConf 
from pyspark.sql import SparkSession, HiveContext,DataFrame
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StringType, StructField, StringType,LongType,DecimalType,DateType,TimestampType, IntegerType,DoubleType

In [5]:
## SparkSession
spark = SparkSession.builder \
                        .appName('example-pyspark-read-and-write-from-hive') \
                        .master("local[*]") \
                        .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,com.crealytics:spark-excel_2.12:3.3.3_0.20.3") \
                        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                        .config("spark.driver.memory", "4g") \
                        .config("spark.executor.memory", "2g") \
                        .config("spark.sql.execution.arrow.pyspark.enabled", "false")\
                        .enableHiveSupport() \
                        .getOrCreate()



In [None]:
### While analyzing customer reviews, you need to identify the most frequently used words in the feedback. How would you implement this?

In [11]:
data = [("customer1", "The product is great"), ("customer2", "Great product, fast delivery"), ("customer3", "Not bad, could be better")]
columns = ["customer_id", "feedback"]

df = spark.createDataFrame(data, columns)

In [12]:
df.show(5,False)

+-----------+----------------------------+
|customer_id|feedback                    |
+-----------+----------------------------+
|customer1  |The product is great        |
|customer2  |Great product, fast delivery|
|customer3  |Not bad, could be better    |
+-----------+----------------------------+



In [13]:
# df = df.withColumn('feedback', split(f.col('feedback'),' '))
# df.show(5,False)

In [18]:
df = df.withColumn('feedback',f.explode(f.split('feedback',' ')))\
        .withColumn('feedback',lower('feedback'))\
        .groupBy('feedback').agg(count('feedback').alias('wordCount'))

In [19]:
df.show()

+--------+---------+
|feedback|wordCount|
+--------+---------+
|   great|        2|
|      is|        1|
|     the|        1|
| product|        1|
|    fast|        1|
|delivery|        1|
|product,|        1|
|   could|        1|
|     not|        1|
|      be|        1|
|    bad,|        1|
|  better|        1|
+--------+---------+



In [None]:
## 9. You need to calculate the cumulative sum of sales over time for each product. How would you approach this?

In [48]:
data = [("product1", "2023-12-01", 100), ("product2", "2023-12-02", 200),
        ("product1", "2023-12-03", 150), ("product2", "2023-12-04", 250)]
columns = ["product_id", "date", "sales"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+----------+-----+
|product_id|      date|sales|
+----------+----------+-----+
|  product1|2023-12-01|  100|
|  product2|2023-12-02|  200|
|  product1|2023-12-03|  150|
|  product2|2023-12-04|  250|
+----------+----------+-----+



In [49]:
## cast date column from string to Datestamp:

df = df.withColumn("date", f.col('date').cast(DateType()))
## applying window function with Sum as aggregation:::


df = df.withColumn("CumalativeSum", sum("sales").over(Window.partitionBy('product_id').orderBy(f.col("date")))) \
        .orderBy(f.col("date"), ascending=True)

#.orderBy(f.col("totalActions"),ascending=False)

In [50]:
df.show()

+----------+----------+-----+-------------+
|product_id|      date|sales|CumalativeSum|
+----------+----------+-----+-------------+
|  product1|2023-12-01|  100|          100|
|  product2|2023-12-02|  200|          200|
|  product1|2023-12-03|  150|          250|
|  product2|2023-12-04|  250|          450|
+----------+----------+-----+-------------+



In [None]:
####################
### 10. While preparing a data pipeline, you notice some duplicate rows in a dataset. How would you remove the duplicates without affecting 
## the original order?

In [60]:
data = [("John", 25), ("Jane", 30), ("John", 25), ("Alice", 22)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+-----+---+
| name|age|
+-----+---+
| John| 25|
| Jane| 30|
| John| 25|
|Alice| 22|
+-----+---+



In [61]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [62]:
## Resolving it with rownumber() window function, when we are getting 2 just filter it out

df = df.withColumn('Flag', row_number().over(Window.partitionBy("name").orderBy(f.col("age").asc()))) \
        .filter(f.col("flag") == 1)\
        .select("name", "age")
        
df.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 22|
| Jane| 30|
| John| 25|
+-----+---+



In [None]:
## 11. You are working with user activity data and need to calculate the average session duration per user. How would you implement this?

In [63]:
data = [("user1", "2023-12-01", 50), ("user1", "2023-12-02", 60), 
        ("user2", "2023-12-01", 45), ("user2", "2023-12-03", 75)]
columns = ["user_id", "session_date", "duration"]
df = spark.createDataFrame(data, columns)

df.show()

+-------+------------+--------+
|user_id|session_date|duration|
+-------+------------+--------+
|  user1|  2023-12-01|      50|
|  user1|  2023-12-02|      60|
|  user2|  2023-12-01|      45|
|  user2|  2023-12-03|      75|
+-------+------------+--------+



In [64]:
## avg:::

df=df.groupBy('user_id').agg(avg('duration').alias("avgDuaration"))
df.show()

+-------+------------+
|user_id|avgDuaration|
+-------+------------+
|  user1|        55.0|
|  user2|        60.0|
+-------+------------+



In [None]:
### 12. While analyzing sales data, you need to find the product with the highest sales for each month. How would you accomplish this?

In [96]:
data = [("product1", "2023-12-01", 100), ("product2", "2023-12-01", 150), 
        ("product1", "2023-12-02", 200), ("product2", "2023-12-02", 250)]
columns = ["product_id", "date", "sales"]
df = spark.createDataFrame(data, columns)
#df.show()

In [71]:
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- sales: long (nullable = true)



In [97]:
df = df.withColumn("date", f.col("date").cast(DateType()))\
        .withColumn("month",month("date").alias('month'))

## Best Product FOr each Month --> anserw will be product2
## select product_id, month, sum(sales) from table group by product_id, month order by sum(sales) desc;
## ok great, this will be working like that exactly, then we also need to add window function (any ranking function)


In [73]:
df.show()

+----------+----------+-----+-----+
|product_id|      date|sales|month|
+----------+----------+-----+-----+
|  product1|2023-12-01|  100|   12|
|  product2|2023-12-01|  150|   12|
|  product1|2023-12-02|  200|   12|
|  product2|2023-12-02|  250|   12|
+----------+----------+-----+-----+



In [98]:
df = df.groupBy(f.col("product_id"),f.col("month")).agg(sum('sales').alias("sumSales"))\
        .orderBy(f.col('sumSales'), ascending = False)

In [99]:
df.show()

+----------+-----+--------+
|product_id|month|sumSales|
+----------+-----+--------+
|  product2|   12|     400|
|  product1|   12|     300|
+----------+-----+--------+



In [107]:
## Now we will be aplying window function here Desnse rank   ###.desc with windowFunction --->>> alwaysss .....

df = df.withColumn("ranking",dense_rank().over(Window.partitionBy('month').orderBy(f.col("sumSales").desc())))\
        .filter(f.col("ranking") == 1)\
        .select("product_id","month","sumSales")
    

In [108]:
df.show()

+----------+-----+--------+
|product_id|month|sumSales|
+----------+-----+--------+
|  product2|   12|     400|
+----------+-----+--------+



In [None]:
### 13. You are working with a large Delta table that is frequently updated by multiple users. The data is stored in partitions, 
### and sometimes updates can cause inconsistent reads due to concurrent transactions. How would you ensure ACID compliance and 
### avoid data corruption in PySpark?

In [None]:
## ACID properties can be assured using delta log, which is being created predefined when you create a delta table
### For Data corruption we will be using following upserting syntax

In [109]:
## Reading new data which is in parqut format
df = spark.read.format('parquet').load('path')

from delta.tables import DeltaTable

## Reading data from delta table::

deltaTbl = DeltaTable.forPath(spark, "/path/to/delta/table")

deltaTbl.alias('trg').merge(df.alias('src'),"src.id == trg.id")\
                    .whenNotMatchedInsertAll()\
                    .whenMatchedUpdateAll()\
                    .execute()


In [None]:
## 14. You need to process a large dataset stored in PARQUET format and ensure that all columns have the right schema (Almost). How would you do this?

In [None]:
df = spark.read.format('parquet')\
                .opion("inferSchema", True)\  ## this will infer schema and you not need to get it manually
                .load("path")

In [None]:
### 15. You are reading a CSV file and need to handle corrupt records gracefully by skipping them. How would you configure this in PySpark?

df = spark.read.format('csv')\
                .opion("mode", "DROPMALFORMED")\  ## this will drop all malformed records
                .load("path")