In [2]:
# =============================
# SETUP: Install Dependencies
# =============================

!pip install -q pyspark dask pandas

from pyspark.sql import SparkSession
import pandas as pd
import dask.dataframe as dd
from pyspark.sql.functions import col, when, year, month, current_date, datediff, to_date, explode, get_json_object, udf
from pyspark.sql.types import StringType
import json
import datetime

# =============================
# LOAD DATA
# =============================

# ---- Using Pandas ----
pandas_df = pd.read_csv('Sales_Dataset__500_Records_.csv')

# ---- Using PySpark ----
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()
spark_df = spark.read.option("header", "true").option("inferSchema", "true").csv('Sales_Dataset__500_Records_.csv')

# ---- Using Dask ----
dask_df = dd.read_csv('Sales_Dataset__500_Records_.csv')

# =============================
# TASK 1: DataFrame Creation and Inspection
# =============================

# ---- Pandas ----
print("Pandas Head:\n", pandas_df.head())
print("Pandas Tail:\n", pandas_df.tail())
print("Pandas Info:\n", pandas_df.dtypes)

# ---- PySpark ----
spark_df.show(5)
spark_df.printSchema()

# ---- Dask ----
print("Dask Head:\n", dask_df.head())
print("Dask Tail:\n", dask_df.tail())
print("Dask dtypes:\n", dask_df.dtypes)

# =============================
# TASK 2: Selection, Renaming, and Filtering
# =============================

# ---- Pandas ----
pandas_selected = pandas_df[['OrderID', 'CustomerName', 'Amount']].copy()
pandas_selected.rename(columns={'Amount': 'OrderAmount'}, inplace=True)
filtered_pandas = pandas_selected[pandas_selected['OrderAmount'] > 500]

# ---- PySpark ----
spark_selected = spark_df.select("OrderID", "CustomerName", "Amount").withColumnRenamed("Amount", "OrderAmount")
spark_filtered = spark_selected.filter(col("OrderAmount") > 500)

# ---- Dask ----
dask_selected = dask_df[['OrderID', 'CustomerName', 'Amount']].rename(columns={'Amount': 'OrderAmount'})
dask_filtered = dask_selected[dask_selected['OrderAmount'] > 500]

# =============================
# TASK 3: Data Manipulation
# =============================

# ---- Pandas ----
pandas_df.drop(columns=['CustomerSince'], inplace=True)
pandas_df['FinalAmount'] = pandas_df['Amount'] - (pandas_df['Amount'] * pandas_df['Discount'])
pandas_df.sort_values(by='FinalAmount', ascending=False, inplace=True)
pandas_df['DeliveryStatus'] = pandas_df['DeliveryStatus'].replace("Cancelled", "Order Cancelled")

# ---- PySpark ----
spark_df = spark_df.drop("CustomerSince")
spark_df = spark_df.withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))
spark_df = spark_df.orderBy(col("FinalAmount").desc())
spark_df = spark_df.withColumn("DeliveryStatus", when(col("DeliveryStatus") == "Cancelled", "Order Cancelled").otherwise(col("DeliveryStatus")))

# ---- Dask ----
dask_df = dask_df.drop("CustomerSince", axis=1)
dask_df['FinalAmount'] = dask_df['Amount'] - (dask_df['Amount'] * dask_df['Discount'])
dask_df = dask_df.sort_values("FinalAmount", ascending=False)
dask_df['DeliveryStatus'] = dask_df['DeliveryStatus'].replace("Cancelled", "Order Cancelled")

# =============================
# TASK 4: Aggregations and GroupBy
# =============================

# ---- Pandas ----
print(pandas_df['DeliveryStatus'].value_counts())
print(pandas_df.groupby('ProductCategory')['Amount'].mean())
print(pandas_df.groupby('City')['Amount'].sum())

# ---- PySpark ----
spark_df.groupBy("DeliveryStatus").count().show()
spark_df.groupBy("ProductCategory").avg("Amount").show()
spark_df.groupBy("City").sum("Amount").show()

# ---- Dask ----
print(dask_df['DeliveryStatus'].value_counts().compute())
print(dask_df.groupby('ProductCategory')['Amount'].mean().compute())
print(dask_df.groupby('City')['Amount'].sum().compute())

# =============================
# TASK 5: Null Handling & Update
# =============================

# ---- Pandas ----
pandas_df.loc[::50, 'City'] = None
pandas_df.fillna({'City': 'Unknown'}, inplace=True)

# ---- PySpark ----
from pyspark.sql.functions import lit
spark_df = spark_df.withColumn("City", when((col("City") == "") | col("City").isNull(), lit(None)).otherwise(col("City")))
spark_df = spark_df.fillna({'City': 'Unknown'})
spark_df = spark_df.withColumn("CustomerTag", when(col("Amount") > 800, "High-Value").otherwise("Regular"))

# ---- Dask ----
dask_df.loc[::50, 'City'] = None
dask_df = dask_df.fillna({'City': 'Unknown'})

# =============================
# TASK 6: Date & Time Functions
# =============================

# ---- Pandas ----
pandas_df['OrderDate'] = pd.to_datetime(pandas_df['OrderDate'])
pandas_df['Year'] = pandas_df['OrderDate'].dt.year
pandas_df['Month'] = pandas_df['OrderDate'].dt.month

# ---- PySpark ----
spark_df = spark_df.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
spark_df = spark_df.withColumn("Year", year(col("OrderDate")))
spark_df = spark_df.withColumn("Month", month(col("OrderDate")))

# ---- Dask ----
dask_df['OrderDate'] = dd.to_datetime(dask_df['OrderDate'])
dask_df['Year'] = dask_df['OrderDate'].dt.year
dask_df['Month'] = dask_df['OrderDate'].dt.month

# =============================
# TASK 7: Joins and Unions
# =============================

# ---- Setup a simple mapping DataFrame ----
city_region_data = pd.DataFrame({
    "City": pandas_df['City'].unique(),
    "Region": ["North", "South", "East", "West"] * 20  # random repeating
})
region_df_spark = spark.createDataFrame(city_region_data)
region_df_dask = dd.from_pandas(city_region_data, npartitions=1)

# ---- Joins ----
spark_df.join(region_df_spark, on="City", how="inner").show()
spark_df.join(region_df_spark, on="City", how="left").show()

# ---- Union ----
df_2023 = spark_df.filter(year("OrderDate") == 2023)
df_2024 = spark_df.filter(year("OrderDate") == 2024)
df_union = df_2023.union(df_2024)
df_union.show()

# =============================
# TASK 8: Complex JSON Simulation (Advanced)
# =============================

# ---- Pandas ----
json_series = pandas_df.apply(lambda row: row.to_json(), axis=1)
json_df = pd.read_json(json_series.to_json(), typ='series')

# ---- PySpark ----
spark_df_json = spark_df.withColumn("json", col("OrderID").cast(StringType()))
# Normally you'd use `to_json(struct(...))` and explode a nested array

# =============================
# TASK 9: Applying Functions
# =============================

# ---- Pandas ----
def tag_order(amount):
    if amount > 800:
        return "Big"
    elif amount > 400:
        return "Medium"
    else:
        return "Small"

pandas_df['OrderTag'] = pandas_df['Amount'].apply(tag_order)

# ---- PySpark ----
@udf(returnType=StringType())
def tag_udf(amount):
    if amount > 800:
        return "Big"
    elif amount > 400:
        return "Medium"
    else:
        return "Small"

spark_df = spark_df.withColumn("OrderTag", tag_udf(col("Amount")))

# ---- Dask ----
dask_df['OrderTag'] = dask_df['Amount'].map(tag_order)


Pandas Head:
    OrderID    CustomerName ProductCategory  Amount   OrderDate DeliveryStatus  \
0     2824   Donald Walker           Books  783.04  2024-12-26       Returned   
1     7912    Brandon Hall       Groceries  905.00  2024-09-12      Cancelled   
2     4611    Donald Booth         Fashion  657.96  2025-01-12       Returned   
3     3547  Phillip Garcia         Fashion  606.89  2024-03-24       Returned   
4     8527    Valerie Gray            Toys   77.87  2024-08-04      Delivered   

   Discount              City  PaymentMode CustomerSince  
0      0.15      Lake Joyside  Credit Card    2020-10-15  
1      0.03     New Jamesside       Wallet    2022-03-15  
2      0.01      Lake Roberto       Wallet    2021-08-07  
3      0.15  West Melanieview       Wallet    2020-08-08  
4      0.17         Mariastad         Cash    2022-11-15  
Pandas Tail:
      OrderID     CustomerName ProductCategory  Amount   OrderDate  \
495     2930     Jaime Harris         Fashion  680.00  2025-02

TypeError: 'LocIndexer' object does not support item assignment