#Step 1: Sample Dataset (create CSV)

In [31]:
data = """OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000"""

# Persist the inline sample (header + 5 orders) as "superstore.csv" in the
# working directory; the pandas, PySpark and Dask cells all read this file.
with open("superstore.csv", mode="w") as csv_file:
    csv_file.write(data)


#PART 1: Pandas DataFrame Operations

In [32]:
# 1. Load the CSV using pandas.
import pandas as pd

# No dtype/parse_dates arguments are given, so column types are whatever
# pandas infers from the file.
df = pd.read_csv(filepath_or_buffer='superstore.csv')
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  


In [33]:
# 2. Print schema (column dtypes), head, and shape.
# Each label ends with ":" followed by a newline so the three sections are
# formatted the same way (the Shape label previously had the colon after the
# newline).
print("Schema:\n", df.dtypes)
print("\nHead:\n", df.head())
print("\nShape:\n", df.shape)


Schema:
 OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
SubCategory     object
Quantity         int64
UnitPrice        int64
Discount       float64
Profit           int64
dtype: object

Head:
    OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        F

In [34]:
# 3. Select the Customer, Product and Profit columns.
# .loc with a full row slice and a column list returns the same three-column
# frame as double-bracket selection.
print(df.loc[:, ['Customer', 'Product', 'Profit']])

  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


In [35]:
# 4. Filter orders where Profit > 2000 and Discount = 0.
# Both conditions are element-wise boolean Series combined with "&".
print(df.loc[df['Profit'].gt(2000) & df['Discount'].eq(0)])

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [36]:
# 5. Sort by Profit descending.
# sort_values returns a new frame; `df` itself keeps its original row order.
profit_desc = df.sort_values('Profit', ascending=False)
print(profit_desc.loc[:, ['OrderID', 'Profit']])

   OrderID  Profit
0  CA-1001    5000
4  CA-1005    3000
1  CA-1002    1800
2  CA-1003     150
3  CA-1004   -1500


In [37]:
# 6. GroupBy Category -> Total Profit, Avg Discount.
# Named aggregation: each keyword is the output column, mapped to
# (source column, aggregation). The output keeps the names Profit/Discount.
print(
    df.groupby('Category').agg(
        Profit=('Profit', 'sum'),
        Discount=('Discount', 'mean'),
    )
)

                 Profit  Discount
Category                         
Furniture         -1500  0.200000
Office Supplies     150  0.050000
Technology         9800  0.083333


In [38]:
# 7. Add a column TotalPrice = Quantity * UnitPrice.
# Element-wise product of the two columns; the discount is not applied here.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  \
0       Technology    Computers         1      55000      0.10    5000   
1       Technology  Peripherals         2      12000      0.15    1800   
2  Office Supplies        Paper         3        200      0.05     150   
3        Furniture       Tables         1      18000      0.20   -1500   
4       Technology       Phones         2      20000      0.00    3000   

   TotalPrice  
0       55000  
1       24000  
2         600  
3       18000  
4       40000  


In [39]:
# 8. Drop the SubCategory column.
# This cell rebinds `df`, so executing it a second time would find the column
# already gone and raise KeyError. errors='ignore' makes the drop a no-op in
# that case, keeping the cell safe to re-run.
df = df.drop(columns=['SubCategory'], errors='ignore')
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


In [40]:
# 9. Fill nulls in Discount with 0.10.
# Every row of the sample CSV has a Discount value, so the printed frame is
# unchanged; the step only matters for data with missing discounts.
df['Discount'] = df['Discount'].fillna(value=0.10)
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


In [41]:
# 10. Apply a function to categorize orders:
def classify(row):
    """Label an order by its profit.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing a 'Profit' key.

    Returns:
        'High' when Profit is above 4000, 'Medium' when it is above 0 (up to
        and including 4000), and 'Low' for everything else (zero or negative).
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    return 'Medium' if profit > 0 else 'Low'

# Row-wise apply: classify() is called once per row and its return value
# becomes that row's ProfitCategory.
df['ProfitCategory'] = df.apply(classify, axis='columns')
print("\nProfit Categories:\n", df[['OrderID', 'Profit', 'ProfitCategory']])


Profit Categories:
    OrderID  Profit ProfitCategory
0  CA-1001    5000           High
1  CA-1002    1800         Medium
2  CA-1003     150         Medium
3  CA-1004   -1500            Low
4  CA-1005    3000         Medium


#PART 2: PySpark DataFrame Operations

In [42]:
# 1. Load the same CSV using PySpark.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("RetailSalesSuperstore")
    .getOrCreate()
)

# NOTE: this rebinds `df`; from here on it is a Spark DataFrame, not the
# pandas frame from Part 1. The first CSV line is used as the header and
# column types are inferred from the data.
df = spark.read.csv("superstore.csv", header=True, inferSchema=True)
df.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|     Phones|       2|    20000|     0.0|  3000|
+-------+----------+--------+---

In [43]:
# 2. Show the schema and the first 5 rows.
df.printSchema()
df.show(n=5)

root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

In [44]:
# 3. Select columns, renaming Customer -> Client.
# `col` imported here is also what the later PySpark cells rely on.
from pyspark.sql.functions import col

df.select(
    col("Customer").alias("Client"),
    col("Product"),
    col("Profit"),
).show()

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



In [45]:
# 4. Filter Segment = 'Consumer' and Profit < 1000.
# Two chained where() calls keep only rows satisfying both conditions.
(
    df.where(col("Segment") == "Consumer")
    .where(col("Profit") < 1000)
    .show()
)

+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [46]:
# 5. GroupBy Region and show average profit.
# The dict form of agg() maps a column to an aggregate name; the result column
# is labelled avg(Profit).
df.groupBy("Region").agg({"Profit": "avg"}).show()

+------+-----------+
|Region|avg(Profit)|
+------+-----------+
| South|     4000.0|
|  East|      150.0|
|  West|    -1500.0|
| North|     1800.0|
+------+-----------+



In [47]:
# 6. Use withColumn to create TotalPrice = Quantity * UnitPrice.
# withColumn returns a new DataFrame, so `df` is rebound to include the column.
df = df.withColumn("TotalPrice", df["Quantity"] * df["UnitPrice"])
df.select("OrderID", "TotalPrice").show()

+-------+----------+
|OrderID|TotalPrice|
+-------+----------+
|CA-1001|     55000|
|CA-1002|     24000|
|CA-1003|       600|
|CA-1004|     18000|
|CA-1005|     40000|
+-------+----------+



In [48]:
# 7. Use when().otherwise() to classify Profit as:
#      Profit > 2000  -> 'High'
#      Profit <= 0    -> 'Loss'
#      anything else  -> 'Medium'
# Conditions are tested in order; the first match wins. These thresholds and
# labels differ from the pandas classify() above (which uses 4000 and 'Low').
from pyspark.sql.functions import when

df = df.withColumn(
    "ProfitCategory",
    when(col("Profit") > 2000, "High")
    .when(col("Profit") <= 0, "Loss")
    .otherwise("Medium"),
)
df.select("OrderID", "Profit", "ProfitCategory").show()

+-------+------+--------------+
|OrderID|Profit|ProfitCategory|
+-------+------+--------------+
|CA-1001|  5000|          High|
|CA-1002|  1800|        Medium|
|CA-1003|   150|        Medium|
|CA-1004| -1500|          Loss|
|CA-1005|  3000|          High|
+-------+------+--------------+



In [49]:
# 8. Use drop() to remove SubCategory.
# drop() returns a new DataFrame without the column; `df` is rebound to it.
df = df.drop("SubCategory")
df.show()

+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+--------------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitCategory|
+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+--------------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|       1|    55000|     0.1|  5000|     55000|          High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|       2|    12000|    0.15|  1800|     24000|        Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|       3|      200|    0.05|   150|       600|        Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|       1|    18000|     0.2| -1500|     18000|          Loss|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phon

In [50]:
# 9. Handle nulls in Discount by filling them with 0.10.
# subset restricts the fill to the Discount column. The sample data has no
# missing discounts, so the displayed rows are unchanged.
df = df.na.fill(0.10, subset=["Discount"])
df.show()

+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+--------------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitCategory|
+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+--------------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|       1|    55000|     0.1|  5000|     55000|          High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|       2|    12000|    0.15|  1800|     24000|        Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|       3|      200|    0.05|   150|       600|        Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|       1|    18000|     0.2| -1500|     18000|          Loss|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phon

In [51]:
# 10. Convert OrderDate to date type and extract year, month.
from pyspark.sql.functions import month, to_date, year

# The printSchema() output earlier already reports OrderDate as `date`
# (inferSchema), so to_date with the explicit pattern is kept mainly to make
# the conversion explicit.
df = (
    df.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
    .withColumn("Year", year("OrderDate"))
    .withColumn("Month", month("OrderDate"))
)
df.select("OrderID", "OrderDate", "Year", "Month").show()

+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



#PART 3: Dask DataFrame Operations (Pandas Alternative)

In [52]:
import dask.dataframe as dd

# NOTE: `df` is rebound once more, now to a Dask DataFrame. Printing it shows
# only the structure (columns, dtypes, partition count), not the rows; values
# are materialised by the .compute() calls in the cells below.
df = dd.read_csv('superstore.csv')
print(df)

Dask DataFrame Structure:
              OrderID OrderDate Customer Segment  Region Product Category SubCategory Quantity UnitPrice Discount Profit
npartitions=1                                                                                                           
               string    string   string  string  string  string   string      string    int64     int64  float64  int64
                  ...       ...      ...     ...     ...     ...      ...         ...      ...       ...      ...    ...
Dask Name: to_string_dtype, 2 expressions
Expr=ArrowStringConversion(frame=FromMapProjectable(f3fecaf))


In [53]:
# Compute average discount by category.
# .compute() turns the Dask result into a concrete pandas Series.
avg_discount = (
    df.groupby("Category")["Discount"]
    .mean()
    .compute()
)
print(avg_discount)

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


In [54]:
# Filter orders with more than 1 quantity and high profit (Profit > 2000).
# `filtered` stays a Dask frame; `filtered_result` is its computed copy.
filtered = df[
    (df["Quantity"] > 1)
    & (df["Profit"] > 2000)
]
filtered_result = filtered.compute()
print(filtered_result)

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [55]:
# Save filtered data to new CSV.
# `filtered` is the uncomputed Dask frame (not `filtered_result`). The "*" in
# the file name is filled in per partition; the recorded output shows a single
# file ending in _0.csv for this one-partition frame. index=False omits the
# row index from the file. The call is the cell's last expression, so the
# list of written paths is displayed as the cell output.
filtered.to_csv('filtered_high_profit_orders_*.csv', index=False)

['/content/filtered_high_profit_orders_0.csv']

#PART 4: JSON Handling (Complex Nested)

In [56]:
# 1. Create a nested JSON file.
# Each order nests two objects: Customer (Name, Segment) and Details
# (Region, Profit).
json_data = """[
  {
    "OrderID": "CA-1001",
    "Customer": { "Name": "Ravi", "Segment": "Consumer" },
    "Details": { "Region": "South", "Profit": 5000 }
  },
  {
    "OrderID": "CA-1002",
    "Customer": { "Name": "Priya", "Segment": "Corporate" },
    "Details": { "Region": "North", "Profit": 1800 }
  }
]"""

# Write the literal verbatim to orders.json in the working directory.
with open('orders.json', mode='w') as json_file:
    json_file.write(json_data)



In [57]:
# 2. Load it using PySpark.
# multiLine is enabled because orders.json is a single JSON array spread over
# many lines rather than one JSON record per line.
df_json = spark.read.json("orders.json", multiLine=True)
df_json.show()

+------------------+-------------+-------+
|          Customer|      Details|OrderID|
+------------------+-------------+-------+
|  {Ravi, Consumer}|{5000, South}|CA-1001|
|{Priya, Corporate}|{1800, North}|CA-1002|
+------------------+-------------+-------+



In [58]:
# Print the schema Spark inferred for the JSON records; the nested Customer
# and Details objects appear as struct columns in the output.
df_json.printSchema()

root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)



In [59]:
# Flatten the nested records: dotted paths pull Name out of the Customer struct
# and Profit out of the Details struct, next to the top-level OrderID.
df_json.select(
    "OrderID",
    "Customer.Name",
    "Details.Profit",
).show()

+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

