In [4]:
import pandas as pd

PART 1: Pandas DataFrame Operations


1. Load the CSV using pandas .
2. Print schema, head, shape, dtypes.
3. Select Customer , Product , Profit columns.
4. Filter orders where Profit > 2000 and Discount = 0 .
5. Sort by Profit descending.
6. GroupBy Category → Total Profit, Avg Discount.
7. Add a column TotalPrice = Quantity * UnitPrice .
8. Drop the SubCategory column.
9. Fill nulls in Discount with 0.10.
10. Apply a function to categorize orders:

In [5]:
#1
df = pd.read_csv("superstore.csv")

In [6]:
# Step 2: Schema, head, shape, dtypes
print("Schema:")
print(df.info())
print("\nHead:")
print(df.head())
print("\nShape:", df.shape)
print("\nData Types:\n", df.dtypes)

Schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   OrderID      5 non-null      object 
 1   OrderDate    5 non-null      object 
 2   Customer     5 non-null      object 
 3   Segment      5 non-null      object 
 4   Region       5 non-null      object 
 5   Product      5 non-null      object 
 6   Category     5 non-null      object 
 7   SubCategory  5 non-null      object 
 8   Quantity     5 non-null      int64  
 9   UnitPrice    5 non-null      int64  
 10  Discount     5 non-null      float64
 11  Profit       5 non-null      int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 612.0+ bytes
None

Head:
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Am

In [7]:
# Step 3: Select Customer, Product, Profit columns
print("\nSelected Columns:")
print(df[["Customer", "Product", "Profit"]])


Selected Columns:
  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


In [8]:
# Step 4: Filter Profit > 2000 and Discount = 0
filtered_df = df[(df["Profit"] > 2000) & (df["Discount"] == 0.0)]
print("\nFiltered Orders (Profit > 2000 & Discount = 0):")
print(filtered_df)


Filtered Orders (Profit > 2000 & Discount = 0):
   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [9]:
# Step 5: Sort by Profit descending
sorted_df = df.sort_values(by="Profit", ascending=False)
print("\nSorted by Profit Descending:")
print(sorted_df)


Sorted by Profit Descending:
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


In [12]:
# Step 6: GroupBy Category → Total Profit, Avg Discount
grouped = df.groupby("Category").agg({
    "Profit": "sum",
    "Discount": "mean"
}).reset_index()
print("\nGrouped by Category (Total Profit & Avg Discount):")
print(grouped)


Grouped by Category (Total Profit & Avg Discount):
          Category  Profit  Discount
0        Furniture   -1500  0.200000
1  Office Supplies     150  0.050000
2       Technology    9800  0.083333


In [13]:
# Step 7: Add TotalPrice = Quantity * UnitPrice
df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]
print("\nWith TotalPrice Column:")
print(df[["Product", "Quantity", "UnitPrice", "TotalPrice"]])


With TotalPrice Column:
    Product  Quantity  UnitPrice  TotalPrice
0    Laptop         1      55000       55000
1   Printer         2      12000       24000
2  Notebook         3        200         600
3     Table         1      18000       18000
4     Phone         2      20000       40000


In [14]:
# Step 8: Drop SubCategory column
df = df.drop(columns=["SubCategory"])
print("\nAfter Dropping 'SubCategory':")
print(df.columns)


After Dropping 'SubCategory':
Index(['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product',
       'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit',
       'TotalPrice'],
      dtype='object')


In [15]:
# Step 9: Fill nulls in Discount with 0.10
df["Discount"] = df["Discount"].fillna(0.10)

In [16]:
# Step 10: Apply classification function
def classify(row):
    if row['Profit'] > 4000:
        return 'High'
    elif row['Profit'] > 0:
        return 'Medium'
    else:
        return 'Low'

df["ProfitCategory"] = df.apply(classify, axis=1)
print("\nWith ProfitCategory:")
print(df[["Product", "Profit", "ProfitCategory"]])


With ProfitCategory:
    Product  Profit ProfitCategory
0    Laptop    5000           High
1   Printer    1800         Medium
2  Notebook     150         Medium
3     Table   -1500            Low
4     Phone    3000         Medium


PART 2: PySpark DataFrame Operations

In [17]:
pip install pyspark



In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start Spark session
spark = SparkSession.builder.appName("SuperstoreAnalysis").getOrCreate()


1. Load the same CSV using PySpark.
2. Show schema and first 5 rows.
3. Select columns, Rename Customer → Client .
4. Filter Segment = 'Consumer' and Profit < 1000 .
5. GroupBy Region and show average profit.
6. Use withColumn to create TotalPrice = Quantity * UnitPrice .
7. Use when().otherwise() to classify Profit as:
'Profit' > 2000 → 'High'
'Profit' <= 0 → 'Loss'
else 'Medium'
8. Use drop() to remove SubCategory .
9. Handle nulls in Discount using fillna(0.10) .
10. Convert OrderDate to date type and extract year , month .

In [19]:
#1
df_spark = spark.read.csv("superstore.csv", header=True, inferSchema=True)


In [20]:
#2
df_spark.printSchema()
df_spark.show(5)


root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

In [21]:
#3
selected_df = df_spark.select(
    col("Customer").alias("Client"),
    "Product",
    "Profit"
)
selected_df.show()

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



In [22]:
#4
filtered_df = df_spark.filter((col("Segment") == "Consumer") & (col("Profit") < 1000))
filtered_df.show()


+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [23]:
#5
df_spark.groupBy("Region").avg("Profit").show()


+------+-----------+
|Region|avg(Profit)|
+------+-----------+
| South|     4000.0|
|  East|      150.0|
|  West|    -1500.0|
| North|     1800.0|
+------+-----------+



In [24]:
#6
df_spark = df_spark.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df_spark.select("Product", "Quantity", "UnitPrice", "TotalPrice").show()


+--------+--------+---------+----------+
| Product|Quantity|UnitPrice|TotalPrice|
+--------+--------+---------+----------+
|  Laptop|       1|    55000|     55000|
| Printer|       2|    12000|     24000|
|Notebook|       3|      200|       600|
|   Table|       1|    18000|     18000|
|   Phone|       2|    20000|     40000|
+--------+--------+---------+----------+



In [26]:
#7
from pyspark.sql.functions import when

df_spark = df_spark.withColumn(
    "ProfitCategory",
    when(col("Profit") > 2000, "High")
    .when(col("Profit") <= 0, "Loss")
    .otherwise("Medium")
)
df_spark.select("Product", "Profit", "ProfitCategory").show()

+--------+------+--------------+
| Product|Profit|ProfitCategory|
+--------+------+--------------+
|  Laptop|  5000|          High|
| Printer|  1800|        Medium|
|Notebook|   150|        Medium|
|   Table| -1500|          Loss|
|   Phone|  3000|          High|
+--------+------+--------------+



In [27]:
#8
df_spark = df_spark.drop("SubCategory")
print("Remaining Columns:", df_spark.columns)


Remaining Columns: ['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product', 'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit', 'TotalPrice', 'ProfitCategory']


In [28]:
#9
df_spark = df_spark.fillna({"Discount": 0.10})
df_spark.select("Product", "Discount").show()


+--------+--------+
| Product|Discount|
+--------+--------+
|  Laptop|     0.1|
| Printer|    0.15|
|Notebook|    0.05|
|   Table|     0.2|
|   Phone|     0.0|
+--------+--------+



In [30]:
#10
from pyspark.sql.functions import to_date, year, month

df_spark = df_spark.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
df_spark = df_spark.withColumn("Year", year(col("OrderDate")))
df_spark = df_spark.withColumn("Month", month(col("OrderDate")))
df_spark.select("OrderID", "OrderDate", "Year", "Month").show()

+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



PART 3: Dask DataFrame Operations (Pandas Alternative)

In [31]:
!pip install dask



In [32]:
import dask.dataframe as dd
df = dd.read_csv('superstore.csv')

In [33]:
#1
avg_discount = df.groupby("Category")["Discount"].mean().compute()
print("Average Discount by Category:\n", avg_discount)


Average Discount by Category:
 Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


In [34]:
#2
filtered_df = df[(df["Quantity"] > 1) & (df["Profit"] > 2000)]
filtered_df_result = filtered_df.compute()
print(filtered_df_result)


   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [35]:
#3
filtered_df.to_csv("filtered_superstore_*.csv", index=False, single_file=True)


['/content/filtered_superstore_*.csv']

PART 4: JSON Handling (Complex Nested)

In [7]:
# Step 1: Import and start Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("JSONHandling").getOrCreate()

# Step 2: Create sample JSON file
import json

data = [
    {
        "OrderID": "CA-1001",
        "Customer": {"Name": "Ravi", "Segment": "Consumer"},
        "Details": {"Region": "South", "Profit": 5000}
    },
    {
        "OrderID": "CA-1002",
        "Customer": {"Name": "Priya", "Segment": "Corporate"},
        "Details": {"Region": "North", "Profit": 1800}
    }
]

with open("orders.json", "w") as f:
    json.dump(data, f, indent=2)

# Step 3: Load the JSON using PySpark
df_json = spark.read.json("orders.json", multiLine=True)

# Step 4: Explore the data
df_json.printSchema()
df_json.select("OrderID", "Customer.Name", "Details.Profit").show()


root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)

+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

