Step 1: Sample Dataset (create CSV)

In [None]:
# Upload the dataset through Colab's files.upload() helper; the return value is
# kept in `uploaded`, which no later cell shown here references.
# NOTE(review): later cells read 'superstore.csv' from the working directory —
# presumably that is the file expected to be uploaded here; confirm.
from google.colab import files
uploaded = files.upload()

PART 1: Pandas DataFrame Operations
1. Load the CSV using pandas.

In [2]:
import pandas as pd
import numpy as np
# Read superstore.csv from the working directory into the pandas DataFrame
# that every cell in Part 1 operates on.
superstore_df = pd.read_csv('superstore.csv')

2. Print schema, head, shape, dtypes.

In [None]:
# Overview of the loaded frame: schema, first rows, dimensions and column dtypes.
print("\nSchema:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
superstore_df.info()
print("\n Head:")
print(superstore_df.head())
print("\n Shape:", superstore_df.shape)
print("\n Data Types:")
print(superstore_df.dtypes)

3. Select the `Customer`, `Product`, and `Profit` columns.

In [None]:
# Project the frame down to the customer, product and profit columns and print it.
print("\n Selected Columns:")
print(superstore_df.loc[:, ['Customer', 'Product', 'Profit']])

4. Filter orders where Profit > 2000 and Discount = 0

In [None]:
# Keep only the rows whose Profit exceeds 2000 and whose Discount equals 0.
filtered = superstore_df.loc[
    superstore_df['Profit'].gt(2000) & superstore_df['Discount'].eq(0)
]
print(" Filtered Orders :")
print(filtered)


5. Sort by Profit descending.

In [None]:
# Order the rows from highest to lowest Profit, then print the order id
# next to its profit.
sorted_df = superstore_df.sort_values('Profit', ascending=False)
print("\n Sorted by Profit:")
print(sorted_df.loc[:, ['OrderID', 'Profit']])

6. GroupBy Category → Total Profit, Avg Discount.

In [None]:
# Per-category summary: summed Profit and mean Discount. Named aggregation
# produces the final column labels directly; reset_index() turns Category
# back into an ordinary column.
grouped = (
    superstore_df
    .groupby('Category')
    .agg(**{
        'Total Profit': ('Profit', 'sum'),
        'Avg Discount': ('Discount', 'mean'),
    })
    .reset_index()
)
print("\n Grouped by Category:")
print(grouped)

7. Add a column `TotalPrice = Quantity * UnitPrice`.

In [None]:
# TotalPrice is the element-wise product of Quantity and UnitPrice.
superstore_df['TotalPrice'] = superstore_df['Quantity'].mul(superstore_df['UnitPrice'])
print("\n TotalPrice Column Added:")
print(superstore_df.loc[:, ['OrderID', 'TotalPrice']])

8. Drop the SubCategory column.

In [None]:
# Remove the SubCategory column. errors='ignore' keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
# The result is rebound to the same name instead of using inplace=True.
superstore_df = superstore_df.drop(columns=['SubCategory'], errors='ignore')
print("\n Columns after dropping SubCategory:")
print(superstore_df.columns)

9. Fill nulls in Discount with 0.10.

In [None]:
# Replace missing Discount values with 0.10, then print the whole frame.
superstore_df['Discount'] = superstore_df['Discount'].fillna(0.10)
print(superstore_df)



10. Apply a function to categorize orders:

In [None]:
def classify(row):
    """Label an order by the size of its profit.

    Args:
        row: Any object indexable by the key 'Profit' (a DataFrame row when
            used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        'High' when Profit is above 4000, 'Medium' when it is above 0 (up to
        and including 4000), and 'Low' for everything else.
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    if profit > 0:
        return 'Medium'
    # Zero, negative, or otherwise non-comparing values fall through to here.
    return 'Low'
# Run classify on every row (axis=1) and store the label in a new
# OrderCategory column.
superstore_df['OrderCategory'] = superstore_df.apply(classify, axis=1)
print("\n Orders Categorized:")
# Print only the columns involved in the classification.
print(superstore_df[['OrderID', 'Profit', 'OrderCategory']])

PART 2: PySpark DataFrame Operations


In [None]:
from pyspark.sql import SparkSession
# Get the current SparkSession or create one, with the application name
# "Superstore Analysis".
spark = SparkSession.builder.appName("Superstore Analysis").getOrCreate()
# Bare last expression so the notebook displays the session object.
spark

1. Load the same CSV using PySpark.

In [24]:
# Read the CSV into a Spark DataFrame, treating the first line as a header
# and letting Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("superstore.csv")
)

2. Show schema and first 5 rows.

In [None]:
# Print the column names and types, then display the first 5 rows.
df.printSchema()
df.show(5)

3. Select columns and rename `Customer` → `Client`.

In [None]:
from pyspark.sql.functions import col, when, year, month
# Keep Customer, Product and Profit, exposing Customer under the name Client.
df_selected = (
    df.select("Customer", "Product", "Profit")
    .withColumnRenamed("Customer", "Client")
)
df_selected.show()

4. Filter `Segment = 'Consumer'` and `Profit < 1000`.

In [None]:
# Rows from the Consumer segment whose Profit is below 1000.
df_filtered = df.where(
    (col("Segment") == "Consumer") & (col("Profit") < 1000)
)
df_filtered.show()

5. GroupBy Region and show average profit.

In [None]:
# Mean Profit per Region, expressed as a column -> aggregate mapping.
df_grouped = df.groupBy("Region").agg({"Profit": "avg"})
df_grouped.show()

6. Use `withColumn` to create `TotalPrice = Quantity * UnitPrice`.

In [None]:
# Add TotalPrice as the product of Quantity and UnitPrice; `df` is rebound to
# the frame that includes the new column.
df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df.show()

7. Use when().otherwise() to classify Profit as:
    'Profit' > 2000 → 'High',
    'Profit' <= 0 → 'Loss',
    else 'Medium'.

In [None]:
# Derive ProfitLevel from Profit. Conditions are checked in order:
# above 2000 -> "High", zero or below -> "Loss", anything else -> "Medium".
df = df.withColumn(
    "ProfitLevel",
    when(col("Profit") > 2000, "High")
    .when(col("Profit") <= 0, "Loss")
    .otherwise("Medium"),
)
df.show()

8. Use `drop()` to remove `SubCategory`.

In [None]:
# Rebind `df` to a frame without the SubCategory column and print the
# resulting schema to confirm.
df = df.drop("SubCategory")
df.printSchema()

9. Handle nulls in `Discount` using `fillna(0.10)`.

In [None]:
# Fill nulls with 0.10; the dict form restricts the fill to the Discount column.
df = df.fillna({'Discount': 0.10})
df.show()


10. Convert `OrderDate` to date type and extract `year` and `month`.

In [None]:
from pyspark.sql.functions import to_date
# Parse OrderDate with the yyyy-MM-dd pattern, then derive Year and Month from
# the converted column. The steps are chained in the same order as before, so
# Year and Month are computed from the already-converted OrderDate.
df = (
    df.withColumn("OrderDate", to_date("OrderDate", "yyyy-MM-dd"))
    .withColumn("Year", year("OrderDate"))
    .withColumn("Month", month("OrderDate"))
)
df.select("OrderID", "OrderDate", "Year", "Month").show()

PART 3: Dask DataFrame Operations (Pandas Alternative)
1. Install Dask:

In [None]:
!pip install dask

2. Load the same `superstore.csv`:

In [38]:
import dask.dataframe as dd
# Load superstore.csv as a Dask DataFrame.
# NOTE(review): this rebinds `df`, the name the PySpark cells above use for the
# Spark DataFrame; re-running a Part 2 cell after this one would operate on the
# Dask frame instead. A distinct name would avoid that, but the following Dask
# cells reference `df`.
df = dd.read_csv('superstore.csv')

1. Compute average discount by category.

In [None]:
# Mean Discount per Category; .compute() is called to obtain the concrete
# result before printing it.
avg_discount = df.groupby('Category')['Discount'].mean().compute()
print(avg_discount)

2. Filter orders with `Quantity` greater than 1 and high profit (`Profit > 2000`).

In [None]:
# Rows with Quantity above 1 and Profit above 2000.
filtered_df = df[(df['Quantity'] > 1) & (df['Profit'] > 2000)]
# Compute the filtered frame so it can be printed; `filtered_df` itself is kept
# for the save step in the next cell.
filtered_df_result = filtered_df.compute()
print(filtered_df_result)

3. Save filtered data to new CSV.

In [None]:
# Write the filtered rows, without the index, to one CSV file.
# With single_file=True the target is a single path, so a '*' placeholder is not
# expanded and would become part of the file name; a plain name is used instead.
filtered_df.to_csv('filtered_superstore.csv', index=False, single_file=True)
print(" Filtered data saved as 'filtered_superstore.csv'")


PART 4: JSON Handling (Complex Nested)
1. Create a nested JSON file:

In [42]:
# Two sample orders kept as a JSON string; each order nests a Customer object
# (Name, Segment) and a Details object (Region, Profit).
nested_json = '''
[
  {
    "OrderID": "CA-1001",
    "Customer": {"Name": "Ravi", "Segment": "Consumer"},
    "Details": {"Region": "South", "Profit": 5000}
  },
  {
    "OrderID": "CA-1002",
    "Customer": {"Name": "Priya", "Segment": "Corporate"},
    "Details": {"Region": "North", "Profit": 1800}
  }
]
'''
# Write the text to orders.json in the working directory, replacing any
# existing file; the with-block closes the file afterwards.
with open("orders.json", "w") as f:
    f.write(nested_json)


2. Load it using PySpark:

In [None]:
# orders.json holds one JSON array spread over several lines, so the reader
# is switched to multi-line mode.
df_json = spark.read.option("multiLine", True).json("orders.json")
df_json.printSchema()
# Dotted paths reach into the nested Customer and Details structs.
df_json.select("OrderID", "Customer.Name", "Details.Profit").show()
