In [1]:
import pandas as pd

In [2]:
# creating csv
csv_data = """
OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000
"""

with open ('superstore.csv', 'w') as file:
  file.write(csv_data)

###Part 1: Pandas DataFrame operation

In [3]:
# Loading data using pandas
df = pd.read_csv('superstore.csv')
df

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


In [4]:
# Printing schema, head, shape, dtypes
print(df.head())

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  


In [5]:
print(df.shape)

(5, 12)


In [6]:
print(df.dtypes)

OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
SubCategory     object
Quantity         int64
UnitPrice        int64
Discount       float64
Profit           int64
dtype: object


In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   OrderID      5 non-null      object 
 1   OrderDate    5 non-null      object 
 2   Customer     5 non-null      object 
 3   Segment      5 non-null      object 
 4   Region       5 non-null      object 
 5   Product      5 non-null      object 
 6   Category     5 non-null      object 
 7   SubCategory  5 non-null      object 
 8   Quantity     5 non-null      int64  
 9   UnitPrice    5 non-null      int64  
 10  Discount     5 non-null      float64
 11  Profit       5 non-null      int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 612.0+ bytes
None


In [8]:
# Selecting customer, Product, Profit columns
df[['Customer','Product','Profit']]

Unnamed: 0,Customer,Product,Profit
0,Ravi,Laptop,5000
1,Priya,Printer,1800
2,Amit,Notebook,150
3,Anita,Table,-1500
4,Divya,Phone,3000


In [9]:
# Filter orders where Profit > 2000 and Discount = 0 .
filter =df[(df['Profit'] > 2000) & (df['Discount'] == 0)]
print(filter)

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [10]:
# Sorting by profit descending
sorted = df.sort_values(by='Profit', ascending=False)
print(sorted)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


In [11]:
# GroupBy Category → Total Profit, Avg Discount.
grouped = df.groupby('Category').agg({'Profit':'sum', 'Discount':'mean'})
grouped.rename(columns={'Profit':'TotalProfit', 'Discount':'AvgDiscount'}, inplace=True)
print(grouped)

                 TotalProfit  AvgDiscount
Category                                 
Furniture              -1500     0.200000
Office Supplies          150     0.050000
Technology              9800     0.083333


In [12]:
# Adding column TotalPrice to df
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
df

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit,TotalPrice
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000,55000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800,24000
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150,600
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500,18000
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000,40000


In [13]:
# Drop the SubCategory column.

dropped = df.drop('SubCategory', axis=1)
print(dropped)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


In [14]:
# Fill nulls in discount with 0.10

df['Discount'].fillna(0.10, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Discount'].fillna(0.10, inplace=True)


Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit,TotalPrice
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000,55000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800,24000
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150,600
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500,18000
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000,40000


In [15]:
# Apply a function to categorize orders:
def classify(row):
  if row['Profit'] > 4000:
    return 'High'
  elif row['Profit'] > 0:
    return 'Medium'
  else:
    return 'Low'

df['OrderCategory'] = df.apply(classify, axis=1)
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  \
0       Technology    Computers         1      55000      0.10    5000   
1       Technology  Peripherals         2      12000      0.15    1800   
2  Office Supplies        Paper         3        200      0.05     150   
3        Furniture       Tables         1      18000      0.20   -1500   
4       Technology       Phones         2      20000      0.00    3000   

   TotalPrice OrderCategory  
0       55000          High  
1       24000        Medium  
2         600        Medium  
3       18000           Low  
4       40000 

### Part 2: PySpark DataFrame Operations

In [16]:
# Creating spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("PySparks") \
        .getOrCreate()

# Running it
spark


In [17]:
# Loading data from csv
df = spark.read.csv('superstore.csv',header=True, inferSchema=True)
df.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|     Phones|       2|    20000|     0.0|  3000|
+-------+----------+--------+---

In [18]:
# Schema and first 5 rows
df.printSchema()

root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)



In [19]:
df.show(5)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|     Phones|       2|    20000|     0.0|  3000|
+-------+----------+--------+---

In [20]:
# Select columns, Rename Customer → Client .
from pyspark.sql.functions import col, sum, avg

#df.select("*").show()
df.select(df.columns).show()
df_selected = df.select(col("OrderID"), col("Customer").alias("Client"), col("Region"))
df_selected.show(5)


+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|     Phones|       2|    20000|     0.0|  3000|
+-------+----------+--------+---

In [21]:
# Filter Segment = 'Consumer' and Profit < 1000 .
filtered = df.filter((col("Segment") == "Consumer") & (col("Profit") < 1000))
filtered.show()

+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [22]:
# GroupBy Region and show average profit.
avg_profit = df.groupBy("Region").agg(avg("Profit").alias("AverageProfit"))
avg_profit.show()

+------+-------------+
|Region|AverageProfit|
+------+-------------+
| South|       4000.0|
|  East|        150.0|
|  West|      -1500.0|
| North|       1800.0|
+------+-------------+



In [23]:
# Use withColumn to create TotalPrice = Quantity * UnitPrice .
# withColumn is used to create new column
df = df.withColumn("TotalPrice", col("Quantity")*col("UnitPrice"))
df.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|TotalPrice|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|     55000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|     24000|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|       600|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|     18000|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|   

In [24]:
# Use when().otherwise() to classify Profit
# when().otherwise() - similar to if-else
from pyspark.sql.functions import when
df_category = df.withColumn("ProfitCategory", when(col("Profit") > 2000, 'High')
            .when(col("Profit") <= 0, "Loss")
            .otherwise("Medium"))
df_category.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+--------------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitCategory|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+--------------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|     55000|          High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|     24000|        Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|       600|        Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -150

In [25]:
df_category.select("Product","Profit","ProfitCategory").show()

+--------+------+--------------+
| Product|Profit|ProfitCategory|
+--------+------+--------------+
|  Laptop|  5000|          High|
| Printer|  1800|        Medium|
|Notebook|   150|        Medium|
|   Table| -1500|          Loss|
|   Phone|  3000|          High|
+--------+------+--------------+



In [26]:
# Use drop() to remove SubCategory
dropped_df = df.drop("SubCategory")
dropped_df.show(3)

+-------+----------+--------+---------+------+--------+---------------+--------+---------+--------+------+----------+
|OrderID| OrderDate|Customer|  Segment|Region| Product|       Category|Quantity|UnitPrice|Discount|Profit|TotalPrice|
+-------+----------+--------+---------+------+--------+---------------+--------+---------+--------+------+----------+
|CA-1001|2023-01-15|    Ravi| Consumer| South|  Laptop|     Technology|       1|    55000|     0.1|  5000|     55000|
|CA-1002|2023-02-20|   Priya|Corporate| North| Printer|     Technology|       2|    12000|    0.15|  1800|     24000|
|CA-1003|2023-01-25|    Amit| Consumer|  East|Notebook|Office Supplies|       3|      200|    0.05|   150|       600|
+-------+----------+--------+---------+------+--------+---------------+--------+---------+--------+------+----------+
only showing top 3 rows



In [27]:
# Handle nulls in Discount using fillna(0.10)
df_null = df.fillna({"Discount":0.10})
df_null.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|TotalPrice|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|     55000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|     24000|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|       600|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|     18000|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|   

In [30]:
# Convert OrderDate to date type and extract year , month .
from pyspark.sql.functions import to_date, year, month

df_new = df.withColumn("OrderDate", to_date("OrderDate", "yyyy-MM-dd"))
df_new = df_new.withColumn("Year", year("OrderDate"))
df_new = df_new.withColumn("Month", month("OrderDate"))

df_new.select("OrderID","OrderDate", "Year", "Month").show()

+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+

