In [1]:
## 1. Import Required Libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## 2. Initialize Spark Session
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("Task1_BigData")
    .getOrCreate()
)

print("Spark started successfully")

Spark started successfully


In [7]:
## 3. Load the Dataset
# Location of the CSV file. The default is the original local path; set the
# TASK1_DATASET_PATH environment variable to run the notebook on another
# machine without editing this cell.
DATASET_PATH = os.environ.get(
    "TASK1_DATASET_PATH",
    r"D:\Documents\Documents\Data_Analytics\internship(CODETECH)\task\task 1\dataset.csv",
)

# header=True     -> the first line of the file supplies the column names.
# inferSchema=True -> column types are inferred from the data rather than
#                     being read as plain strings.
df = spark.read.csv(DATASET_PATH, header=True, inferSchema=True)

# View schema and sample rows
df.printSchema()
df.show(5)

# Count total records
print("Total rows:", df.count())

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- price: double (nullable = true)
 |-- shipping_charges: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)

+--------------------+--------------------+-------------+--------------------+--------------------+-----+----------------+------------+--------------------+-------------+---------------------+-------------+--------------+
|            order_id|         customer_id|order_item_id|          product_id|           seller_id|price|shipping_charges|payment_type|payment_installments|payment_value|product_category_na

In [8]:
## 4. Big Data Analysis
# Total revenue
# The aggregate is referenced as F.sum (functions imported as F in the first
# cell) instead of `from pyspark.sql.functions import sum`, which would
# replace Python's built-in sum() for every later cell in the kernel.
df.select(F.sum("payment_value").alias("Total_Revenue")).show()

+--------------------+
|       Total_Revenue|
+--------------------+
|1.7356423249999832E7|
+--------------------+



In [9]:
# Revenue by state
# Sum payment_value per customer_state and list the largest totals first.
# "sum(payment_value)" is the column name Spark assigns to the aggregate.
(
    df.groupBy("customer_state")
    .sum("payment_value")
    .sort("sum(payment_value)", ascending=False)
    .show()
)

+--------------+------------------+
|customer_state|sum(payment_value)|
+--------------+------------------+
|            SP| 6725496.580000039|
|            RJ| 2340854.139999998|
|            MG|2042845.9600000004|
|            RS|1004497.9000000018|
|            PR| 892367.8600000006|
|            SC| 688310.5400000007|
|            BA| 621152.0900000001|
|            DF|380847.83999999997|
|            GO|372540.82999999967|
|            ES|         351669.04|
|            PE| 328696.5199999999|
|            CE|253367.46999999988|
|            PA|         198211.27|
|            MT|198076.31999999995|
|            MA| 148784.4100000001|
|            MS|131538.13999999998|
|            PB|         123690.83|
|            RN| 96473.26999999999|
|            PI|          95585.32|
|            AL| 92168.65000000002|
+--------------+------------------+
only showing top 20 rows


In [10]:
# Most used payment method
# Row count per payment_type, most frequent first.
(
    df.groupBy("payment_type")
    .count()
    .sort("count", ascending=False)
    .show()
)

+------------+-----+
|payment_type|count|
+------------+-----+
| credit_card|88124|
|      wallet|23261|
|     voucher| 6532|
|  debit_card| 1697|
+------------+-----+



In [11]:
# Top product categories
# Row count per product_category_name; only the ten largest are displayed.
(
    df.groupBy("product_category_name")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+---------------------+-----+
|product_category_name|count|
+---------------------+-----+
|                 toys|90066|
|        health_beauty| 3245|
|       bed_bath_table| 3011|
|       sports_leisure| 2544|
|      furniture_decor| 2393|
| computers_accesso...| 2348|
|           housewares| 1787|
|        watches_gifts| 1611|
|            telephony| 1271|
|                 auto| 1115|
+---------------------+-----+
only showing top 10 rows


In [12]:
# Written summary of the results shown in the cells above.
# A bare string literal as the last expression of a cell is echoed as its
# repr (one long line full of "\n" escapes); binding it to a name and
# printing it renders the text with real line breaks instead.
final_insights = '''
 -> Final Insights from Big Data Analysis :
1️. Overall Revenue

The dataset contains 119,614 transaction records generating a total revenue of approximately 17.36 million.
This demonstrates the ability of PySpark to efficiently process large-scale transactional data.

2️. Sales Distribution by State

Among all states, SP (São Paulo) recorded the highest total sales (~6.7 million), significantly outperforming other regions such as RJ and MG.
This indicates that São Paulo is the primary revenue-generating market and may represent the highest customer demand concentration.

3️. Customer Payment Behavior

The credit card is the most frequently used payment method with 88,124 transactions, far exceeding wallet, voucher, and debit card usage.
This suggests strong customer preference for card-based digital payments, which businesses should prioritize for smooth checkout experiences.

4. Product Popularity

The toys category is the most purchased product category with over 90,000 orders, followed by health_beauty and bed_bath_table.
This highlights high consumer demand in the toys segment, making it a key area for inventory planning and marketing focus.
'''

print(final_insights)

'\n -> Final Insights from Big Data Analysis :\n1️. Overall Revenue\n\nThe dataset contains 119,614 transaction records generating a total revenue of approximately 17.36 million.\nThis demonstrates the ability of PySpark to efficiently process large-scale transactional data.\n\n2️. Sales Distribution by State\n\nAmong all states, SP (São Paulo) recorded the highest total sales (~6.7 million), significantly outperforming other regions such as RJ and MG.\nThis indicates that São Paulo is the primary revenue-generating market and may represent the highest customer demand concentration.\n\n3️. Customer Payment Behavior\n\nThe credit card is the most frequently used payment method with 88,124 transactions, far exceeding wallet, voucher, and debit card usage.\nThis suggests strong customer preference for card-based digital payments, which businesses should prioritize for smooth checkout experiences.\n\n4. Product Popularity\n\nThe toys category is the most purchased product category with ove