In [None]:
THE PROBLEM

Sales Data Cleaning + Insights
You’re given a CSV file named sales_data.csv (I’ll describe it below — or you can mock it). The business wants:

Clean the data (handle missing/nulls).

Calculate total and average sales per region.

Find the top 2 salespeople per region based on total sales.

Add a "High Performer" column that flags each salesperson whose total sales are greater than 50,000.

Return the cleaned, enriched DataFrame.

Export the result to CSV

Calculate average deals per salesperson

In [4]:
# SparkSession is the entry point to PySpark's DataFrame and SQL APIs
from pyspark.sql import SparkSession

# Obtain a session for this notebook, registered under the app name "Sales_Data".
# Pattern: SparkSession.builder.appName("AppName").getOrCreate()
spark = (
    SparkSession.builder
    .appName("Sales_Data")
    .getOrCreate()
)

In [22]:
# Location of the input CSV on the local machine
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
csv_file_path = r"C:\Users\Admin\OneDrive\Desktop\Study\Pyspark\sales_data.csv"

# Load the CSV into a DataFrame: the first row supplies the column names and
# column types are inferred from the data.
df = spark.read.csv(
    csv_file_path,
    header=True,
    inferSchema=True,
)

In [30]:
# Clean the data in two steps:
#   1. drop every row where Region or Sales is missing
#   2. replace missing Deals values with 0
cleaned_df = (
    df
    .dropna(subset=["Region", "Sales"])
    .fillna({"Deals": 0})
)

In [32]:
# Total and average Sales for each Region
# (sum is imported as _sum so it does not shadow Python's built-in sum)
from pyspark.sql.functions import sum as _sum, avg

region_summary = (
    cleaned_df
    .groupBy("Region")
    .agg(
        _sum("Sales").alias("TotalSalesPerRegion"),
        avg("Sales").alias("AvgSalesPerRegion"),
    )
)
region_summary.show()

+------+-------------------+------------------+
|Region|TotalSalesPerRegion| AvgSalesPerRegion|
+------+-------------------+------------------+
|  East|              52000|17333.333333333332|
|  West|              70000|23333.333333333332|
| North|              28000|           28000.0|
+------+-------------------+------------------+



In [34]:
# One row per (Salesperson, Region) pair holding that person's summed Sales
sales_by_person = (
    cleaned_df
    .groupBy("Salesperson", "Region")
    .agg(_sum("Sales").alias("TotalSales"))
)

In [40]:
# Rank salespeople within each Region using a window function
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Order by TotalSales (highest first). Salesperson is a secondary sort key so
# that row_number() assigns ranks deterministically when two people in the same
# region have identical totals; without it, tied rows could swap ranks between
# runs and change which rows a later "Rank <= 2" filter keeps.
window_spec = Window.partitionBy("Region").orderBy(
    col("TotalSales").desc(),
    col("Salesperson").asc(),
)

# Rank 1 is the top seller of the region; ranks restart for every Region.
ranked_sales = sales_by_person.withColumn("Rank", row_number().over(window_spec))

In [42]:
# Keep the two best-ranked salespeople per region and flag high performers
from pyspark.sql.functions import when

# Rows whose per-region rank is 1 or 2
in_top_two = col("Rank") <= 2

# "Yes" only when TotalSales is strictly above 50000; exactly 50000 yields "No"
high_performer_flag = when(col("TotalSales") > 50000, "Yes").otherwise("No")

final_df = (
    ranked_sales
    .filter(in_top_two)
    .withColumn("HighPerformer", high_performer_flag)
)

In [None]:
Final Output
Top 2 salespeople per region
Their total sales
A “Yes”/“No” HighPerformer column indicating whether their TotalSales is greater than 50000

In [44]:
# Print final_df (top salespeople per region with Rank and HighPerformer) as a text table
final_df.show()

+-----------+------+----------+----+-------------+
|Salesperson|Region|TotalSales|Rank|HighPerformer|
+-----------+------+----------+----+-------------+
|      Aarti|  East|     40000|   1|           No|
|      Sneha|  East|     12000|   2|           No|
|       Ravi| North|     28000|   1|           No|
|     Ramesh|  West|     50000|   1|           No|
|     Vikram|  West|     20000|   2|           No|
+-----------+------+----------+----+-------------+



In [50]:
# Average number of Deals for each Salesperson
from pyspark.sql.functions import avg

# NOTE(review): this aggregates the raw `df`, not `cleaned_df`, so rows that the
# cleaning step drops are still included and missing Deals are not replaced
# with 0 here — confirm this is the intended input.
avg_deals = (
    df
    .groupBy("Salesperson")
    .agg(avg("Deals").alias("AverageDeals"))
)
avg_deals.show()

+-----------+------------+
|Salesperson|AverageDeals|
+-----------+------------+
|      Aarti|         5.5|
|       Ravi|         7.0|
|     Ramesh|         4.0|
|      Sneha|         3.0|
|     Vikram|         5.0|
+-----------+------------+



In [62]:
# Export the final DataFrame to a single CSV file via pandas (no index column).
# Spark-native alternative: df.write.option("header", True).csv("path")
# NOTE(review): absolute, user-specific output path — adjust before running elsewhere.
top_salespeople_pdf = final_df.toPandas()
top_salespeople_pdf.to_csv(
    r"C:\Users\Admin\OneDrive\Desktop\Study\Pyspark\top_salespeople.csv",
    index=False,
)