In [1]:
import findspark
findspark.init()
import pyspark
sc=pyspark.SparkContext(appName="MyAppName")
sc


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max

# Step 1: Start Spark
spark = SparkSession.builder \
    .appName("Price Normalization") \
    .getOrCreate()

# Step 2: Sample data
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
columns = ["Pid", "P_Name", "Price"]
Product_DF = spark.createDataFrame(Product_Data, columns)

# Step 3: Compute min and max
Price_min = Product_DF.select(min(col("Price"))).collect()[0][0]
Price_max = Product_DF.select(max(col("Price"))).collect()[0][0]

# Step 4: Apply Min-Max Normalization
Product_DF_normalized = Product_DF.withColumn(
    "Price_Normalized", 
    (col("Price") - Price_min) / (Price_max - Price_min)
)

# Step 5: Show result
Product_DF_normalized.show()

# Step 6: Stop Spark
spark.stop()


+---+------+-----+------------------+
|Pid|P_Name|Price|  Price_Normalized|
+---+------+-----+------------------+
|101|    P1|  450|               0.0|
|102|    P2| 4034|0.8258064516129032|
|103|    P3| 4790|               1.0|
+---+------+-----+------------------+



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Standardize Price") \
    .getOrCreate()

# Create sample data
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
columns = ["Pid", "P_Name", "Price"]

Product_DF = spark.createDataFrame(Product_Data, columns)

# Calculate Mean and Standard Deviation
Price_mean = Product_DF.select(mean(col("Price"))).collect()[0][0]
Price_stddev = Product_DF.select(stddev(col("Price"))).collect()[0][0]

# Standardize
Product_DF_standardized = Product_DF.withColumn(
    "Price_Standardized",
    (col("Price") - Price_mean) / Price_stddev
)

# Show result
Product_DF_standardized.show()

# Stop Spark
spark.stop()


+---+------+-----+-------------------+
|Pid|P_Name|Price| Price_Standardized|
+---+------+-----+-------------------+
|101|    P1|  450|-1.1392504769565068|
|102|    P2| 4034|0.40658762605161547|
|103|    P3| 4790| 0.7326628509048912|
+---+------+-----+-------------------+



In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MyAppName") \
    .master("local[*]") \
    .getOrCreate()

salarydata = [("Class A",), ("Class B",), ("Class C",), ("Class C",), ("Class A",), ("Class B",)]
columns = ["Salary_Grade"]

salary_df = spark.createDataFrame(salarydata, columns)
salary_df.show()


+------------+
|Salary_Grade|
+------------+
|     Class A|
|     Class B|
|     Class C|
|     Class C|
|     Class A|
|     Class B|
+------------+



In [5]:
#label encoding
from pyspark.ml.feature import StringIndexer

# Initialize StringIndexer
indexer = StringIndexer(inputCol="Salary_Grade", outputCol="Salary_Grade_Index")

# Fit and Transform the DataFrame
salary_df_indexed = indexer.fit(salary_df).transform(salary_df)
salary_df_indexed.show()

+------------+------------------+
|Salary_Grade|Salary_Grade_Index|
+------------+------------------+
|     Class A|               0.0|
|     Class B|               1.0|
|     Class C|               2.0|
|     Class C|               2.0|
|     Class A|               0.0|
|     Class B|               1.0|
+------------+------------------+



In [6]:
#one hot encoding
from pyspark.ml.feature import OneHotEncoder

# Initialize OneHotEncoder
encoder = OneHotEncoder(inputCol="Salary_Grade_Index", outputCol="Salary_Grade_OneHotEncoding")

# Transform the DataFrame
salary_df_encoded = encoder.fit(salary_df_indexed).transform(salary_df_indexed)
salary_df_encoded.show()

+------------+------------------+---------------------------+
|Salary_Grade|Salary_Grade_Index|Salary_Grade_OneHotEncoding|
+------------+------------------+---------------------------+
|     Class A|               0.0|              (2,[0],[1.0])|
|     Class B|               1.0|              (2,[1],[1.0])|
|     Class C|               2.0|                  (2,[],[])|
|     Class C|               2.0|                  (2,[],[])|
|     Class A|               0.0|              (2,[0],[1.0])|
|     Class B|               1.0|              (2,[1],[1.0])|
+------------+------------------+---------------------------+



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start Spark Session
spark = SparkSession.builder.appName("PolynomialFeature").getOrCreate()

# Sample DataFrame
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
columns = ["Pid", "P_Name", "Price"]
Product_DF = spark.createDataFrame(Product_Data, columns)

# Feature Engineering: Create Price^2
df_new = Product_DF.withColumn("Price_Squared", col("Price") ** 2)

# Show results
df_new.show()


+---+------+-----+-------------+
|Pid|P_Name|Price|Price_Squared|
+---+------+-----+-------------+
|101|    P1|  450|     202500.0|
|102|    P2| 4034|  1.6273156E7|
|103|    P3| 4790|    2.29441E7|
+---+------+-----+-------------+



In [8]:
# Interaction Features : Interaction features are created by combining two or more features.

# Sample DataFrame
data = [("P1",1, 500), ("P2",2, 200), ("P3",3, 300), ("P4",4, 400)]
columns = ["Pname","Units_Sold", "Price_Per_Unit"]

df = spark.createDataFrame(data, columns)

# Create an interaction feature: Total_Sales_Value = Units_Sold * Price_Per_Unit
df_interaction = df.withColumn("Total_Sales_Value", col("Units_Sold") * col("Price_Per_Unit"))
df_interaction.show()

+-----+----------+--------------+-----------------+
|Pname|Units_Sold|Price_Per_Unit|Total_Sales_Value|
+-----+----------+--------------+-----------------+
|   P1|         1|           500|              500|
|   P2|         2|           200|              400|
|   P3|         3|           300|              900|
|   P4|         4|           400|             1600|
+-----+----------+--------------+-----------------+



Question 1 – Student & Class DataFrames

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("StudentClass").getOrCreate()

# Create student DataFrame
student_data = [
    (1, "Alice", "Pune", 23, "F"),
    (2, "Bob", "Mumbai", 21, "M"),
    (3, "Charlie", "Nagpur", 24, "M"),
    (4, "Daisy", "Pune", 20, "F"),
    (5, "Ethan", "Delhi", 22, "M")
]
student_columns = ["rollno", "name", "address", "age", "gender"]
student_df = spark.createDataFrame(student_data, student_columns)

# Create class DataFrame
class_data = [
    (101, "Data Science"),
    (102, "AI"),
    (103, "ML"),
    (104, "Cyber Security"),
    (105, "IoT")
]
class_columns = ["classid", "classname"]
class_df = spark.createDataFrame(class_data, class_columns)

# Display DataFrames
student_df.show()
class_df.show()

# Display student names with gender = M and F separately
print("Male Students:")
student_df.filter(col("gender") == "M").select("name").show()

print("Female Students:")
student_df.filter(col("gender") == "F").select("name").show()

# Display students of a specific class (Example: classid = 101)
print("Students of classid 101:")
student_df.join(class_df, student_df.rollno == class_df.classid, "inner").show()

# Students whose age > 22
print("Students with age > 22:")
student_df.filter(col("age") > 22).show()

# Add new column grade
student_df = student_df.withColumn("grade", col("age") * 2)  # sample logic
student_df.show()

# Display classnames and its associated students
student_with_class = student_df.join(class_df, student_df.rollno == class_df.classid, "left")
student_with_class.select("classname", "name").show()


+------+-------+-------+---+------+
|rollno|   name|address|age|gender|
+------+-------+-------+---+------+
|     1|  Alice|   Pune| 23|     F|
|     2|    Bob| Mumbai| 21|     M|
|     3|Charlie| Nagpur| 24|     M|
|     4|  Daisy|   Pune| 20|     F|
|     5|  Ethan|  Delhi| 22|     M|
+------+-------+-------+---+------+

+-------+--------------+
|classid|     classname|
+-------+--------------+
|    101|  Data Science|
|    102|            AI|
|    103|            ML|
|    104|Cyber Security|
|    105|           IoT|
+-------+--------------+

Male Students:
+-------+
|   name|
+-------+
|    Bob|
|Charlie|
|  Ethan|
+-------+

Female Students:
+-----+
| name|
+-----+
|Alice|
|Daisy|
+-----+

Students of classid 101:
+------+----+-------+---+------+-------+---------+
|rollno|name|address|age|gender|classid|classname|
+------+----+-------+---+------+-------+---------+
+------+----+-------+---+------+-------+---------+

Students with age > 22:
+------+-------+-------+---+------+
|rollno

In [12]:
Question 2 – Employee & Department DataFrames

SyntaxError: invalid character '–' (U+2013) (1650815831.py, line 1)

In [11]:
# Create emp and dept DataFrames
emp_data = [
    (1, "John", "M", "Manager", "Pune", 25000, 10),
    (2, "Priya", "F", "Analyst", "Mumbai", 18000, 20),
    (3, "Karan", "M", "Clerk", "Delhi", 15000, 10),
    (4, "Neha", "F", "Manager", "Pune", 27000, 30),
    (5, "Ravi", "M", "Analyst", "Nagpur", 16000, 20)
]
emp_columns = ["eno", "ename", "gender", "designation", "city", "salary", "dno"]
emp_df = spark.createDataFrame(emp_data, emp_columns)

dept_data = [
    (10, "HR"),
    (20, "Finance"),
    (30, "IT"),
    (40, "Marketing"),
    (50, "Admin")
]
dept_columns = ["dno", "dname"]
dept_df = spark.createDataFrame(dept_data, dept_columns)

# Print schema
emp_df.printSchema()
dept_df.printSchema()

# Filter based on designation and salary
emp_df.filter(col("designation") == "Analyst").show()
emp_df.filter(col("salary") > 20000).show()

# Departments for female employees
emp_df.filter(col("gender") == "F").join(dept_df, "dno").show()

# Increase salary of Managers
emp_df = emp_df.withColumn("salary", 
                           col("salary") + 5000 * (col("designation") == "Manager").cast("int"))
emp_df.show()

# Add 3 more records to each dataframe
more_emp = [
    (6, "Divya", "F", "Clerk", "Nashik", 14000, 10),
    (7, "Amit", "M", "Manager", "Mumbai", 30000, 30),
    (8, "Sneha", "F", "Analyst", "Pune", 19000, 20)
]
more_dept = [
    (60, "Legal"),
    (70, "Tech"),
    (80, "Logistics")
]

emp_df = emp_df.union(spark.createDataFrame(more_emp, emp_columns))
dept_df = dept_df.union(spark.createDataFrame(more_dept, dept_columns))

emp_df.printSchema()
dept_df.printSchema()

# Use join to fetch unique records
emp_df.join(dept_df, "dno").dropDuplicates().show()

# Department-wise list of employees
emp_df.join(dept_df, "dno").select("dname", "ename").show()

# Employees with salary < 20000 and designation = 'Analyst'
emp_df.filter((col("salary") < 20000) & (col("designation") == "Analyst")).show()


root
 |-- eno: long (nullable = true)
 |-- ename: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- dno: long (nullable = true)

root
 |-- dno: long (nullable = true)
 |-- dname: string (nullable = true)

+---+-----+------+-----------+------+------+---+
|eno|ename|gender|designation|  city|salary|dno|
+---+-----+------+-----------+------+------+---+
|  2|Priya|     F|    Analyst|Mumbai| 18000| 20|
|  5| Ravi|     M|    Analyst|Nagpur| 16000| 20|
+---+-----+------+-----------+------+------+---+

+---+-----+------+-----------+----+------+---+
|eno|ename|gender|designation|city|salary|dno|
+---+-----+------+-----------+----+------+---+
|  1| John|     M|    Manager|Pune| 25000| 10|
|  4| Neha|     F|    Manager|Pune| 27000| 30|
+---+-----+------+-----------+----+------+---+

+---+---+-----+------+-----------+------+------+-------+
|dno|eno|ename|gender|desig

 Question 3 – Product, Customer, Orders, Order_Items

In [13]:
from pyspark.sql.functions import month, year, min, max, count, sum, to_date, dayofmonth

# Create DataFrames
product_data = [
    (1, "Pen", "Stationery", 10),
    (2, "Notebook", "Stationery", 40),
    (3, "Mouse", "Electronics", 500),
    (4, "Keyboard", "Electronics", 700)
]
product_df = spark.createDataFrame(product_data, ["product_id", "pname", "ptype", "price"])

customer_data = [
    (1, "Asha", "9999999999", "Pune"),
    (2, "Rahul", "8888888888", "Mumbai"),
    (3, "Meena", "7777777777", "Pune"),
    (4, "Rohan", "6666666666", "Nagpur")
]
customer_df = spark.createDataFrame(customer_data, ["cust_id", "cname", "mobileno", "city"])

orders_data = [
    (101, "2013-08-01", 1, "COMPLETE"),
    (102, "2013-08-15", 2, "CLOSED"),
    (103, "2013-07-01", 3, "PENDING"),
    (104, "2013-08-01", 4, "COMPLETE")
]
orders_df = spark.createDataFrame(orders_data, ["order_id", "order_date", "order_customer_id", "order_status"])

order_items_data = [
    (101, 1, 2, 20),
    (101, 2, 1, 40),
    (102, 3, 1, 500),
    (104, 1, 3, 30)
]
order_items_df = spark.createDataFrame(order_items_data, 
    ["order_item_order_id", "order_item_product_id", "order_item_quantity", "order_item_subtotal"])

# Get customers from Pune
customer_df.filter(col("city") == "Pune").show()

# Orders with subtotal > 100 in August
orders_df = orders_df.withColumn("order_date", to_date(col("order_date")))
order_items_df.join(orders_df, order_items_df.order_item_order_id == orders_df.order_id)\
    .filter((col("order_item_subtotal") > 100) & (month("order_date") == 8))\
    .show()

# Orders in ascending order of subtotal
order_items_df.orderBy("order_item_subtotal").show()

# Customer with min and max order
order_totals = order_items_df.groupBy("order_item_order_id").agg(sum("order_item_subtotal").alias("total"))
min_max = order_totals.agg(min("total").alias("min"), max("total").alias("max")).collect()
min_val, max_val = min_max[0]["min"], min_max[0]["max"]

orders_df.join(order_totals, orders_df.order_id == order_totals.order_item_order_id)\
    .filter((col("total") == min_val) | (col("total") == max_val))\
    .join(customer_df, orders_df.order_customer_id == customer_df.cust_id)\
    .select("cname", "total").show()

# Orders with status COMPLETE or CLOSED
orders_df.filter(col("order_status").isin("COMPLETE", "CLOSED")).show()

# Orders in Aug 2013 with COMPLETE or CLOSED
orders_df.filter(
    (col("order_status").isin("COMPLETE", "CLOSED")) & 
    (month("order_date") == 8) & (year("order_date") == 2013)
).show()

# Wrong subtotal check
wrong_subtotal = order_items_df.join(product_df, order_items_df.order_item_product_id == product_df.product_id)
wrong_subtotal.filter(col("order_item_subtotal") != (col("order_item_quantity") * col("price"))).show()

# Orders placed on first of every month
orders_df.filter(dayofmonth("order_date") == 1).show()

# Count by status
orders_df.groupBy("order_status").agg(count("*").alias("count")).show()

# Revenue per order id
order_items_df.groupBy("order_item_order_id").agg(sum("order_item_subtotal").alias("revenue")).show()

# Daily product revenue
orders_df.join(order_items_df, orders_df.order_id == order_items_df.order_item_order_id)\
    .groupBy("order_date", "order_item_product_id")\
    .agg(sum("order_item_subtotal").alias("daily_revenue")).show()


+-------+-----+----------+----+
|cust_id|cname|  mobileno|city|
+-------+-----+----------+----+
|      1| Asha|9999999999|Pune|
|      3|Meena|7777777777|Pune|
+-------+-----+----------+----+

+-------------------+---------------------+-------------------+-------------------+--------+----------+-----------------+------------+
|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_id|order_date|order_customer_id|order_status|
+-------------------+---------------------+-------------------+-------------------+--------+----------+-----------------+------------+
|                102|                    3|                  1|                500|     102|2013-08-15|                2|      CLOSED|
+-------------------+---------------------+-------------------+-------------------+--------+----------+-----------------+------------+

+-------------------+---------------------+-------------------+-------------------+
|order_item_order_id|order_item_product_id|orde