In [1]:
import findspark
findspark.init()  # run before importing pyspark, as in the original cell order
import pyspark

# getOrCreate() makes this cell safe to re-run: constructing SparkContext(...)
# directly raises "Cannot run multiple SparkContexts at once" when a context
# is already active in the kernel.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("MyAppName")
)
sc


In [2]:
from pyspark.sql import SparkSession
# Import the functions module under an alias instead of `min` / `max` by name:
# importing those names directly shadows Python's built-in min() and max()
# for every later cell in the notebook.
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Step 1: Start Spark
spark = (
    SparkSession.builder
    .appName("Price Normalization")
    .getOrCreate()
)

# Step 2: Sample data
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
columns = ["Pid", "P_Name", "Price"]
Product_DF = spark.createDataFrame(Product_Data, columns)

# Step 3: Compute min and max in ONE aggregation (a single Spark job rather
# than two separate select().collect() round trips).
Price_min, Price_max = Product_DF.agg(F.min("Price"), F.max("Price")).first()
Price_range = Price_max - Price_min

# Step 4: Apply Min-Max Normalization: (x - min) / (max - min)
if Price_range == 0:
    # Every price is identical, so the denominator would be zero (Spark gives
    # NULL, or an error when ANSI mode is enabled). Map non-null prices to 0.0.
    normalized_price = F.when(col("Price").isNotNull(), F.lit(0.0))
else:
    normalized_price = (col("Price") - Price_min) / Price_range

Product_DF_normalized = Product_DF.withColumn("Price_Normalized", normalized_price)

# Step 5: Show result
Product_DF_normalized.show()

# Step 6: Stop Spark
# NOTE: this also stops the underlying SparkContext; later cells obtain a new
# session through SparkSession.builder.getOrCreate().
spark.stop()


+---+------+-----+------------------+
|Pid|P_Name|Price|  Price_Normalized|
+---+------+-----+------------------+
|101|    P1|  450|               0.0|
|102|    P2| 4034|0.8258064516129032|
|103|    P3| 4790|               1.0|
+---+------+-----+------------------+



In [3]:
import math

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, mean, stddev, when

# Initialize SparkSession
spark = (
    SparkSession.builder
    .appName("Standardize Price")
    .getOrCreate()
)

# Create sample data
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
columns = ["Pid", "P_Name", "Price"]

Product_DF = spark.createDataFrame(Product_Data, columns)

# Calculate mean and standard deviation in ONE aggregation (a single Spark job
# rather than two separate select().collect() round trips).
# `stddev` is the sample standard deviation (n - 1 denominator).
Price_mean, Price_stddev = Product_DF.agg(
    mean(col("Price")), stddev(col("Price"))
).first()

# Standardize (z-score): (x - mean) / stddev
# The spread is unusable when it is 0 (all prices equal) or missing/NaN
# (presumably what Spark reports for a single row -- depends on Spark version).
stddev_unusable = (
    Price_stddev is None or Price_stddev == 0 or math.isnan(Price_stddev)
)
if stddev_unusable:
    # No spread to scale by: every non-null price sits at the mean, so map it
    # to 0.0 instead of dividing by zero / NULL.
    standardized_price = when(col("Price").isNotNull(), lit(0.0))
else:
    standardized_price = (col("Price") - Price_mean) / Price_stddev

Product_DF_standardized = Product_DF.withColumn(
    "Price_Standardized", standardized_price
)

# Show result
Product_DF_standardized.show()

# Stop Spark
# NOTE: this also stops the underlying SparkContext; later cells obtain a new
# session through SparkSession.builder.getOrCreate().
spark.stop()


+---+------+-----+-------------------+
|Pid|P_Name|Price| Price_Standardized|
+---+------+-----+-------------------+
|101|    P1|  450|-1.1392504769565068|
|102|    P2| 4034|0.40658762605161547|
|103|    P3| 4790| 0.7326628509048912|
+---+------+-----+-------------------+



In [4]:
from pyspark.sql import SparkSession

# Get (or create) a session that runs Spark locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MyAppName")
    .getOrCreate()
)

# One single-column row per employee: the categorical salary grade.
columns = ["Salary_Grade"]
salarydata = [
    ("Class A",),
    ("Class B",),
    ("Class C",),
    ("Class C",),
    ("Class A",),
    ("Class B",),
]

salary_df = spark.createDataFrame(salarydata, schema=columns)
salary_df.show()


+------------+
|Salary_Grade|
+------------+
|     Class A|
|     Class B|
|     Class C|
|     Class C|
|     Class A|
|     Class B|
+------------+



In [5]:
#label encoding
from pyspark.ml.feature import StringIndexer

# Initialize StringIndexer
indexer = StringIndexer(inputCol="Salary_Grade", outputCol="Salary_Grade_Index")

# Fit and Transform the DataFrame
salary_df_indexed = indexer.fit(salary_df).transform(salary_df)
salary_df_indexed.show()

+------------+------------------+
|Salary_Grade|Salary_Grade_Index|
+------------+------------------+
|     Class A|               0.0|
|     Class B|               1.0|
|     Class C|               2.0|
|     Class C|               2.0|
|     Class A|               0.0|
|     Class B|               1.0|
+------------+------------------+



In [6]:
#one hot encoding
from pyspark.ml.feature import OneHotEncoder

# Initialize OneHotEncoder
encoder = OneHotEncoder(inputCol="Salary_Grade_Index", outputCol="Salary_Grade_OneHotEncoding")

# Transform the DataFrame
salary_df_encoded = encoder.fit(salary_df_indexed).transform(salary_df_indexed)
salary_df_encoded.show()

+------------+------------------+---------------------------+
|Salary_Grade|Salary_Grade_Index|Salary_Grade_OneHotEncoding|
+------------+------------------+---------------------------+
|     Class A|               0.0|              (2,[0],[1.0])|
|     Class B|               1.0|              (2,[1],[1.0])|
|     Class C|               2.0|                  (2,[],[])|
|     Class C|               2.0|                  (2,[],[])|
|     Class A|               0.0|              (2,[0],[1.0])|
|     Class B|               1.0|              (2,[1],[1.0])|
+------------+------------------+---------------------------+



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or create) the Spark session used for this example.
spark = (
    SparkSession.builder
    .appName("PolynomialFeature")
    .getOrCreate()
)

# Sample product rows: (Pid, P_Name, Price)
columns = ["Pid", "P_Name", "Price"]
Product_Data = [(101, "P1", 450), (102, "P2", 4034), (103, "P3", 4790)]
Product_DF = spark.createDataFrame(Product_Data, schema=columns)

# Polynomial feature: Price raised to the power of 2.
price_squared = col("Price") ** 2
df_new = Product_DF.withColumn("Price_Squared", price_squared)

# Display the frame with the new column appended.
df_new.show()


+---+------+-----+-------------+
|Pid|P_Name|Price|Price_Squared|
+---+------+-----+-------------+
|101|    P1|  450|     202500.0|
|102|    P2| 4034|  1.6273156E7|
|103|    P3| 4790|    2.29441E7|
+---+------+-----+-------------+



In [8]:
# Interaction features are built by combining two or more existing features.
# This cell relies on `spark` and `col` already being defined by earlier cells.

# Sample rows: (Pname, Units_Sold, Price_Per_Unit)
columns = ["Pname", "Units_Sold", "Price_Per_Unit"]
data = [
    ("P1", 1, 500),
    ("P2", 2, 200),
    ("P3", 3, 300),
    ("P4", 4, 400),
]

df = spark.createDataFrame(data, schema=columns)

# Interaction feature: Total_Sales_Value = Units_Sold * Price_Per_Unit
total_sales_value = col("Units_Sold") * col("Price_Per_Unit")
df_interaction = df.withColumn("Total_Sales_Value", total_sales_value)
df_interaction.show()

+-----+----------+--------------+-----------------+
|Pname|Units_Sold|Price_Per_Unit|Total_Sales_Value|
+-----+----------+--------------+-----------------+
|   P1|         1|           500|              500|
|   P2|         2|           200|              400|
|   P3|         3|           300|              900|
|   P4|         4|           400|             1600|
+-----+----------+--------------+-----------------+

