# Decision Tree Classification with PySpark #

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

In [6]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook
spark = (
    SparkSession.builder
    .appName("PredictiveModel")
    .getOrCreate()
)

# Explicit schema; every column is declared nullable because the sample rows
# below deliberately contain gaps
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Age", IntegerType()),
        ("Income", DoubleType()),
        ("Gender", StringType()),
        ("Purchased", IntegerType()),  # 1 for 'Yes', 0 for 'No'
    ]
])

# In-memory sample rows, laid out as (Age, Income, Gender, Purchased)
data = [
    (25, 50000.0, "Male", 1),
    (45, 64000.0, "Female", 0),
    (35, 57000.0, "Female", 1),
    (50, None, "Male", 0),  # Income is missing
    (23, 52000.0, None, 1),  # Gender is missing
    (31, 60000.0, "Female", 0),
    (38, 58000.0, "Male", 1),
]

# Materialise the rows as a Spark DataFrame using the schema above
df = spark.createDataFrame(data, schema=schema)

# Inspect the result: column types, the raw rows, then summary statistics
df.printSchema()
df.show()
df.describe().show()


root
 |-- Age: integer (nullable = true)
 |-- Income: double (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Purchased: integer (nullable = true)

+---+-------+------+---------+
|Age| Income|Gender|Purchased|
+---+-------+------+---------+
| 25|50000.0|  Male|        1|
| 45|64000.0|Female|        0|
| 35|57000.0|Female|        1|
| 50|   NULL|  Male|        0|
| 23|52000.0|  NULL|        1|
| 31|60000.0|Female|        0|
| 38|58000.0|  Male|        1|
+---+-------+------+---------+

+-------+------------------+------------------+------+------------------+
|summary|               Age|            Income|Gender|         Purchased|
+-------+------------------+------------------+------+------------------+
|  count|                 7|                 6|     6|                 7|
|   mean|35.285714285714285|56833.333333333336|  NULL|0.5714285714285714|
| stddev|  9.94508732514511| 5154.286242213044|  NULL|0.5345224838248488|
|    min|                23|           50000.0|Female|

In [7]:
# Replace the missing "Gender" value with an explicit "Unknown" category so the
# column contains no nulls before it is indexed.
df = df.fillna({"Gender": "Unknown"})

# Step 1: Impute missing "Income" values with the column mean (spelled out
# explicitly so the choice is visible to the reader).
imputer = Imputer(
    inputCols=["Income"],
    outputCols=["Income_imputed"],
    strategy="mean",
)

# Step 2: Map each "Gender" string to a numeric category index.
gender_indexer = StringIndexer(inputCol="Gender", outputCol="Gender_indexed")

# Step 3: One-hot encode the indexed "Gender" column into a sparse vector.
gender_encoder = OneHotEncoder(inputCol="Gender_indexed", outputCol="Gender_encoded")

# Chain the three transformations so they are fitted and applied together.
pipeline = Pipeline(stages=[imputer, gender_indexer, gender_encoder])

# Fit the pipeline and transform the data.
# Errors are deliberately NOT caught here: swallowing them would leave
# `df_transformed` undefined and make the following cells fail with an
# unrelated NameError instead of the real cause.
# NOTE: the pipeline is fitted on every row, before the later train/test split,
# so the imputed mean also reflects rows that end up in the test set.
preprocessing_model = pipeline.fit(df)
df_transformed = preprocessing_model.transform(df)
df_transformed.show()

+---+-------+-------+---------+------------------+--------------+--------------+
|Age| Income| Gender|Purchased|    Income_imputed|Gender_indexed|Gender_encoded|
+---+-------+-------+---------+------------------+--------------+--------------+
| 25|50000.0|   Male|        1|           50000.0|           1.0| (2,[1],[1.0])|
| 45|64000.0| Female|        0|           64000.0|           0.0| (2,[0],[1.0])|
| 35|57000.0| Female|        1|           57000.0|           0.0| (2,[0],[1.0])|
| 50|   NULL|   Male|        0|56833.333333333336|           1.0| (2,[1],[1.0])|
| 23|52000.0|Unknown|        1|           52000.0|           2.0|     (2,[],[])|
| 31|60000.0| Female|        0|           60000.0|           0.0| (2,[0],[1.0])|
| 38|58000.0|   Male|        1|           58000.0|           1.0| (2,[1],[1.0])|
+---+-------+-------+---------+------------------+--------------+--------------+



In [8]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Pack the numeric columns and the encoded gender vector into one "features" column
assembler = VectorAssembler(
    inputCols=["Age", "Income_imputed", "Gender_encoded"],
    outputCol="features",
)
data = assembler.transform(df_transformed)

# 70/30 train/test split with a fixed seed
train_data, test_data = data.randomSplit(
    [0.7, 0.3],
    seed=42,
)

# Fit a decision tree that predicts "Purchased" from the assembled features
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Purchased",
)
model = dt.fit(train_data)


24/11/05 19:27:02 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 4 (= number of training instances)


In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out rows with the trained tree
predictions = model.transform(test_data)


def _purchase_evaluator(metric_name):
    """Build an evaluator that compares "prediction" with "Purchased" for one metric."""
    return MulticlassClassificationEvaluator(
        labelCol="Purchased",
        predictionCol="prediction",
        metricName=metric_name,
    )


# One evaluator per reported metric; precision and recall are the weighted variants
accuracy_evaluator = _purchase_evaluator("accuracy")
precision_evaluator = _purchase_evaluator("weightedPrecision")
recall_evaluator = _purchase_evaluator("weightedRecall")

# Compute each metric on the test-set predictions
accuracy = accuracy_evaluator.evaluate(predictions)
precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)

# Report the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
