Q1 — Car Evaluation Classification

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

Create Spark Session

In [2]:
# Obtain the notebook's Spark session (reuses an already-running one if present).
spark = (
    SparkSession.builder
    .appName("CarPriceClassification")
    .getOrCreate()
)

Load dataset

In [3]:
data_path = "car_data.csv"  # Change file name if different

# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(data_path, header=True, inferSchema=True)

print("Dataset Loaded Successfully!")
# Quick sanity check: inferred schema plus the first few rows.
df.printSchema()
df.show(5)

Dataset Loaded Successfully!
root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Engine Fuel Type: string (nullable = true)
 |-- Engine HP: integer (nullable = true)
 |-- Engine Cylinders: integer (nullable = true)
 |-- Transmission Type: string (nullable = true)
 |-- Driven_Wheels: string (nullable = true)
 |-- Number of Doors: integer (nullable = true)
 |-- Market Category: string (nullable = true)
 |-- Vehicle Size: string (nullable = true)
 |-- Vehicle Style: string (nullable = true)
 |-- highway MPG: integer (nullable = true)
 |-- city mpg: integer (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- MSRP: integer (nullable = true)

+----+----------+----+--------------------+---------+----------------+-----------------+----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
|Make|     Model|Year|    Engine Fuel Type|Engine HP|Engine Cylin

Drop rows with missing MSRP

In [4]:
# Rows without an MSRP cannot be assigned a price category, so remove them.
df = df.dropna(subset=["MSRP"])

Create target variable (price category)

In [5]:
# Bucket MSRP into three classes:
#   Low    : MSRP < 20000
#   Medium : 20000 <= MSRP <= 40000 (both bounds inclusive)
#   High   : everything else
msrp = col("MSRP")
price_bucket = (
    when(msrp < 20000, "Low")
    .when(msrp.between(20000, 40000), "Medium")
    .otherwise("High")
)
df = df.withColumn("price_category", price_bucket)


Drop unnecessary columns (Make, Model, Market Category); Year is kept in the DataFrame but is not used as a feature

In [6]:
# Columns removed from the DataFrame before feature engineering.
columns_to_drop = ["Make", "Model", "Market Category"]
df = df.drop(*columns_to_drop)

Identify categorical and numerical features

In [7]:
# String-typed features: indexed and one-hot encoded later in the pipeline.
categorical_cols = [
    "Engine Fuel Type",
    "Transmission Type",
    "Driven_Wheels",
    "Vehicle Size",
    "Vehicle Style",
]

# Integer-typed features: passed to the feature vector as-is.
numeric_cols = [
    "Engine HP",
    "Engine Cylinders",
    "Number of Doors",
    "highway MPG",
    "city mpg",
    "Popularity",
]

Handle missing numeric values

In [8]:
# Replace nulls in every numeric feature column with 0 in a single pass.
df = df.na.fill(0, subset=numeric_cols)

StringIndexer + OneHotEncoder for categorical columns

In [9]:
# Derived column names for the two encoding stages.
indexed_names = [f"{name}_idx" for name in categorical_cols]
encoded_names = [f"{name}_ohe" for name in categorical_cols]

# One StringIndexer per categorical column; handleInvalid="keep" is set so
# invalid values are kept rather than raising.
indexers = [
    StringIndexer(inputCol=source, outputCol=target, handleInvalid="keep")
    for source, target in zip(categorical_cols, indexed_names)
]

# A single encoder turns all indexed columns into one-hot vectors.
encoder = OneHotEncoder(inputCols=indexed_names, outputCols=encoded_names)

Index target variable

In [10]:
# Convert the string price category into the numeric "label" column used by the classifier.
label_indexer = StringIndexer(
    inputCol="price_category",
    outputCol="label",
)

Assemble all features

In [11]:
# Feature vector inputs: one-hot encoded categoricals first, then the raw numeric columns.
feature_inputs = [f"{name}_ohe" for name in categorical_cols]
feature_inputs.extend(numeric_cols)

assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features_raw")

# Scale the assembled vector; the scaled output is what the classifier consumes.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
)

Build model

In [12]:
# Random forest over the scaled feature vector; the seed is fixed for reproducible runs.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    seed=42,
)

Create pipeline

In [13]:
# Stage order: index categoricals -> one-hot encode -> index label -> assemble -> scale -> classify.
pipeline_stages = [
    *indexers,
    encoder,
    label_indexer,
    assembler,
    scaler,
    rf,
]
pipeline = Pipeline(stages=pipeline_stages)

Split data

In [14]:
# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)

Train model

In [15]:
# Fit every pipeline stage (indexers, encoder, scaler, random forest) on the training split only.
model = pipeline.fit(train)

In [16]:
# Apply the fitted pipeline to the held-out split; later cells read the "prediction" and "label" columns.
preds = model.transform(test)

Evaluate

In [17]:
# Overall accuracy of the predictions on the held-out split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(preds)
print(f"\nModel Accuracy: {accuracy:.4f}")


Model Accuracy: 0.7899


Classification report, with a guard for an empty set of predictions

In [18]:
# Per-class precision/recall/F1 and the confusion matrix, computed with the
# RDD-based MulticlassMetrics API.
#
# head(1) fetches at most one row, so the emptiness guard does not need the
# full count() over the predictions.
if not preds.head(1):
    print("No predictions available; test set may be empty or has issues.")
else:
    # MulticlassMetrics expects an RDD of (prediction, label) float pairs.
    preds_and_labels = preds.select("prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
    metrics = MulticlassMetrics(preds_and_labels)

    print("\nClassification Report")
    # distinct() returns the labels in arbitrary order; sort them so the report
    # rows line up with the confusion matrix, which is ordered by ascending
    # class label.
    labels = sorted(preds_and_labels.map(lambda x: x[1]).distinct().collect())
    for label in labels:
        print(f"Class {label}: Precision={metrics.precision(label):.3f}, Recall={metrics.recall(label):.3f}, F1={metrics.fMeasure(label):.3f}")

    # Rows are actual classes, columns are predicted classes, both in ascending label order.
    print("\nConfusion Matrix")
    print(metrics.confusionMatrix().toArray())




Classification Report
Class 2.0: Precision=0.844, Recall=0.613, F1=0.710
Class 0.0: Precision=0.742, Recall=0.899, F1=0.813
Class 1.0: Precision=0.879, Recall=0.724, F1=0.794

Confusion Matrix
[[1053.   64.   54.]
 [ 185.  486.    0.]
 [ 181.    3.  292.]]


Save model

In [19]:
# Persist the fitted pipeline. A plain save() raises if the target path already
# exists, which breaks re-running the notebook; overwrite() replaces the
# previously saved model instead.
model.write().overwrite().save("models/car_price_rf_model")
print("\nModel saved successfully as 'models/car_price_rf_model'")


Model saved successfully as 'models/car_price_rf_model'


In [20]:
# Shut down the Spark session now that training, evaluation and saving are done.
spark.stop()