In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [3]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Car evaluation") \
    .getOrCreate()

# Step 1: Read the dataset
input_file = r"D:\Data Science\car_evaluation.csv"
data = spark.read.csv(input_file, header=True, inferSchema=True)


In [4]:
data.head()

Row(vhigh0='vhigh', vhigh1='vhigh', 22='2', 23='2', small='small', low='med', unacc='unacc')

In [14]:
# Step 1: Assemble feature columns into a feature vector
feature_assembler = VectorAssembler(inputCols=[
    "vhigh0_index",
    "vhigh1_index",
    "small_index",
    "low_index",
], outputCol="features")

# Step 2: Create a Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="unacc_index")

# Step 3: Create a pipeline with the assembler and logistic regression
pipeline = Pipeline(stages=[feature_assembler, lr])

# Step 4: Fit the model
model = pipeline.fit(new_df)

# Step 5: Transform the data to get predictions
data_transformed = model.transform(new_df)

In [15]:
# Step 6: Evaluate the model using different metrics
evaluator = MulticlassClassificationEvaluator(
    labelCol="unacc_index", predictionCol="prediction", metricName="accuracy"
)

# Compute accuracy
accuracy = evaluator.evaluate(data_transformed)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics: F1-score, precision, recall
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="unacc_index", predictionCol="prediction", metricName="f1"
)
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="unacc_index", predictionCol="prediction", metricName="weightedPrecision"
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="unacc_index", predictionCol="prediction", metricName="weightedRecall"
)

f1_score = f1_evaluator.evaluate(data_transformed)
precision = precision_evaluator.evaluate(data_transformed)
recall = recall_evaluator.evaluate(data_transformed)

# Print additional metrics
print(f"F1 Score: {f1_score}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.6844238563983787
F1 Score: 0.6460104226983208
Precision: 0.6138909010508511
Recall: 0.6844238563983787


In [5]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# List of columns to encode
columns_to_encode = ['vhigh0', 'vhigh1', 'small', 'low', 'unacc']

# Create a list of StringIndexers for each column
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in columns_to_encode]

# Create a pipeline to apply all indexers
pipeline_encode = Pipeline(stages=indexers)

# Fit and transform the data using the pipeline
model = pipeline_encode.fit(data)
encoded_data = model.transform(data)

# Show the encoded data
encoded_data.select([col + "_index" for col in columns_to_encode]).show()


+------------+------------+-----------+---------+-----------+
|vhigh0_index|vhigh1_index|small_index|low_index|unacc_index|
+------------+------------+-----------+---------+-----------+
|         3.0|         3.0|        2.0|      1.0|        0.0|
|         3.0|         3.0|        2.0|      0.0|        0.0|
|         3.0|         3.0|        1.0|      2.0|        0.0|
|         3.0|         3.0|        1.0|      1.0|        0.0|
|         3.0|         3.0|        1.0|      0.0|        0.0|
|         3.0|         3.0|        0.0|      2.0|        0.0|
|         3.0|         3.0|        0.0|      1.0|        0.0|
|         3.0|         3.0|        0.0|      0.0|        0.0|
|         3.0|         3.0|        2.0|      2.0|        0.0|
|         3.0|         3.0|        2.0|      1.0|        0.0|
|         3.0|         3.0|        2.0|      0.0|        0.0|
|         3.0|         3.0|        1.0|      2.0|        0.0|
|         3.0|         3.0|        1.0|      1.0|        0.0|
|       

In [8]:
columns_to_encode = ['vhigh0', 'vhigh1', 'small', 'low', 'unacc']
columns_without_encoding = ['22', '23']
final_columns = [col + "_index" for col in columns_to_encode] + columns_without_encoding
# Create a new DataFrame with the required columns
new_df = encoded_data.select(final_columns)

# Show the resulting DataFrame
new_df.show()

+------------+------------+-----------+---------+-----------+---+----+
|vhigh0_index|vhigh1_index|small_index|low_index|unacc_index| 22|  23|
+------------+------------+-----------+---------+-----------+---+----+
|         3.0|         3.0|        2.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        2.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      2.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      2.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        2.0|      2.0|        0.0|  2|   4|
|         3.0|         3.0|        2.0|      1.0|        0.0|  2|   4|
|         3.0|         3.0|        2.0|      0.0|        0.0|  2|   4|
|     

In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler


# Show the initial data
print("Initial Data:")
new_df.show()

# Step 2: Data Preprocessing

# Convert the label column to numerical values
label_indexer = StringIndexer(inputCol="unacc_index", outputCol="label")  # Ensure "diagnosis" is your label column

# Drop existing features column if it exists
if "features" in new_df.columns:
    new_df = new_df.drop("features")

# Create feature vector
feature_assembler = VectorAssembler(inputCols=[
    "vhigh0_index",
    "vhigh1_index",
    "small_index",
    "low_index"
], outputCol="features")

# Create the pipeline
pipeline = Pipeline(stages=[label_indexer, feature_assembler])

# Fit the model (this applies both transformations in the pipeline)
model = pipeline.fit(new_df)

# Transform the data with the fitted model
data_transformed = model.transform(new_df)

# Show the transformed data
data_transformed.select("label", "features").show()

Initial Data:
+------------+------------+-----------+---------+-----------+---+----+
|vhigh0_index|vhigh1_index|small_index|low_index|unacc_index| 22|  23|
+------------+------------+-----------+---------+-----------+---+----+
|         3.0|         3.0|        2.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        2.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      2.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        1.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      2.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      1.0|        0.0|  2|   2|
|         3.0|         3.0|        0.0|      0.0|        0.0|  2|   2|
|         3.0|         3.0|        2.0|      2.0|        0.0|  2|   4|
|         3.0|         3.0|        2.0|      1.0|        0.0|  2|   4|
|         3.0|         3.0|        2.0|      0.0|        0.0|  

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Initialize evaluator for accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="unacc_index", metricName="accuracy")

# Initialize evaluator for F1 score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="unacc_index", metricName="f1")

# Compute accuracy and F1 score
accuracy = evaluator_accuracy.evaluate(data_transformed)
f1_score = evaluator_f1.evaluate(data_transformed)

# Print the evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 1.0000
F1 Score: 1.0000
