# **Lab 5: Machine Learning Algorithms**

Name: Madhavi Chitnis

PRN: 20220802070



# **Decision Trees**



---



In [None]:
# Colab-only helper: prompt for a file upload so that pima.csv is present in
# the working directory before the "Load Dataset" step reads it.
from google.colab import files
uploaded = files.upload()

Saving pima.csv to pima.csv


### **Step: 1 Import necessary libraries**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

### **Step: 2 Initialize Spark Session**

In [None]:
# Build the SparkSession used by every later cell (reuses an existing
# session if one is already running in this kernel).
spark = (
    SparkSession.builder
    .appName("Decision Trees")
    .getOrCreate()
)

### **Step: 3 Load Dataset**

In [None]:
# Read the uploaded CSV into a Spark DataFrame: the first row supplies the
# column names and column types are inferred from the data.
csv_path = "pima.csv"
df = spark.read.csv(csv_path, inferSchema=True, header=True)


### **Step: 4 Data Preprocessing**

In [None]:
# Show the inferred column names and types to confirm the CSV parsed as numeric data.
df.printSchema()

root
 |-- preg: integer (nullable = true)
 |-- plas: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- skin: integer (nullable = true)
 |-- test: integer (nullable = true)
 |-- mass: double (nullable = true)
 |-- pedi: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- class: integer (nullable = true)



In [None]:
# Summary statistics for every column, printed as a table.
summary_df = df.describe()
summary_df.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|              preg|             plas|              pres|              skin|              test|              mass|              pedi|               age|             class|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|               768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.952217567727642|115.244002351338

### **Handle Missing Values (if any) (Replacing 0s with median)**

In [None]:
# In these measurement columns this notebook treats 0 as a placeholder for a
# missing reading and imputes it with the column median.
columns_to_fix = ["plas", "pres", "skin", "test", "mass"]
for col_name in columns_to_fix:
    # Cast once up front so the original values and the (float) median share a type.
    df = df.withColumn(col_name, col(col_name).cast("double"))

    # Take the median over the observed (non-zero) values only. Including the
    # placeholder zeros would pull the median toward 0 -- badly so for a
    # column where a large share of the rows are 0.
    observed = df.filter(col(col_name) != 0)
    quantiles = observed.approxQuantile(col_name, [0.5], 0.0)  # relativeError 0.0 => exact
    if not quantiles:
        # No non-zero values to learn a median from; leave the column untouched.
        continue
    median_value = quantiles[0]

    # Replace the 0 placeholders with the median of the observed values.
    df = df.withColumn(
        col_name,
        when(col(col_name) == 0, median_value).otherwise(col(col_name)),
    )

### **Step: 5 Assemble Features into a Single Vector Column**

In [None]:
# Every column except the target ("class") is a model input.
feature_cols = [name for name in df.columns if name != "class"]

# Pack the input columns into one "features" vector column, then keep only
# that vector and the target, renamed to "label".
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = vector_assembler.transform(df)
df = assembled_df.select("features", col("class").alias("label"))

### **Step: 6 Split Data into Training and Testing Sets**

In [None]:
# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train_df, test_df = df.randomSplit(split_weights, seed=42)

### **Step: 7 Train Decision Tree Model**

In [None]:
# Decision tree limited to depth 5, fitted on the training split only.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=5,
)
dt_model = dt.fit(train_df)

### **Step: 8 Make Predictions**

In [None]:
# Score the held-out test split with the fitted tree; the evaluation step reads this DataFrame.
predictions = dt_model.transform(test_df)

### **Step: 9 Model Evaluation**

In [None]:
# Three multiclass evaluators that differ only in the metric they report.
# Note that precision and recall here are the *weighted* variants.
accuracy_eval, precision_eval, recall_eval = (
    MulticlassClassificationEvaluator(labelCol="label", metricName=metric)
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
)
# Area under the ROC curve comes from the binary evaluator.
auc_eval = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

# Run each evaluator against the test-set predictions, in the same order as above.
accuracy, precision, recall, auc = (
    evaluator.evaluate(predictions)
    for evaluator in (accuracy_eval, precision_eval, recall_eval, auc_eval)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUC: {auc:.4f}")

Accuracy: 0.7561
Precision: 0.7504
Recall: 0.7561
AUC: 0.7640


### **Step: 10 Feature Importance**

In [None]:
feature_importance = dt_model.featureImportances
# Pair each importance with the feature column at the same position.
for feature_name, importance in zip(feature_cols, feature_importance):
    print(f"Feature {feature_name}: {importance:.4f}")

Feature preg: 0.0416
Feature plas: 0.4950
Feature pres: 0.0234
Feature skin: 0.0227
Feature test: 0.0224
Feature mass: 0.1551
Feature pedi: 0.0592
Feature age: 0.1807


### **Stop Spark Session**

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()