# Decision Trees #

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import mean
from pyspark.ml.feature import StringIndexer , OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [17]:
# Start (or reuse) the Spark session. Note that `sc` is a SparkSession here,
# not a SparkContext; later cells call `sc.stop()` on it.
sc = (
    SparkSession.builder
    .appName("Decision Trees")
    .getOrCreate()
)

# Location of the input CSV file.
data_path = "/Users/arnavkarnik/Documents/MIT-Manipal_CSE-AI-ML/Year3/Big_Data_Analytics-Lab/data.csv"

# Treat the first row as the header and let Spark infer the column types.
df = sc.read.options(header=True, inferSchema=True).csv(data_path)
df.show()

+------+------+-------+-------------+----+
|   age|income|student|credit_rating|buys|
+------+------+-------+-------------+----+
| young|  high|     no|         fair|  no|
| young|  high|     no|    excellent|  no|
|middle|  high|     no|         fair| yes|
|senior|medium|     no|         fair| yes|
|senior|   low|    yes|         fair| yes|
|senior|   low|    yes|    excellent|  no|
|middle|   low|    yes|    excellent| yes|
| young|medium|     no|         fair|  no|
| young|   low|    yes|         fair| yes|
|senior|medium|    yes|         fair| yes|
| young|medium|    yes|    excellent| yes|
|middle|medium|     no|    excellent| yes|
|middle|  high|    yes|         fair| yes|
|senior|medium|     no|    excellent|  no|
+------+------+-------+-------------+----+



In [18]:
# Preprocessing cell: impute missing values, index + one-hot encode the
# categorical features, index the target, and assemble a single feature vector.
# NOTE: this cell rebinds `df` at every stage, so re-run the data-loading cell
# before re-running this one.

# Step 1: Handle Missing Values

# Define categorical and numerical columns
categorical_columns = ["age", "income", "student", "credit_rating"]
target_column = "buys"
# Numeric feature columns are detected from the schema. "bigint" and "float" are
# accepted alongside "int"/"double", and the target is excluded so the label can
# never be assembled into the feature vector.
numeric_dtypes = ("int", "bigint", "float", "double")
numerical_columns = [
    col_name
    for col_name, dtype in df.dtypes
    if dtype in numeric_dtypes and col_name != target_column
]

# Fill missing values for categorical columns with the mode
for column in categorical_columns:
    # Nulls are filtered out before counting: otherwise a column whose most
    # frequent "value" is null would yield None as the mode, which is not a
    # usable replacement value. Ties are broken by the value itself so the
    # chosen mode is deterministic.
    mode_row = (
        df.filter(col(column).isNotNull())
        .groupBy(column)
        .count()
        .orderBy(col("count").desc(), col(column).asc())
        .first()
    )
    # No row means the column has no non-null values, hence no mode to fill with.
    if mode_row is not None:
        df = df.fillna({column: mode_row[0]})

# Fill missing values for numerical columns with the mean
for column in numerical_columns:
    mean_value = df.select(mean(col(column))).first()[0]
    # The mean is None when the column has no non-null values; skip in that case.
    if mean_value is not None:
        df = df.fillna({column: mean_value})

print("Data after handling missing values:")
df.show()

# Step 2: Encode Categorical Features

# Apply StringIndexer and OneHotEncoder to each categorical column, keeping the
# *fitted* models in `indexers` / `encoders` (keyed by source column name).
indexers = {}
encoders = {}
for column in categorical_columns:
    # Index the column: string category -> numeric index in "<column>_index"
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    df = indexer.transform(df)
    indexers[column] = indexer

    # One-hot encode the indexed column into "<column>_encoded"
    encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_encoded").fit(df)
    df = encoder.transform(df)
    encoders[column] = encoder

# Index the target column into "<target_column>_index"
target_indexer = StringIndexer(inputCol=target_column, outputCol=target_column + "_index").fit(df)
df = target_indexer.transform(df)

# Display data after encoding
print("Data after encoding categorical features:")
df.show()

# Step 3: Assemble Features

# Assemble all encoded features and numerical columns into a single vector
assembler = VectorAssembler(
    inputCols=[column + "_encoded" for column in categorical_columns] + numerical_columns,
    outputCol="features"
)
df = assembler.transform(df)

# Select final columns for modeling: the feature vector and the indexed label
final_df = df.select("features", target_column + "_index")

print("Final Processed Data:")
final_df.show()

Data after handling missing values:
+------+------+-------+-------------+----+
|   age|income|student|credit_rating|buys|
+------+------+-------+-------------+----+
| young|  high|     no|         fair|  no|
| young|  high|     no|    excellent|  no|
|middle|  high|     no|         fair| yes|
|senior|medium|     no|         fair| yes|
|senior|   low|    yes|         fair| yes|
|senior|   low|    yes|    excellent|  no|
|middle|   low|    yes|    excellent| yes|
| young|medium|     no|         fair|  no|
| young|   low|    yes|         fair| yes|
|senior|medium|    yes|         fair| yes|
| young|medium|    yes|    excellent| yes|
|middle|medium|     no|    excellent| yes|
|middle|  high|    yes|         fair| yes|
|senior|medium|     no|    excellent|  no|
+------+------+-------+-------------+----+

Data after encoding categorical features:
+------+------+-------+-------------+----+---------+-------------+------------+--------------+-------------+---------------+-------------------+---

In [19]:
# Split the prepared data with 80/20 weights; the fixed seed pins the split.
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

# Step 2: Train a Decision Tree Model

# Configure a decision tree capped at depth 5 and fit it on the training rows.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol=f"{target_column}_index",
    maxDepth=5,
)
dt_model = dt.fit(train_df)

# Step 3: Make Predictions and Evaluate the Model

# Score the held-out rows with the fitted tree.
predictions = dt_model.transform(test_df)

# Compare the "prediction" column against the indexed target using accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=f"{target_column}_index",
)
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.50


24/11/05 17:21:37 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 10 (= number of training instances)


In [14]:
# Build one evaluator per metric; all three compare the "prediction" column
# against the indexed target column.
label_col = target_column + "_index"
evaluator_accuracy, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(
        labelCol=label_col, predictionCol="prediction", metricName=metric
    )
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
)

# Score the predictions produced by the decision-tree cell, in the order
# accuracy, precision, recall.
accuracy, precision, recall = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (evaluator_accuracy, evaluator_precision, evaluator_recall)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Shut down the Spark session held in `sc`
sc.stop()

Accuracy: 0.50
Precision: 0.83
Recall: 0.50


In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.sql.functions import mean, col

# Random Forest cell: self-contained pipeline that reloads the CSV, repeats the
# preprocessing, trains a random forest and reports accuracy / weighted
# precision / weighted recall on the held-out split.

# Initialize Spark Session (reuses an existing session if one is still active)
spark = SparkSession.builder.appName("RandomForestEvaluation").getOrCreate()

# Load the CSV file into a DataFrame (header row present, column types inferred)
data_path = "/Users/arnavkarnik/Documents/MIT-Manipal_CSE-AI-ML/Year3/Big_Data_Analytics-Lab/data.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Data Preparation

# Define categorical and numerical columns
categorical_columns = ["age", "income", "student", "credit_rating"]
target_column = "buys"
# Numeric feature columns are detected from the schema. "bigint" and "float" are
# accepted alongside "int"/"double", and the target is excluded so the label can
# never be assembled into the feature vector.
numeric_dtypes = ("int", "bigint", "float", "double")
numerical_columns = [
    col_name
    for col_name, dtype in df.dtypes
    if dtype in numeric_dtypes and col_name != target_column
]

# Fill missing values (mode for categorical, mean for numerical)
for column in categorical_columns:
    # Nulls are filtered out before counting: otherwise a column whose most
    # frequent "value" is null would yield None as the mode, which is not a
    # usable replacement value. Ties are broken by the value itself so the
    # chosen mode is deterministic.
    mode_row = (
        df.filter(col(column).isNotNull())
        .groupBy(column)
        .count()
        .orderBy(col("count").desc(), col(column).asc())
        .first()
    )
    # No row means the column has no non-null values, hence no mode to fill with.
    if mode_row is not None:
        df = df.fillna({column: mode_row[0]})

for column in numerical_columns:
    mean_value = df.select(mean(col(column))).first()[0]
    # The mean is None when the column has no non-null values; skip in that case.
    if mean_value is not None:
        df = df.fillna({column: mean_value})

# Encode Categorical Features: "<column>_index" then "<column>_encoded"
for column in categorical_columns:
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    df = indexer.transform(df)
    encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_encoded")
    df = encoder.fit(df).transform(df)

# Index the target column into "<target_column>_index"
target_indexer = StringIndexer(inputCol=target_column, outputCol=target_column + "_index").fit(df)
df = target_indexer.transform(df)

# Assemble Features
assembler = VectorAssembler(
    inputCols=[column + "_encoded" for column in categorical_columns] + numerical_columns,
    outputCol="features"
)
df = assembler.transform(df)

# Select final DataFrame: the feature vector and the indexed label
final_df = df.select("features", target_column + "_index")

# Split data into training and test sets (80/20 weights, fixed seed)
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

# Train Random Forest Model
# The seed is set explicitly so the forest's random sampling is reproducible
# across kernel sessions, matching the seeded train/test split above.
# NOTE(review): PySpark's default seed appears to be derived from a Python hash,
# which can vary between interpreter sessions - confirm against the PySpark docs.
rf = RandomForestClassifier(
    labelCol=target_column + "_index",
    featuresCol="features",
    numTrees=100,
    maxDepth=5,
    seed=42,
)
rf_model = rf.fit(train_df)

# Make Predictions on the held-out rows
predictions = rf_model.transform(test_df)

# Step 3: Evaluate the Model

# Initialize evaluator for different metrics
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol=target_column + "_index", predictionCol="prediction", metricName="accuracy"
)
evaluator_precision = MulticlassClassificationEvaluator(
    labelCol=target_column + "_index", predictionCol="prediction", metricName="weightedPrecision"
)
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol=target_column + "_index", predictionCol="prediction", metricName="weightedRecall"
)

# Calculate accuracy, precision, and recall
accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Stop the Spark session
spark.stop()


24/11/05 17:21:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/11/05 17:21:41 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 10 (= number of training instances)


Accuracy: 0.75
Precision: 0.88
Recall: 0.75
