# Decision Trees #

In [4]:
import os

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer , OneHotEncoder, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import mean


In [7]:
# Start (or reuse) the Spark session backing this notebook.
# NOTE(review): `sc` holds a SparkSession, not a SparkContext; the name is kept
# as-is in case other cells refer to it.
sc = SparkSession.builder.appName("Decision Trees").getOrCreate()

# Location of the input CSV. Set the DECISION_TREE_DATA_PATH environment
# variable to run the notebook against a copy stored elsewhere; when it is
# unset the original location is used.
data_path = os.environ.get(
    "DECISION_TREE_DATA_PATH",
    "/Users/arnavkarnik/Documents/MIT-Manipal_CSE-AI-ML/Year3/Big_Data_Analytics-Lab/data.csv",
)

# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data instead of reading everything as text.
df = sc.read.csv(data_path, header=True, inferSchema=True)
df.show()

+------+------+-------+-------------+----+
|   age|income|student|credit_rating|buys|
+------+------+-------+-------------+----+
| young|  high|     no|         fair|  no|
| young|  high|     no|    excellent|  no|
|middle|  high|     no|         fair| yes|
|senior|medium|     no|         fair| yes|
|senior|   low|    yes|         fair| yes|
|senior|   low|    yes|    excellent|  no|
|middle|   low|    yes|    excellent| yes|
| young|medium|     no|         fair|  no|
| young|   low|    yes|         fair| yes|
|senior|medium|    yes|         fair| yes|
| young|medium|    yes|    excellent| yes|
|middle|medium|     no|    excellent| yes|
|middle|  high|    yes|         fair| yes|
|senior|medium|     no|    excellent|  no|
+------+------+-------+-------------+----+



In [8]:
# Step 1: Handle Missing Values
#
# Categorical features are imputed with their most frequent non-null value and
# numerical features with their mean. A column that has no non-null value at
# all is left untouched, because there is nothing to impute from.

# Define categorical and numerical columns
categorical_columns = ["age", "income", "student", "credit_rating"]
target_column = "buys"

# Spark SQL type names (as reported by df.dtypes) that count as numerical.
numeric_dtypes = ("tinyint", "smallint", "int", "bigint", "float", "double")
# The label and the declared categorical columns must never be picked up as
# numerical features, whatever their storage type is.
excluded_columns = set(categorical_columns) | {target_column}
numerical_columns = [
    col_name
    for col_name, dtype in df.dtypes
    if dtype in numeric_dtypes and col_name not in excluded_columns
]

# Replacement value per column; applied in a single fillna call below.
fill_values = {}

# Fill missing values for categorical columns with the mode
for column in categorical_columns:
    mode_row = (
        # groupBy keeps NULL as a group of its own, so drop nulls first;
        # otherwise the "mode" of a mostly-missing column would be None.
        df.where(col(column).isNotNull())
        .groupBy(column)
        .count()
        # Secondary sort key makes the choice deterministic when counts tie.
        .orderBy(col("count").desc(), col(column).asc())
        .first()
    )
    # first() returns None when the column has no non-null value.
    if mode_row is not None:
        mode_value = mode_row[0]
        fill_values[column] = mode_value

# Fill missing values for numerical columns with the mean
for column in numerical_columns:
    mean_value = df.select(mean(col(column))).first()[0]
    # The mean is None when every value in the column is null.
    if mean_value is not None:
        fill_values[column] = mean_value

if fill_values:
    df = df.fillna(fill_values)

print("Data after handling missing values:")
df.show()

# Step 2: Encode Categorical Features
#
# Every categorical feature gets two extra columns on `df`:
#   <name>_index    - numeric label index produced by a fitted StringIndexer
#   <name>_encoded  - one-hot vector produced from <name>_index
# The target column is only indexed (no one-hot encoding).

# Per-column transformers, keyed by the original column name.
# NOTE(review): `indexers` keeps the *fitted* StringIndexer model, whereas
# `encoders` keeps the *unfitted* OneHotEncoder estimator - confirm that this
# asymmetry is intended before reusing `encoders` on new data.
indexers, encoders = {}, {}
for feature in categorical_columns:
    index_col = feature + "_index"
    encoded_col = feature + "_encoded"

    # String labels -> numeric indices
    fitted_indexer = StringIndexer(inputCol=feature, outputCol=index_col).fit(df)
    indexers[feature] = fitted_indexer
    df = fitted_indexer.transform(df)

    # Numeric indices -> one-hot vectors
    one_hot = OneHotEncoder(inputCol=index_col, outputCol=encoded_col)
    encoders[feature] = one_hot
    df = one_hot.fit(df).transform(df)

# The label column is indexed the same way and lands in "<target>_index".
target_indexer = StringIndexer(
    inputCol=target_column,
    outputCol=target_column + "_index",
).fit(df)
df = target_indexer.transform(df)

# Show the frame with all index / encoded columns appended
print("Data after encoding categorical features:")
df.show()

# Step 3: Assemble Features
#
# Concatenate the one-hot vectors of the categorical features, followed by the
# raw numerical columns, into one "features" vector column.
feature_inputs = [name + "_encoded" for name in categorical_columns]
feature_inputs.extend(numerical_columns)

assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
df = assembler.transform(df)

# Keep only what the model needs: the feature vector and the indexed label.
label_column = target_column + "_index"
final_df = df.select("features", label_column)

print("Final Processed Data:")
final_df.show()

Data after handling missing values:
+------+------+-------+-------------+----+
|   age|income|student|credit_rating|buys|
+------+------+-------+-------------+----+
| young|  high|     no|         fair|  no|
| young|  high|     no|    excellent|  no|
|middle|  high|     no|         fair| yes|
|senior|medium|     no|         fair| yes|
|senior|   low|    yes|         fair| yes|
|senior|   low|    yes|    excellent|  no|
|middle|   low|    yes|    excellent| yes|
| young|medium|     no|         fair|  no|
| young|   low|    yes|         fair| yes|
|senior|medium|    yes|         fair| yes|
| young|medium|    yes|    excellent| yes|
|middle|medium|     no|    excellent| yes|
|middle|  high|    yes|         fair| yes|
|senior|medium|     no|    excellent|  no|
+------+------+-------+-------------+----+

Data after encoding categorical features:
+------+------+-------+-------------+----+---------+-------------+------------+--------------+-------------+---------------+-------------------+---