# Gradient-Boosted Tree Classification

## Imports

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Imputer

## Start Spark

In [2]:
spark = SparkSession.builder.master("local[*]").config("spark.driver.memory", "6g").getOrCreate()

23/03/24 09:53:00 WARN Utils: Your hostname, MacBook-Pro-de-Matheo.local resolves to a loopback address: 127.0.0.1; using 10.10.24.145 instead (on interface en0)
23/03/24 09:53:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/03/24 09:53:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Load of train set and labels and merge

In [3]:
input_train = spark.read.csv("./InputTrain.csv",header=True,inferSchema=True)
label_train = spark.read.csv("./StepOne_LabelTrain.csv",header=True,inferSchema=True)

train_data = input_train.join(label_train, on="Index",how="inner")

                                                                                

## Classification

### Creation of the assembler and apply on the merged data

In [4]:
# Assemble the features into a vector
assembler = VectorAssembler(inputCols=input_train.columns[:1], outputCol="features")
train_data = assembler.transform(train_data)

### Extract randomly taken training set and a validation set from the data

In [37]:
# Split the data into training and validation sets
(train_set, validation_set) = train_data.randomSplit([0.8, 0.2], seed=42)

### Creation of the GBT and the model

In [38]:
# Train the GBT model
models = []
for i in range(0,5):
    label_col = label_train.columns[2+i]
    gbt = GBTClassifier(labelCol=label_col,featuresCol="features",maxIter=1)
    model = gbt.fit(train_data)
    
    evaluator = MulticlassClassificationEvaluator(labelCol=label_col,metricName="weightedPrecision")
    predictions = model.transform(validation_set)
    weightedPrecision = evaluator.evaluate(predictions)
    print(f"Weighted Precision {label_col} = {weightedPrecision}")
    
    models.append(model)

                                                                                

Weighted Precision Washing Machine = 0.6493315901482685


                                                                                

Weighted Precision Dishwasher = 0.7823009062793221


                                                                                

Weighted Precision Tumble Dryer = 0.9342884111415322


                                                                                

Weighted Precision Microwave = 0.8145491338375916


[Stage 3045:>                                                       (0 + 8) / 8]

Weighted Precision Kettle = 0.78878060388347


                                                                                

### Creation of the evaluator and compute of the predictions

In [26]:
def impute_missing_values(df):
    """Imputes missing values in the DataFrame"""
    imputer = Imputer(inputCols=df.columns, outputCols=["{}_imputed".format(c) for c in df.columns])
    imputed_data = imputer.fit(df).transform(df)
    return imputed_data

def extract_features(df):
    """Extracts features from the DataFrame"""
    feature_cols = [c for c in df.columns if "imputed" in c and "index" not in c and "house_id" not in c]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    feature_data = assembler.transform(df)
    return feature_data

### Test on `InputTest.csv`

In [40]:
test_data = spark.read.csv("InputTest.csv", header=True, inferSchema=True)

# Apply the same transformations used for the training set
test_data_cleaned = test_data.drop('Index_Imputed')
test_data_features = test_data_cleaned

# Create a vector assembler to combine the features into a single column
vector_assembler = VectorAssembler(inputCols=test_data_features.columns[2:], outputCol='features')
test_data_with_features = vector_assembler.transform(test_data_features).select('Index', 'House_id', 'features')

# apply the trained model on the test data and generate predictions
test_predictions = {}
for i in range(0,5):
    label_col = label_train.columns[2+i]
    model = models[i]
    predictions = model.transform(test_data_with_features)
    predictions = predictions.select('index', 'House_id', 'prediction')
    test_predictions[label_col] = predictions.withColumnRenamed('prediction', label_col)

# merge the individual predictions into a single DataFrame
test_predictions_df = test_predictions[label_train.columns[2]]
for i in range(1,5):
    label_col = label_train.columns[2+i]
    test_predictions_df = test_predictions_df.join(test_predictions[label_col], ['Index'], 'inner')

test_predictions_df = test_predictions_df.drop('house_id')
# write the predictions to a CSV file
test_predictions_df.toPandas().to_csv('StepOne_TestPredictions.csv', index=False)

                                                                                