## Imports

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

## Start Spark

In [None]:
# Local-mode Spark session ("local[*]" master) with 8g of driver memory.
# getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

## Load the train and test sets and the appliance labels, then merge

In [None]:
# Input data: one CSV for the train split and one for the test split.
train = pd.read_csv("./InputTrain.csv")
test = pd.read_csv("./InputTest.csv")

# Training labels: one CSV per appliance, read in a fixed order.
label_paths = [
    "./StepTwo_LabelTrain_Dishwasher.csv",
    "./StepTwo_LabelTrain_Kettle.csv",
    "./StepTwo_LabelTrain_Microwave.csv",
    "./StepTwo_LabelTrain_TumbleDryer.csv",
    "./StepTwo_LabelTrain_WashingMachine.csv",
]
appliances = [pd.read_csv(path) for path in label_paths]

# Keep the individual frames available under their own names as well.
dishwasher, kettle, microwave, tumble_dryer, washing_machine = appliances


In [None]:
# Convert the pandas training frame to a Spark DataFrame and pack every
# column after the first two into a single "features" vector column.
# NOTE(review): the first two columns are presumably Index and House_id;
# confirm against the CSV header.
train_spark = spark.createDataFrame(train)
feature_cols = train_spark.columns[2:]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Keep only the row identifier and the assembled vector.
assembled_train = assembler.transform(train_spark)
train_df = assembled_train.select("Index", "features")

In [None]:
def train_appliance_model(appliance_name):
    """Fit a linear regression model that predicts one appliance column.

    Reads the module-level ``train_spark`` DataFrame. Every column other than
    ``House_id``, ``Timestep_0`` .. ``Timestep_2159`` and the target column
    itself is assembled into the feature vector.

    Args:
        appliance_name: Name of the column in ``train_spark`` that holds the
            regression target. It is cast to float before fitting.

    Returns:
        The fitted model returned by ``LinearRegression.fit``.

    NOTE(review): ``train_spark`` must already contain a column named
    ``appliance_name``; the label CSVs loaded earlier in the notebook are not
    merged into it by this function, and Spark will reject the select below
    if the column is missing. Excluding the ``Timestep_*`` columns from the
    features (rather than using them as the features) also looks inverted --
    confirm the intended inputs.
    """
    # Columns that must not be used as model inputs. The target is excluded
    # explicitly so the label cannot leak into the feature vector.
    excluded = {"House_id", appliance_name}
    excluded.update(f"Timestep_{i}" for i in range(2160))
    feature_cols = [name for name in train_spark.columns if name not in excluded]

    # select feature and target columns, with the target cast to float
    df = train_spark.select(feature_cols + [appliance_name]).withColumn(
        appliance_name, col(appliance_name).cast("float")
    )

    # assemble feature vector
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    assembled_df = assembler.transform(df).select("features", appliance_name)

    # train linear regression model; labelCol must name the target column,
    # otherwise the estimator looks for its default "label" column
    lr = LinearRegression(
        featuresCol="features",
        labelCol=appliance_name,
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
    )
    return lr.fit(assembled_df)

In [None]:
# Target column names, one per appliance; a model is trained for each, and
# `models` keeps the same order as `appliance_names`.
appliance_names = [
    "Dishwasher",
    "Kettle",
    "Microwave",
    "Tumble Dryer",
    "Washing Machine",
]
models = list(map(train_appliance_model, appliance_names))