## preparation

In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
spark = SparkSession.builder.appName("ass4_q1").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/25 13:29:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/25 13:29:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/05/25 13:29:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/05/25 13:29:20 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## Q1

### data preprocess

In [4]:
train = spark.read.format("csv").load("data/Q1_house_price_data/train.csv", header=True, inferSchema=True)
test = spark.read.format("csv").load("data/Q1_house_price_data/test.csv", header=True, inferSchema=True)

In [5]:
# numericalCols = [x.name for x in train.schema.fields if x.dataType == IntegerType()]
# numericalCols = [x for x in numericalCols if x != "id" and x != "SalePrice"]
# categoricalCols = [field for (field, dataType) in train.dtypes if dataType == "string"]

In [6]:
numericalCols = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

fields = [field.name for field in train.schema]

categoricalCols = list(set(fields) - set(numericalCols) - set(["Id","SalePrice"]))
print("The categorical fields are:\n", categoricalCols)
print()
print("The numerical fields are:\n",numericalCols)

The categorical fields are:
 ['Functional', 'MoSold', 'LotShape', 'YearBuilt', 'KitchenAbvGr', 'PavedDrive', 'FireplaceQu', 'BsmtFullBath', 'BsmtExposure', 'Electrical', 'GarageYrBlt', 'Street', 'PoolQC', 'MSSubClass', 'ExterQual', 'Foundation', 'ExterCond', 'BsmtHalfBath', 'MasVnrType', 'Alley', 'YearRemodAdd', 'SaleCondition', 'Exterior1st', 'YrSold', 'HalfBath', 'RoofStyle', 'Condition2', 'GarageFinish', 'GarageCond', 'Heating', 'OverallCond', 'OverallQual', 'CentralAir', 'MiscFeature', 'Condition1', 'BsmtQual', 'Utilities', 'MSZoning', 'BsmtCond', 'GarageQual', 'Fence', 'Neighborhood', 'KitchenQual', 'HeatingQC', 'LandSlope', 'LotConfig', 'RoofMatl', 'BsmtFinType2', 'SaleType', 'HouseStyle', 'FullBath', 'Exterior2nd', 'BldgType', 'GarageType', 'BedroomAbvGr', 'BsmtFinType1', 'LandContour']

The numerical fields are:
 ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'TotRmsAbvGrd', '

As `id` is the identifier of the record by order, which will not include much information for fitting, so that I scrap `id` in the numerical columns. 

filter some missing values in the numerical columns, then convert them to the `Double` datatype.

``` python
for col_name in numericalCols:
    train = train.where(col(col_name) != "NA").withColumn(col_name, col(col_name).cast(DoubleType()))
```

However, this kind of methods will make the dataframe be a empty table which will cause an error `empty collection` when fitting models. The reason is that some fields in the for loop is not the string type, causing a mismatch when comparing with the string `"NA"`, which will return `False` uniformly in spark. To address this problem, the `NA` should be converted to the real missing value `None` firstly. Then we can feel free to filter the rows with missing values.


In [7]:
train.count()

1460

In [8]:
def process_missing_values(data):
    for numer in numericalCols:
        data = data.withColumn(numer, when(col(numer) == "NA", None).otherwise(col(numer)))
        data = data.filter(col(numer).isNotNull()).withColumn(numer, col(numer).cast("double"))
    return data

pre_train = process_missing_values(train)

In [9]:
pre_train.count()

1195

In [10]:
pre_test = process_missing_values(test)

Then convert the categorical columns to the datatype efficient for model to train through `StringIndexer()` and `VectorIndexer()`

In [11]:
indexOutputCols = [x+"Index" for x in categoricalCols]
categoricallIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="skip")
assembler = VectorAssembler(inputCols=numericalCols+indexOutputCols, outputCol="rawFeatures")
featureIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="indexedFeatures", maxCategories=8)

###  Q1 (1)

In [13]:

dt_tune = DecisionTreeRegressor(labelCol="SalePrice", featuresCol="rawFeatures")

# 创建参数网格
paramGrid = ParamGridBuilder()\
    .addGrid(dt_tune.maxDepth, [5, 10,15])\
    .addGrid(dt_tune.minInstancesPerNode, [1, 2, 4])\
    .addGrid(dt_tune.maxBins, [128,256,512]) \
    .build()
    
tvs = TrainValidationSplit(
	estimator=dt_tune, 
    estimatorParamMaps=paramGrid,
	evaluator=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse"), 
	trainRatio=0.8,  # 将训练集的80%用于训练模型，20%用于验证模型
    seed = 1234
    )
pipeline_dt = Pipeline(stages=[categoricallIndexer, assembler,featureIndexer, tvs])
model_dt = pipeline_dt.fit(pre_train)

24/05/25 08:53:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [14]:
test_pred = model_dt.transform(pre_test)

test_pred.selectExpr("Id", "prediction as SalePrice").toPandas().to_csv("output/prediction1_dt.csv", index=False)
test_pred.selectExpr("Id", "prediction as SalePrice").show(5)

+----+------------------+
|  Id|         SalePrice|
+----+------------------+
|1961|120626.84541062803|
|1962|120626.84541062803|
|1963|120626.84541062803|
|1964|120626.84541062803|
|1965|176546.15384615384|
+----+------------------+
only showing top 5 rows



### Q1 (2)

In [15]:
from xgboost.spark import SparkXGBRegressor

XGB_tune = SparkXGBRegressor(features_col="rawFeatures", label_col="SalePrice", num_workers=2)

# 创建参数网格
paramGrid = ParamGridBuilder()\
    .addGrid(XGB_tune.max_depth, [5, 10,15])\
    .addGrid(XGB_tune.max_bin, [128,256,512]) \
    .build()
    
tvs = TrainValidationSplit(
	estimator=XGB_tune, 
    estimatorParamMaps=paramGrid,
	evaluator=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse"), 
	trainRatio=0.8,  # 将训练集的80%用于训练模型，20%用于验证模型
    seed = 1234
    )
pipeline_XGB = Pipeline(stages=[categoricallIndexer, assembler,featureIndexer, tvs])
model_XGB = pipeline_XGB.fit(pre_train)


2024-05-25 08:54:33,445 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'device': 'cpu', 'max_bin': 128, 'max_depth': 5, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[08:54:36] task 0 got new rank 0                                    (0 + 2) / 2]
[08:54:36] task 1 got new rank 1
2024-05-25 08:54:38,302 INFO XGBoost-PySpark: _fit Finished xgboost training!   
2024-05-25 08:54:38,897 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-05-25 08:54:39,023 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'device': 'cpu', 'max_bin': 256, 'max_depth': 5, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[08:54:40] task 0 got new rank 0                                

In [16]:
test_pred = model_XGB.transform(pre_test)

test_pred.selectExpr("Id", "prediction as SalePrice").toPandas().to_csv("output/prediction2_XGB.csv", index=False)
test_pred.selectExpr("Id", "prediction as SalePrice").show(5)

2024-05-25 08:55:19,626 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs


+----+--------------+
|  Id|     SalePrice|
+----+--------------+
|1961|118747.1328125|
|1962|  95382.140625|
|1963|    108862.875|
|1964|116094.1015625|
|1965|   139126.1875|
+----+--------------+
only showing top 5 rows



INFO:XGBoost-PySpark:Do the inference on the CPUs


### Q1 (3)

In [12]:
spark.stop()

In [18]:
from pyspark.sql import SparkSession

s1 = SparkSession.builder.config("spark.log.level", "WARN").config("spark.sql.debug.maxToStringFields", 100).config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.4").appName("Q1").getOrCreate()

In [19]:
from synapse.ml.lightgbm import LightGBMRegressor

GBM_tune = LightGBMRegressor(featuresCol="rawFeatures", labelCol="SalePrice",objective="regression")

# 创建参数网格
paramGrid = ParamGridBuilder()\
    .addGrid(GBM_tune.maxDepth, [5, 10,15])\
    .addGrid(GBM_tune.maxBin, [128,256,512]) \
    .build()
    
tvs = TrainValidationSplit(
	estimator=GBM_tune, 
    estimatorParamMaps=paramGrid,
	evaluator=RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse"), 
	trainRatio=0.8,  # 将训练集的80%用于训练模型，20%用于验证模型
    seed = 1234
    )
pipeline_GBM = Pipeline(stages=[categoricallIndexer, assembler,featureIndexer, tvs])
model_GBM = pipeline_GBM.fit(pre_train)

TypeError: 'JavaPackage' object is not callable

In [None]:
test_pred = model_GBM.transform(pre_test)

test_pred.selectExpr("Id", "prediction as SalePrice").toPandas().to_csv("output/prediction3_GBM.csv", index=False)
test_pred.selectExpr("Id", "prediction as SalePrice").show(5)