In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
import os.path
import numpy as np

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [2]:
# Paths
data_path = "/Users/alek/Downloads/H3/MMDS_H3/data"
dataset_path = os.path.join(data_path, "dataset/diabetes_binary_health_indicators_BRFSS2015.csv")
trained_model_path = os.path.join(data_path, "trained_models")

offline_path = os.path.join(trained_model_path, "offline.csv")
online_path = os.path.join(trained_model_path, "online.csv")


## Create 80/20 random split

In [10]:
df = pd.read_csv(dataset_path)

# Diabetes_binary is the label column
y = df["Diabetes_binary"]  
X = df.drop("Diabetes_binary", axis=1)

X_offline, X_online, y_offline, y_online = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Combine X_offline + y_offline and X_online + y_online
offline = pd.concat([X_offline, y_offline], axis=1)
online = pd.concat([X_online, y_online], axis=1)

offline.to_csv(offline_path, index=False)
online.to_csv(online_path, index=False)

## Offline Spark functionality

In [3]:
spark = SparkSession.builder \
    .appName("OfflineModelTraining") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .getOrCreate()

df_offline = spark.read.csv(offline_path, header=True, inferSchema=True) # we're inferring the schema, but we can create one as well

label_column = "Diabetes_binary"
feature_columns = [c for c in df_offline.columns if c != label_column]
df_offline = df_offline.withColumnRenamed(label_column, "label")

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_assembled") # assembles feature vector

scaler = StandardScaler(inputCol="features_assembled", outputCol="features", withStd = True, withMean = False)

# Classifiers
lr = LogisticRegression(featuresCol='features', labelCol="label")
rf = RandomForestClassifier(featuresCol='features', labelCol="label")
gbt = GBTClassifier(featuresCol='features', labelCol="label")

pipeline_lr = Pipeline(stages=[assembler, scaler, lr])
pipeline_rf = Pipeline(stages=[assembler, scaler, rf])
pipeline_gbt = Pipeline(stages=[assembler, scaler, gbt])

paramGridLR = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

paramGridRF = (ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 50])
    .addGrid(rf.maxDepth, [5, 10])
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt', 'log2'])
    .build()
)

paramGridGBT = (ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 20])
    .addGrid(gbt.maxDepth, [3, 5])
    .build()
)


evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")

cv_lr = CrossValidator(
    estimator=pipeline_lr,
    estimatorParamMaps=paramGridLR,
    evaluator=evaluator,
    numFolds=3
)

cv_rf = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGridRF,
    evaluator=evaluator,
    numFolds=3
)

cv_gbt = CrossValidator(
    estimator=pipeline_gbt,
    estimatorParamMaps=paramGridGBT,
    evaluator=evaluator,
    numFolds=3
)

25/02/13 15:27:07 WARN Utils: Your hostname, Aleks-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.180.135 instead (on interface en0)
25/02/13 15:27:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 15:27:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [4]:
cvModel_lr = cv_lr.fit(df_offline)   # fits pipeline_lr with paramGridLR

25/02/13 15:27:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/02/13 15:27:19 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
25/02/13 15:27:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/02/13 15:27:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [5]:
cvModel_rf = cv_rf.fit(df_offline)   # fits pipeline_rf with paramGridRF

25/02/13 15:27:47 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB
25/02/13 15:27:49 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB
25/02/13 15:27:50 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB
25/02/13 15:27:55 WARN DAGScheduler: Broadcasting large task binary with size 1456.1 KiB
25/02/13 15:27:56 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/02/13 15:27:57 WARN DAGScheduler: Broadcasting large task binary with size 1343.1 KiB
25/02/13 15:27:58 WARN DAGScheduler: Broadcasting large task binary with size 1456.1 KiB
25/02/13 15:27:59 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/02/13 15:28:00 WARN DAGScheduler: Broadcasting large task binary with size 1343.1 KiB
25/02/13 15:28:02 WARN DAGScheduler: Broadcasting large task binary with size 1456.1 KiB
25/02/13 15:28:02 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/02/13 15:28:03 WARN DAGSche

In [6]:
cvModel_gbt = cv_gbt.fit(df_offline) # fits pipeline_gbt with paramGridGBT

In [7]:
bestF1_lr = max(cvModel_lr.avgMetrics)  # best F1 from logistic regression grid search
bestF1_rf = max(cvModel_rf.avgMetrics)  # best F1 from random forest grid search
bestF1_gbt = max(cvModel_gbt.avgMetrics) # best F1 from GBT grid search

print("LR best F1: ", bestF1_lr)
print("RF best F1: ", bestF1_rf)
print("GBT best F1:", bestF1_gbt)

# Determine overall best
if bestF1_lr >= bestF1_rf and bestF1_lr >= bestF1_gbt:
    best_overall_model = cvModel_lr.bestModel
    best_score = bestF1_lr
    best_model_name = "LogisticRegression"
elif bestF1_rf >= bestF1_lr and bestF1_rf >= bestF1_gbt:
    best_overall_model = cvModel_rf.bestModel
    best_score = bestF1_rf
    best_model_name = "RandomForest"
else:
    best_overall_model = cvModel_gbt.bestModel
    best_score = bestF1_gbt
    best_model_name = "GBT"

print(f"Overall best model is {best_model_name} with F1 = {best_score}")

LR best F1:  0.8269692260838889
RF best F1:  0.8231307961690278
GBT best F1: 0.8301072199885207
Overall best model is GBT with F1 = 0.8301072199885207


In [8]:
best_overall_model.write().overwrite().save(trained_model_path)
print(f"Saved best model ({best_model_name}) to {trained_model_path}")

                                                                                

Saved best model (GBT) to /Users/alek/Downloads/H3/MMDS_H3/data/trained_models


In [9]:
spark.stop()