In [1]:
# findspark.init() is expected to locate the local Spark installation and make
# pyspark importable, which is why it runs before the pyspark import in the
# next cell.
import findspark
findspark.init()


In [2]:
import pyspark
from pyspark.sql import SparkSession

# Configure the application name first, then attach to an existing session or
# start a new one, and report which Spark version is in use.
session_builder = SparkSession.builder.appName("MySparkApp")
spark = session_builder.getOrCreate()
print("Spark version:", spark.version)

Spark version: 3.5.0


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the diabetes health-indicator CSV into pandas and preview the first rows.
dataset_path = 'diabetes_binary_health_indicators_BRFSS2015.csv'
data = pd.read_csv(dataset_path)
print(data.head())

# Separate the binary target from the predictor columns.
y = data['Diabetes_binary']
X = data.drop(columns='Diabetes_binary')

# Seeded hold-out split: 80% training, 20% testing.
random_seed = 42
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_seed,
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

In [4]:
# Put each label column back next to its features: the training split becomes
# the "offline" set and the test split becomes the "online" set.
offline_df = pd.concat((X_train, y_train), axis="columns")
online_df = pd.concat((X_test, y_test), axis="columns")

# Write both sets as CSV files, leaving the pandas index out of the output.
for frame, csv_name in ((offline_df, 'offline.csv'), (online_df, 'online.csv')):
    frame.to_csv(csv_name, index=False)

In [5]:
# Read the offline split into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
offline_spark_df = csv_reader.csv("offline.csv")

offline_spark_df.show()

+------+--------+---------+----+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+----+---------+------+---------------+
|HighBP|HighChol|CholCheck| BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|MentHlth|PhysHlth|DiffWalk|Sex| Age|Education|Income|Diabetes_binary|
+------+--------+---------+----+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+----+---------+------+---------------+
|   0.0|     1.0|      1.0|20.0|   1.0|   0.0|                 0.0|         1.0|   1.0|    1.0|              0.0|          1.0|        0.0|    2.0|     0.0|     0.0|     0.0|1.0|12.0|      6.0|   8.0|            0.0|
|   0.0|     0.0|      1.0|34.0|   0.0|   0.0|                 0.0|         1.0|   0.0|    1.0|              0.0|          1.0|     

In [6]:
# Inspect the column types that inferSchema assigned when the CSV was read.
offline_spark_df.printSchema()

root
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- HeartDiseaseorAttack: double (nullable = true)
 |-- PhysActivity: double (nullable = true)
 |-- Fruits: double (nullable = true)
 |-- Veggies: double (nullable = true)
 |-- HvyAlcoholConsump: double (nullable = true)
 |-- AnyHealthcare: double (nullable = true)
 |-- NoDocbcCost: double (nullable = true)
 |-- GenHlth: double (nullable = true)
 |-- MentHlth: double (nullable = true)
 |-- PhysHlth: double (nullable = true)
 |-- DiffWalk: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)
 |-- Diabetes_binary: double (nullable = true)


In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

label_col = "Diabetes_binary"
seed = 42  # shared by the data split, the tree ensembles and the CV fold assignment

# Assemble features: every column except the label. The label is excluded by
# name so the result does not depend on where it sits in the CSV.
feature_cols = [c for c in offline_spark_df.columns if c != label_col]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Create classification models. Both tree ensembles accept a seed; it is pinned
# so that repeated runs of this cell are comparable.
lr = LogisticRegression(labelCol=label_col, featuresCol="features")
rf = RandomForestClassifier(labelCol=label_col, featuresCol="features", seed=seed)
gbt = GBTClassifier(labelCol=label_col, featuresCol="features", seed=seed)

# Create pipelines (feature assembly followed by the classifier)
lr_pipeline = Pipeline(stages=[assembler, lr])
rf_pipeline = Pipeline(stages=[assembler, rf])
gbt_pipeline = Pipeline(stages=[assembler, gbt])

# Split the data into training and testing sets. Both splits are cached so the
# repeated fits and evaluations below do not each re-read the CSV.
train_data, test_data = offline_spark_df.randomSplit([0.8, 0.2], seed=seed)
train_data.cache()
test_data.cache()

# The parameter grids are empty, so each one holds only the default parameter
# map: cross-validation below scores the default configuration and does not
# tune anything. Add .addGrid(...) calls here to search hyperparameters.
lr_param_grid = ParamGridBuilder().build()
rf_param_grid = ParamGridBuilder().build()
gbt_param_grid = ParamGridBuilder().build()

# Set up the evaluator. For this evaluator "f1" is the F-measure weighted across
# classes, not the F1 of the positive class alone.
evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")

# Perform 5-fold cross-validation for each model, with a fixed seed so the
# fold assignment is the same on every run.
lr_cv = CrossValidator(estimator=lr_pipeline, estimatorParamMaps=lr_param_grid, evaluator=evaluator, numFolds=5, seed=seed)
rf_cv = CrossValidator(estimator=rf_pipeline, estimatorParamMaps=rf_param_grid, evaluator=evaluator, numFolds=5, seed=seed)
gbt_cv = CrossValidator(estimator=gbt_pipeline, estimatorParamMaps=gbt_param_grid, evaluator=evaluator, numFolds=5, seed=seed)

# Fit models and perform cross-validation
lr_model = lr_cv.fit(train_data)
rf_model = rf_cv.fit(train_data)
gbt_model = gbt_cv.fit(train_data)

# Make predictions on the held-out test data
lr_predictions = lr_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

# Evaluate F1 scores
lr_f1 = evaluator.evaluate(lr_predictions)
rf_f1 = evaluator.evaluate(rf_predictions)
gbt_f1 = evaluator.evaluate(gbt_predictions)

print(f"F1 Score - Logistic Regression: {lr_f1}")
print(f"F1 Score - Random Forest: {rf_f1}")
print(f"F1 Score - Gradient Boosted Trees: {gbt_f1}")

# Choose the best model based on F1 score. max() keeps the first of equally
# scored candidates, so this ordering gives ties to Gradient Boosted Trees,
# then Random Forest, then Logistic Regression.
candidates = [(gbt_f1, gbt_model), (rf_f1, rf_model), (lr_f1, lr_model)]
best_model = max(candidates, key=lambda scored: scored[0])[1]

# Save the best model (the fitted CrossValidatorModel), replacing any earlier
# save at the same path, then shut the Spark session down.
best_model_path = "best_model"
best_model.write().overwrite().save(best_model_path)
spark.stop()

F1 Score - Logistic Regression: 0.8265798437942979
F1 Score - Random Forest: 0.7930987584589925
F1 Score - Gradient Boosted Trees: 0.82907661910427
