# set up environment

In [23]:
# !pip install -r requirements.txt
# !pip install --upgrade pip

In [24]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.ml.feature import StringIndexer, VectorAssembler,OneHotEncoder
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [25]:
spark = SparkSession.builder.appName("bigdata-project_withSparkML")\
    .config("spark.hadoop.fs.defaultFS", "file://") \
    .getOrCreate() 

    # .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:8501") \

In [26]:
Df = spark.read.option("header", "true").option("inferSchema", "true").csv("file:///e:/third year/Big_Data_Tech/Project/Disease_prediction/data/Training.csv")
# test = spark.read.option("header", "true").option("inferSchema", "true").csv("file:///e:/third year/Big_Data_Tech/Project/Disease_prediction/data/Testing.csv")


In [27]:
# type(train), type(test)

# EDA

drop unwanted column 

In [28]:
Df = Df.drop('_c133')
# test = test.drop('_c133')

In [29]:
null_counts = Df.select(
    [sf.sum(sf.col(c).isNull().cast("int")).alias(c) for c in Df.columns]
)

null_counts.show()

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

In [30]:
Df.describe().show()

+-------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+-------------------+---------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+--------------------+--------------------+------------------+-------------------+-------------------+--------------------+-------------------+--------------------+----------------+--------------------+-------------------+------------------

# Preprocess

dataset has no null values and consists only of categorical data (no need for fillna, scaling, or one-hot encoding)
so we skip that in preprocess

In [31]:
# turn label column to numeric
labelIndexer = StringIndexer(inputCol="prognosis", outputCol="label")

Df = labelIndexer.fit(Df).transform(Df)
df = Df.drop('prognosis')
train, test = df.randomSplit([0.8, 0.2], seed=42)



In [32]:
test.show()

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

In [33]:
train.show(5)

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

## Feature selection 

In [34]:
from pyspark.ml.classification import RandomForestClassifier
assembler = VectorAssembler(inputCols=train.columns[:-1],outputCol='features')
train = assembler.transform(train)
data = train.select('features','label')
# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Initialize RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=10)

# Train the model
rf_model = rf.fit(train_data)
# Extract feature importances
importances = rf_model.featureImportances

# Get feature names
feature_names = train.columns[:-1]

# Combine feature names with their importance
feature_importances = list(zip(feature_names, importances.toArray()))

# Sort by importance
sorted_features = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Display feature importances
print("Feature Importances:")
for feature, importance in sorted_features:
    print(f"{feature}: {importance:.4f}")
# Select top 2 features
top_features = [feature for feature, importance in sorted_features[:40]]

print(f"Top Features: {top_features}")

Feature Importances:
stomach_bleeding: 0.0586
pain_in_anal_region: 0.0408
swollen_extremeties: 0.0405
receiving_unsterile_injections: 0.0399
polyuria: 0.0393
skin_peeling: 0.0391
drying_and_tingling_lips: 0.0388
rusty_sputum: 0.0387
foul_smell_of urine: 0.0362
blackheads: 0.0357
unsteadiness: 0.0217
mucoid_sputum: 0.0213
brittle_nails: 0.0211
mood_swings: 0.0211
nodal_skin_eruptions: 0.0208
yellow_crust_ooze: 0.0207
inflammatory_nails: 0.0207
palpitations: 0.0205
spotting_ urination: 0.0203
toxic_look_(typhos): 0.0202
neck_pain: 0.0202
weakness_in_limbs: 0.0200
scurring: 0.0194
anxiety: 0.0192
cramps: 0.0191
acute_liver_failure: 0.0191
continuous_feel_of_urine: 0.0190
history_of_alcohol_consumption: 0.0187
blister: 0.0181
muscle_wasting: 0.0179
blood_in_sputum: 0.0178
irregular_sugar_level: 0.0178
high_fever: 0.0176
weakness_of_one_body_side: 0.0172
family_history: 0.0168
bruising: 0.0163
shivering: 0.0162
yellowing_of_eyes: 0.0161
runny_nose: 0.0159
constipation: 0.0153
yellow_urine: 

# class to train and evaluate data

In [35]:

class ModelComparisonPipeline:
    def __init__(self, spark_session, train, test, top_features: list, label_column: str):
        self.spark = spark_session
        self.models = []  # Store models for comparison
        self.train = train
        self.test = test
        self.top_features = top_features
        self.label_column = label_column

    def _build_pipeline(self, model):
        # Step 1: Assemble features
        assembler = VectorAssembler(
            inputCols=self.top_features, outputCol="assembled_features"
        )
        model.setFeaturesCol("assembled_features")
        # Step 2: Build pipeline with assembler and model
        pipeline = Pipeline(stages=[assembler, model])
        return pipeline

    def compare_models(self, models_with_params):
        # Split the data into training and testing sets
        train_df, test_df = self.train, self.test

        # Initialize evaluator
        evaluator = MulticlassClassificationEvaluator(
            labelCol=self.label_column, predictionCol="prediction", metricName="accuracy"
        )

        # Iterate through models and evaluate
        results = []
        for model_name, model, param_grid in models_with_params:
            print(f"Training and evaluating {model_name}...")

            # Build pipeline with the current model
            # pipeline = self._build_pipeline(model)
            pipeline = Pipeline(stages=[model])

            # Train the model
            trained_model = pipeline.fit(train_df)

            # Evaluate the model
            predictions = trained_model.transform(test_df)
            accuracy = evaluator.evaluate(predictions)
            print(f"{model_name} Accuracy: {accuracy:.2f}")

            # Store results
            results.append((model_name, accuracy, trained_model))
            #    Define CrossValidator with the model and its parameter grid
            # cv = CrossValidator(
            #     estimator=pipeline,
            #     estimatorParamMaps=param_grid,
            #     evaluator=MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy"),
            #     numFolds=3  # Adjust the number of folds as needed
            # )

            # # Fit the model and evaluate
            # cv_model = cv.fit(train)
            # accuracy = cv_model.avgMetrics[0]  # Access accuracy from CrossValidator

        # Save results
        results.append((model_name, accuracy, trained_model))
        # Return sorted results by accuracy
        results.sort(key=lambda x: x[1], reverse=True)
        return results

In [36]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

# Decision Tree Classifier
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxBins=45)
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10]) \
    .addGrid(dt.maxBins, [10, 20, 40]) \
    .build()

# Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", maxBins=45)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100, 150]) \
    .addGrid(rf.maxDepth, [5, 10, 20]) \
    .build()
# List of models with their parameter grids
models_with_params = [
    ("Decision Tree", dt),
    ("Random Forest", rf)
]

In [37]:
# Define features and label columns
feature_columns = [col for col in data.columns if col != "label"]
label_column = "label"

# Assemble features
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")


# Split data into training and test sets
train, test = data.randomSplit([0.8, 0.2], seed=42)

# Initialize classifiers
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
rf = RandomForestClassifier(featuresCol="features", labelCol="label")


# Define parameter grids
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .build()

rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# xgb_param_grid = ParamGridBuilder() \
#     .addGrid(xgb.max_depth, [5, 10]) \
#     .addGrid(xgb.learning_rate, [0.1, 0.3]) \
#     .build()

# Combine models and parameter grids
models_with_params = [
    ("Decision Tree", dt, dt_param_grid),
    ("Random Forest", rf, rf_param_grid),
    # ("XGBoost", xgb, xgb_param_grid)
]

# Define evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

# Compare models
best_models = []
for model_name, model, param_grid in models_with_params:
    print(f"Training and evaluating {model_name}...")

    # Define pipeline
    pipeline = Pipeline(stages=[model])

    # Define CrossValidator
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3  # Adjust number of folds as needed
    )

    # Fit model using CrossValidator
    cv_model = cv.fit(train)

    # Get best model
    best_model = cv_model.bestModel

    # Evaluate on test data
    predictions = best_model.transform(test)
    accuracy = evaluator.evaluate(predictions)
    print(f"{model_name} Test Accuracy: {accuracy:.2f}")

    # Store the best model and its accuracy
    best_models.append((model_name, accuracy, best_model))

# Sort models by accuracy
best_models.sort(key=lambda x: x[1], reverse=True)

# Print results
print("\nModel Comparison Results:")
for model_name, accuracy, _ in best_models:
    print(f"{model_name}: {accuracy:.2f}")

Training and evaluating Decision Tree...
Decision Tree Test Accuracy: 0.40
Training and evaluating Random Forest...
Random Forest Test Accuracy: 1.00

Model Comparison Results:
Random Forest: 1.00
Decision Tree: 0.40


## Disease classification

In [38]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Assemble features for clustering
assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol='features')
df = assembler.transform(df)

# Initialize KMeans
kmeans = KMeans().setK(5).setSeed(1)  # You can adjust the number of clusters (K) as needed

# Fit the model
model = kmeans.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Show the result
print("Cluster Centers: ")
for center in model.clusterCenters():
    print(center)

Silhouette with squared euclidean distance = 0.2092062968239527
Cluster Centers: 
[0.32022472 0.10674157 0.         0.         0.         0.20786517
 0.42696629 0.         0.         0.         0.         0.73595506
 0.         0.         0.74157303 0.         0.         0.
 0.         0.10674157 0.         0.10674157 0.         0.
 0.         0.42134831 0.         0.         0.         0.
 0.         0.20786517 0.74719101 0.53370787 0.74719101 0.75842697
 0.11235955 0.11235955 0.09550562 0.74719101 0.20224719 0.11235955
 0.10674157 0.65168539 0.10674157 0.         0.         0.
 0.21910112 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 

In [39]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import col, collect_list, count
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Step 1: Assemble features for clustering
feature_columns = [col for col in data.columns if col != "label"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")


# Step 2: Apply KMeans clustering to group diseases
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=10, seed=42)  # Adjust K as needed
kmeans_model = kmeans.fit(data)

# Add cluster predictions to the dataset
clustered_data = kmeans_model.transform(data)

# Step 3: Summarize clusters to find dominant diseases
cluster_summary = clustered_data.groupBy("cluster").agg(
    collect_list("label").alias("diseases_in_cluster"),
    count("*").alias("count")
)

# Print cluster summaries
cluster_summary.show(truncate=False)

# Step 4: Update dataset with cluster labels as a new feature
clustered_data = clustered_data.withColumn("cluster_label", col("cluster").cast("integer"))

# Step 5: Split data into training and testing sets
train, test = clustered_data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Initialize classifiers
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Define parameter grids
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .build()

rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# Combine models and parameter grids
models_with_params = [
    ("Decision Tree", dt, dt_param_grid),
    ("Random Forest", rf, rf_param_grid),
]

# Define evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

# Compare models
best_models = []
for model_name, model, param_grid in models_with_params:
    print(f"Training and evaluating {model_name}...")

    # Define pipeline
    pipeline = Pipeline(stages=[model])

    # Define CrossValidator
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3
    )

    # Fit model using CrossValidator
    cv_model = cv.fit(train)

    # Get best model
    best_model = cv_model.bestModel

    # Evaluate on test data
    predictions = best_model.transform(test)
    accuracy = evaluator.evaluate(predictions)
    print(f"{model_name} Test Accuracy: {accuracy:.2f}")

    # Store the best model and its accuracy
    best_models.append((model_name, accuracy, best_model))

# Sort models by accuracy
best_models.sort(key=lambda x: x[1], reverse=True)

# Print results
print("\nModel Comparison Results:")
for model_name, accuracy, _ in best_models:
    print(f"{model_name}: {accuracy:.2f}")

print(best_model)


+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [40]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define the mapping dictionary
cluster_to_category = {
    0: "Infectious Diseases",
    1: "Neurological Disorders",
    2: "Respiratory Disorders",
    3: "Skin Disorders",
    4: "Cardiovascular Diseases",
    5: "Gastrointestinal Disorders",
    6: "Endocrine Disorders",
    7: "Musculoskeletal Disorders",
    8: "Immune System Disorders",
    9: "Lifestyle-related Diseases"
}

# Define UDF for mapping clusters to categories
def map_cluster_to_category(cluster):
    return cluster_to_category.get(cluster, "Unknown Category")

map_cluster_udf = udf(map_cluster_to_category, StringType())

# Add general category column using the UDF
final_data = clustered_data.withColumn("general_category", map_cluster_udf(col("cluster").cast("int")))

# Join with the original DataFrame to get the prognosis column
final_data_with_prognosis = final_data.join(Df.select("label", "prognosis"), on="label", how="left")




In [45]:
# After training your model, save it
 # Example: Your trained RandomForest model
print(rf_model)
# Save the model
rf_model_path = "E:/third year/Big_Data_Tech/Project/Disease_prediction/models/rf_model"
# Create the directory if it doesn't exist
import os
if not os.path.exists(rf_model_path):
    os.makedirs(rf_model_path)

# Try saving the model
rf_model.write().overwrite().save(rf_model_path)
print("Model saved successfully.")

    


RandomForestClassificationModel: uid=RandomForestClassifier_5ec5597a7a51, numTrees=10, numClasses=41, numFeatures=132


Py4JJavaError: An error occurred while calling o27603.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:106)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1091)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1089)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1062)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1027)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1009)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1008)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:965)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1596)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1596)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1582)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1582)
	at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413)
	at org.apache.spark.ml.tree.EnsembleModelReadWrite$.saveImpl(treeModels.scala:473)
	at org.apache.spark.ml.classification.RandomForestClassificationModel$RandomForestClassificationModelWriter.saveImpl(RandomForestClassifier.scala:413)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.hadoop.mapred.FileOutputCommitter.commitJob(FileOutputCommitter.java:136)
	at org.apache.hadoop.mapred.OutputCommitter.commitJob(OutputCommitter.java:291)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$3(SparkHadoopWriter.scala:100)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:640)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
	... 52 more
