<a href="https://colab.research.google.com/github/4LanCrane/4LanCrane/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from mlxtend.preprocessing import TransactionEncoder
from pyspark.ml.feature import StringIndexer, CountVectorizer, VectorAssembler
from pyspark.ml.fpm import FPGrowth
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.sql.functions import array, lit , explode, collect_list, array_distinct, array_remove, size, array_except, col, array_union, array_contains, concat, split, concat_ws, collect_set, expr
from google.colab import drive
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/Colab Notebooks/DiseaseAndSymptoms.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# function for creating a spark session
def create_spark_session(app_name="HealthData"):
    return SparkSession.builder.appName(app_name).getOrCreate()

# function for creating a dataframe
def create_Dataframe(spark,file_path):
    return spark.read.csv(file_path, header=True, inferSchema=True)

# extracts symptoms colums from dataframe
def extract_symptoms_column(df):
    symptom_columns = [col_name for col_name in df.columns if col_name != 'Disease']
    df = df.withColumn('Symptoms', array(*symptom_columns))
    return df.select('Disease', 'Symptoms')

# runs fpGrowth and returns model
def run_fp_growth(df_aggregated,items_col_value, minSupport, minConfidence):
    fp = FPGrowth(itemsCol=items_col_value, minSupport=minSupport, minConfidence=minConfidence)
    model = fp.fit(df_aggregated)
    return model
# prints the number of distinct values
def countDistinct(df, columnName):
  print("Number of unique diseases =",df.select(columnName).distinct().count())


In [None]:
spark = create_spark_session()
df = create_Dataframe(spark, file_path)


In [None]:
numOfRecords = df.count()
print("Number of records = ", numOfRecords)

Number of records =  4920


Ref = https://www.kaggle.com/code/megan3/market-basket-analysis-using-pyspark

In [None]:
df = extract_symptoms_column(df)

In [None]:
df.show(10,False)

+----------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|Disease         |Symptoms                                                                                                                                        |
+----------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|Fungal infection|[itching,  skin_rash,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]|
|Fungal infection|[ skin_rash,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]   |
|Fungal infection|[itching,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]      |
|Fungal infectio

In [None]:
df.selectExpr("explode(Symptoms) as Symptom").groupBy("Symptom").count().orderBy("count", ascending=False).show(10)


+------------------+-----+
|           Symptom|count|
+------------------+-----+
|              NULL|46992|
|           fatigue| 1932|
|          vomiting| 1914|
|        high_fever| 1362|
|  loss_of_appetite| 1152|
|            nausea| 1146|
|          headache| 1134|
|    abdominal_pain| 1032|
|    yellowish_skin|  912|
| yellowing_of_eyes|  816|
+------------------+-----+
only showing top 10 rows



In [None]:
df_Symptoms = df.select("Disease", col("Symptoms").alias("Symptoms"))
df_Symptoms.show(3,False)

+----------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|Disease         |Symptoms                                                                                                                                        |
+----------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|Fungal infection|[itching,  skin_rash,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]|
|Fungal infection|[ skin_rash,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]   |
|Fungal infection|[itching,  nodal_skin_eruptions,  dischromic _patches, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]      |
+---------------

Ref = https://www.kaggle.com/code/megan3/market-basket-analysis-using-pyspark


In [None]:
df_aggregated = df_Symptoms.select("Disease", array_except("Symptoms", array(lit(None))).alias("Symptoms"))
df_aggregated.show(3,False)

+----------------+------------------------------------------------------------------+
|Disease         |Symptoms                                                          |
+----------------+------------------------------------------------------------------+
|Fungal infection|[itching,  skin_rash,  nodal_skin_eruptions,  dischromic _patches]|
|Fungal infection|[ skin_rash,  nodal_skin_eruptions,  dischromic _patches]         |
|Fungal infection|[itching,  nodal_skin_eruptions,  dischromic _patches]            |
+----------------+------------------------------------------------------------------+
only showing top 3 rows



In [None]:
countDistinct(df_aggregated,"Disease")

Number of unique diseases = 41


In [None]:
model = run_fp_growth(df_aggregated, "Symptoms", 0.05, 0.8)
model.associationRules.show(10, False)

+-----------------------------------------------------------------------------------------------+-------------------+------------------+------------------+--------------------+
|antecedent                                                                                     |consequent         |confidence        |lift              |support             |
+-----------------------------------------------------------------------------------------------+-------------------+------------------+------------------+--------------------+
|[ chills,  headache,  fatigue]                                                                 |[ high_fever]      |0.9629629629629629|3.4785446239190727|0.06341463414634146 |
|[ joint_pain,  yellowing_of_eyes,  nausea,  loss_of_appetite,  vomiting]                       |[ yellowish_skin]  |0.9375            |5.057565789473684 |0.054878048780487805|
|[ joint_pain,  yellowing_of_eyes,  nausea,  loss_of_appetite,  vomiting]                       |[ abdominal_pain] 

Now increase the minsupport and minconfidence

In [None]:
model = run_fp_growth(df_aggregated, "Symptoms", 0.06, 0.8)

Show the association Rules

In [None]:
model.associationRules.show(20, False)

+----------------------------------------------------------------------------+--------------------+------------------+------------------+-------------------+
|antecedent                                                                  |consequent          |confidence        |lift              |support            |
+----------------------------------------------------------------------------+--------------------+------------------+------------------+-------------------+
|[ chills,  headache,  fatigue]                                              |[ high_fever]       |0.9629629629629629|3.4785446239190727|0.06341463414634146|
|[ yellowish_skin,  abdominal_pain,  nausea,  vomiting]                      |[ yellowing_of_eyes]|0.9692307692307692|5.843891402714932 |0.07682926829268293|
|[ yellowish_skin,  abdominal_pain,  nausea,  vomiting]                      |[ loss_of_appetite] |0.9538461538461539|4.073717948717949 |0.07560975609756097|
|[ joint_pain,  yellowish_skin,  nausea]            

Check that prediction is not empty and show 100 results where there is only one prediction given


In [None]:
FilterdOnePrediction = model.transform(df_aggregated).filter(size(col("Prediction")) == 1)
FilterdOnePrediction.show(50, False)

+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|Disease                     |Symptoms                                                                                                                                                                           |prediction          |
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|Allergy                     |[ continuous_sneezing,  shivering,  chills,  watering_from_eyes]                                                                                                                   |[ high_fever]       |
|Allergy                     |[ shivering,  chills,  watering_from_eyes]

In [None]:
filterd = model.transform(df_aggregated).filter(size(col("Prediction")) > 0 )
filterd.show(400, False)

+---------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------+
|Disease                                |Symptoms                                                                                                                                                                                                                              |prediction                                      |
+---------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------+
|Allergy                          