In the field of pharmacovigilance, it's crucial to monitor and assess adverse reactions that patients may experience after taking certain medications. Adverse reactions, also known as side effects, can range from mild to severe and can impact the safety and efficacy of a medication.

For each medication, count the number of adverse reactions reported within the first 30 days of the prescription being issued. Assume that the prescription date in the Prescriptions table represents the start date of the medication usage. Display the output in ascending order of medication name.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import *

# Obtain (or reuse) the Spark session that backs every DataFrame below.
spark = SparkSession.builder.appName("HealthcareData").getOrCreate()

# --- Patients ---------------------------------------------------------------
# One row per patient; all columns are nullable.
patients_schema = (
    StructType()
    .add("patient_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("gender", StringType(), True)
)

patients_data = [
    (1, "John Doe", 35, "Male"),
    (2, "Jane Smith", 45, "Female"),
    (3, "Alice Johnson", 25, "Female"),
]

patients_df = spark.createDataFrame(patients_data, schema=patients_schema)

# --- Medications ------------------------------------------------------------
# One row per medication, together with its manufacturer.
medications_schema = (
    StructType()
    .add("medication_id", IntegerType(), True)
    .add("medication_name", StringType(), True)
    .add("manufacturer", StringType(), True)
)

medications_data = [
    (1, "Aspirin", "Pfizer"),
    (2, "Tylenol", "Johnson & Johnson"),
    (3, "Lipitor", "Pfizer"),
]

medications_df = spark.createDataFrame(medications_data, schema=medications_schema)

# --- Prescriptions ----------------------------------------------------------
# Links a patient to a medication. prescription_date is loaded as a
# "yyyy-MM-dd" string here, not as a DateType value.
prescriptions_schema = (
    StructType()
    .add("prescription_id", IntegerType(), True)
    .add("patient_id", IntegerType(), True)
    .add("medication_id", IntegerType(), True)
    .add("prescription_date", StringType(), True)
)

prescriptions_data = [
    (1, 1, 1, "2023-01-01"),
    (2, 1, 2, "2023-02-15"),
    (3, 2, 1, "2023-03-10"),
    (4, 3, 3, "2023-04-20"),
]

prescriptions_df = spark.createDataFrame(prescriptions_data, schema=prescriptions_schema)

# --- Adverse reactions ------------------------------------------------------
# Reactions are recorded per patient (this table has no medication column).
# reaction_date is loaded as a "yyyy-MM-dd" string as well.
adverse_reactions_schema = (
    StructType()
    .add("reaction_id", IntegerType(), True)
    .add("patient_id", IntegerType(), True)
    .add("reaction_description", StringType(), True)
    .add("reaction_date", StringType(), True)
)

adverse_reactions_data = [
    (1, 1, "Nausea", "2023-01-05"),
    (2, 2, "Headache", "2023-03-20"),
    (3, 3, "Dizziness", "2023-05-01"),
    (4, 1, "Rash", "2023-01-20"),
    (5, 2, "Nausea", "2023-04-05"),
]

adverse_reactions_df = spark.createDataFrame(adverse_reactions_data, schema=adverse_reactions_schema)

# Show DataFrames
#patients_df.show()
##medications_df.show()
#prescriptions_df.show()
#adverse_reactions_df.show()




In [0]:


# Parse the string date columns into real dates so that they can be compared
# and offset with date arithmetic in the join below.
date_format = "yyyy-MM-dd"
prescriptions_df = prescriptions_df.withColumn(
    "prescription_date", to_date(col("prescription_date"), date_format)
)
adverse_reactions_df = adverse_reactions_df.withColumn(
    "reaction_date", to_date(col("reaction_date"), date_format)
)

# Left join keeps every medication, including ones with no prescription.
df_join = medications_df.join(
    prescriptions_df,
    medications_df.medication_id == prescriptions_df.medication_id,
    how="left",
)

# A reaction is attributed to a prescription when it belongs to the same
# patient and its date falls in the half-open window
# [prescription_date, prescription_date + 30 days).
same_patient = df_join.patient_id == adverse_reactions_df.patient_id
on_or_after_start = adverse_reactions_df.reaction_date >= df_join.prescription_date
before_window_end = adverse_reactions_df.reaction_date < date_add(df_join.prescription_date, 30)

# Left join again so that prescriptions without a matching reaction survive
# (their reaction_id is null and is therefore not counted below).
reactions_in_window = df_join.join(
    adverse_reactions_df,
    same_patient & on_or_after_start & before_window_end,
    how="left",
)

# Keep at most one row per (medication, reaction) pair so a reaction is not
# counted twice for the same medication.
df_join_1 = reactions_in_window.dropDuplicates(["medication_name", "manufacturer", "reaction_id"])

# count() skips nulls, so medications without any matched reaction report 0.
per_medication = df_join_1.groupBy("medication_name", "manufacturer")
result_df = per_medication.agg(count(col("reaction_id")).alias("num_adverse_reactions"))

# Print the counts sorted by medication name, ascending.
result_df.orderBy("medication_name").show()


+---------------+-----------------+---------------------+
|medication_name|     manufacturer|num_adverse_reactions|
+---------------+-----------------+---------------------+
|        Aspirin|           Pfizer|                    4|
|        Lipitor|           Pfizer|                    1|
|        Tylenol|Johnson & Johnson|                    0|
+---------------+-----------------+---------------------+

