In [0]:
%run "/Workspace/Users/anirudhp@megnity.com/healthcare_project/src/Explore and Clean"

In [0]:
# Print the schema of each cleaned table, in the same order as before, so the
# columns they share can be compared before any joins are designed.
# (The tables are presumably defined by the notebook pulled in via %run above.)
for _cleaned_table in (
    first_camp_cleaned,
    second_camp_cleaned,
    third_camp_cleaned,
    patient_profiles_cleaned,
    health_camp_details_cleaned,
):
    _cleaned_table.printSchema()

1. There are 5 tables. Check whether there are any relationships between them.
2. If there is a relationship, check whether it makes sense to combine the related tables in whatever way the end-user use case requires.
3. If needed, create new calculated or derived fields.
4. The end outcome could be a single unified table or several tables.

### One unified table that can be created combines all patient details from first_camp, second_camp, third_camp, and patient_profiles. This can serve as one consolidated table with all patient details. However, at the end, we will still save all the individual tables, as they can still be useful for specific analysis scenarios.

### 1. Get all patient IDs along with all the health camps they've attended

In [0]:
# `col` is first used in this cell, so import it here; otherwise the cell only
# works if the %run notebook above happens to leave `col` in scope.
from pyspark.sql.functions import col


def _patient_camp_ids(camp_df, camp_id_alias: str):
    """Project a camp table down to Patient_ID and its Health_Camp_ID.

    Args:
        camp_df: Spark DataFrame with Patient_ID and Health_Camp_ID columns.
        camp_id_alias: Name given to Health_Camp_ID in the result, so the
            three camp tables stay distinguishable after they are joined.

    Returns:
        A DataFrame with exactly two columns: Patient_ID and `camp_id_alias`.
    """
    return camp_df.select(
        col("Patient_ID"),
        col("Health_Camp_ID").alias(camp_id_alias),
    )


# Step 1: Select and rename Health_Camp_ID columns
try:
    # Keep only Patient_ID and Health_Camp_ID, renaming the latter per camp table
    df_first_selected = _patient_camp_ids(first_camp_cleaned, "First_Camp_Health_ID")
    df_second_selected = _patient_camp_ids(second_camp_cleaned, "Second_Camp_Health_ID")
    df_third_selected = _patient_camp_ids(third_camp_cleaned, "Third_Camp_Health_ID")
except Exception as e:
    # Surface the problem in the cell output, then re-raise so the run stops here
    print(f"Error selecting columns: {e}")
    raise

# Step 2: Perform outer joins to combine all Patient_IDs
try:
    # Full outer joins keep patients that appear in only one or two camp tables.
    # The join key is Patient_ID alone, so a patient with several rows in more
    # than one camp table yields one output row per combination of those rows.
    all_patients = (
        df_first_selected
        .join(df_second_selected, "Patient_ID", "outer")
        .join(df_third_selected, "Patient_ID", "outer")
    )
    # Spark evaluates lazily: reaching this line means the join plan was built
    # and analysed, not that the data has been processed yet.
    print("DataFrames joined successfully.")
except Exception as e:
    print(f"Error joining DataFrames: {e}")
    raise

# Step 3: Display sample output (first 10 rows)
display(all_patients.limit(10))

### 2. Perform a left join of all_patients with patient_profiles details

In [0]:
# Left join all_patients with the patient profiles on Patient_ID: every row of
# all_patients is kept, and patients without a profile get nulls in the profile
# columns.
# The cleaned profile table (patient_profiles_cleaned, whose schema is printed at
# the top of this notebook) is used so that, like the camp tables, only cleaned
# data goes into the unified table.
all_patient_details = all_patients.join(patient_profiles_cleaned, "Patient_ID", "left")

# Display the first 25 rows of the final DataFrame
display(all_patient_details.limit(25))

# Print schema and row count (count() triggers a full evaluation of the joins)
all_patient_details.printSchema()
print(f"Row count: {all_patient_details.count()}")

### Combine the all_patient_details table with all the information we have on each patient from the health camp tables

### 3. Combine donation and health_score from first_camp_cleaned with matching patient_ID in all_patient_details

In [0]:
from pyspark.sql.functions import col

# Attach donation and health_score from first_camp_cleaned to the rows of
# all_patient_details that refer to the same patient AND the same first-format camp.
#
# A left join is used because this step only enriches existing rows. With a full
# outer join, any first-camp row that fails to match (for example because one of
# its keys is null) would survive as a row whose df1 columns -- Patient_ID
# included -- are all null once df1.* is selected below.
#
# NOTE(review): the columns are referenced here as patient_id / health_camp_id but
# as Patient_ID / Health_Camp_ID earlier in the notebook; this presumably relies on
# Spark's default case-insensitive column resolution -- confirm.
df_joined = all_patient_details.alias("df1").join(
    first_camp_cleaned.select(
        col("patient_id").alias("Patient_ID"),
        col("health_camp_id").alias("First_Camp_Health_ID"),
        "donation",
        "health_score",
    ).alias("df2"),
    on=[
        col("df1.Patient_ID") == col("df2.Patient_ID"),
        col("df1.First_Camp_Health_ID") == col("df2.First_Camp_Health_ID"),
    ],
    how="left",
)

# Keep every column of the left side and add only the two first-camp measures,
# renaming health_score so it cannot be confused with the second camp's score.
all_patient_details_updated = df_joined.select(
    col("df1.*"),
    col("df2.donation"),
    col("df2.health_score").alias("first_camp_health_score"),
)

# Display the resulting DataFrame
display(all_patient_details_updated)

### 4. Now add health score from the second table to all_patient_details_updated

In [0]:
from pyspark.sql.functions import col

# Attach the second camp's health_score to the rows of all_patient_details_updated
# that refer to the same patient AND the same second-format camp.
#
# A left join is used because this step only enriches existing rows. With a full
# outer join, any second-camp row that fails to match (for example because one of
# its keys is null) would survive as a row whose df1 columns -- Patient_ID
# included -- are all null once df1.* is selected below.
#
# Note: this cell reads and then reassigns all_patient_details_updated, so
# re-running it on its own would add second_camp_health_score a second time;
# re-run from the first-camp cell instead.
df_joined_second = all_patient_details_updated.alias("df1").join(
    second_camp_cleaned.select(
        col("patient_id").alias("Patient_ID"),
        col("health_camp_id").alias("Second_Camp_Health_ID"),
        col("health_score").alias("second_camp_health_score"),
    ).alias("df2"),
    on=[
        col("df1.Patient_ID") == col("df2.Patient_ID"),
        col("df1.Second_Camp_Health_ID") == col("df2.Second_Camp_Health_ID"),
    ],
    how="left",
)

# Keep every column of the left side and add only the second-camp score
all_patient_details_updated = df_joined_second.select(
    col("df1.*"),
    col("df2.second_camp_health_score"),
)

# Display the resulting DataFrame
display(all_patient_details_updated)

### 5. Combine the remaining information from the third table into the all_patient_details_updated

In [0]:
# Attach the third camp's stall-visit columns to the rows of
# all_patient_details_updated that refer to the same patient AND the same
# third-format camp.
#
# A left join is used because this step only enriches existing rows. With a full
# outer join, any third-camp row that fails to match (for example because one of
# its keys is null) would survive as a row whose df1 columns -- Patient_ID
# included -- are all null once df1.* is selected below.
#
# Note: this cell reads and then reassigns all_patient_details_updated, so
# re-running it on its own would add the third-camp columns a second time;
# re-run from the first-camp cell instead.
df_joined_third = all_patient_details_updated.alias("df1").join(
    third_camp_cleaned.select(
        col("patient_id").alias("Patient_ID"),
        col("health_camp_id").alias("Third_Camp_Health_ID"),
        col("number_of_stall_visited"),
        col("last_stall_visited_number"),
    ).alias("df2"),
    on=[
        col("df1.Patient_ID") == col("df2.Patient_ID"),
        col("df1.Third_Camp_Health_ID") == col("df2.Third_Camp_Health_ID"),
    ],
    how="left",
)

# Keep every column of the left side and add the two third-camp columns, suffixed
# so their origin stays clear in the unified table.
all_patient_details_updated = df_joined_third.select(
    col("df1.*"),
    col("df2.number_of_stall_visited").alias("number_of_stall_visited_in_third_camp"),
    col("df2.last_stall_visited_number").alias("last_stall_visited_number_in_third_camp"),
)

# Display the resulting DataFrame
display(all_patient_details_updated)