In [0]:
-- Make sure we're in the correct catalog and schema, so that the unqualified
-- table name used by the queries in this notebook (all_patient_details)
-- resolves against healthcare.camp_data.

USE CATALOG healthcare;
USE SCHEMA camp_data;

The objective here is to gain insights into the health camps using our cleaned and transformed data. Here is a framework that could be explored:
- Identify some questions that could be answered based on the available data

Identify some questions that could be answered based on the available data:
- Which health camp had the highest number of visits?
- Which employer category has the most patients?
- How does the number of stalls visited correlate with the health score?
- Which city type has the highest average health score?
- What is the average donation amount by age group?
- What is the average education score by city type?
- What is the distribution of patients by age group and city type?
- What is the total donation by city type?

In [0]:
-- Which health camp had the highest number of visits?
-- Counts the rows with a non-NULL patient_id per camp and keeps the three
-- camps with the highest counts.
-- health_camp_id is a secondary sort key so that camps tied on total_visits
-- come back in a stable order and LIMIT selects the same rows on every run.

SELECT
    health_camp_id,
    COUNT(patient_id) AS total_visits
FROM all_patient_details
GROUP BY health_camp_id
ORDER BY
    total_visits DESC,
    health_camp_id ASC
LIMIT 3;

In [0]:
-- Which employer category has the most patients?
-- One row per employer category, ranked from the largest patient count down.
-- COUNT(patient_id) only counts rows whose patient_id is not NULL.

SELECT
    apd.employer_category,
    COUNT(apd.patient_id) AS patient_count
FROM all_patient_details AS apd
GROUP BY apd.employer_category
ORDER BY patient_count DESC;

In [0]:
-- How does the number of stalls visited correlate with the health score?
-- Shows the mean health score for each distinct number of stalls visited,
-- best-scoring group first. This is a grouped summary, not a correlation
-- coefficient.

SELECT
    apd.number_of_stall_visited,
    AVG(apd.health_score) AS avg_health_score
FROM all_patient_details AS apd
GROUP BY apd.number_of_stall_visited
ORDER BY avg_health_score DESC;

In [0]:
-- Which city type has the highest average health score?
-- Mean health score per city type; the top row is the answer.

SELECT
    apd.city_type,
    AVG(apd.health_score) AS avg_health_score
FROM all_patient_details AS apd
GROUP BY apd.city_type
ORDER BY avg_health_score DESC;

In [0]:
-- What is the average donation amount by age group?
-- Group by age groups: 18-30, 31-40, 41-50, 51-60, 61-70, 71-80, 81-90, 91-100
-- Rows whose age is NULL or matches none of the ranges below are reported
-- under 'Unknown'.
-- NOTE(review): the range bounds are whole numbers, so a fractional age such
-- as 30.5 would also land in 'Unknown' - confirm that age is stored as an integer.

SELECT
    CASE
        WHEN age BETWEEN 18 AND 30 THEN '18-30'
        WHEN age BETWEEN 31 AND 40 THEN '31-40'
        WHEN age BETWEEN 41 AND 50 THEN '41-50'
        WHEN age BETWEEN 51 AND 60 THEN '51-60'
        WHEN age BETWEEN 61 AND 70 THEN '61-70'
        WHEN age BETWEEN 71 AND 80 THEN '71-80'
        WHEN age BETWEEN 81 AND 90 THEN '81-90'
        WHEN age BETWEEN 91 AND 100 THEN '91-100'
        ELSE 'Unknown'
    END AS age_group,
    AVG(donation) AS avg_donation_amount
FROM all_patient_details
GROUP BY age_group
ORDER BY avg_donation_amount DESC;

In [0]:
-- What is the average education score by city type?
-- Mean education score per city type, highest average first.

SELECT
    apd.city_type,
    AVG(apd.education_score) AS avg_education_score
FROM all_patient_details AS apd
GROUP BY apd.city_type
ORDER BY avg_education_score DESC;

In [0]:
-- What is the distribution of patients by age group and city type?
-- Step 1 labels every row with its age bucket; step 2 counts rows per
-- (age bucket, city type) pair, most populated pair first.

WITH patients_by_age_group AS (
    -- Rows with a NULL age or an age outside every range fall into 'Unknown'.
    SELECT
        CASE
            WHEN age >= 18 AND age <= 30 THEN '18-30'
            WHEN age >= 31 AND age <= 40 THEN '31-40'
            WHEN age >= 41 AND age <= 50 THEN '41-50'
            WHEN age >= 51 AND age <= 60 THEN '51-60'
            WHEN age >= 61 AND age <= 70 THEN '61-70'
            WHEN age >= 71 AND age <= 80 THEN '71-80'
            WHEN age >= 81 AND age <= 90 THEN '81-90'
            WHEN age >= 91 AND age <= 100 THEN '91-100'
            ELSE 'Unknown'
        END AS age_group,
        city_type
    FROM all_patient_details
)
SELECT
    age_group,
    city_type,
    COUNT(*) AS patient_count
FROM patients_by_age_group
GROUP BY
    age_group,
    city_type
ORDER BY patient_count DESC;

In [0]:
-- What is the total donation by city type?
-- Sums the donation column per city type, largest total first.

SELECT
    apd.city_type,
    SUM(apd.donation) AS total_donation
FROM all_patient_details AS apd
GROUP BY apd.city_type
ORDER BY total_donation DESC;

In [0]:
%python
# Visualiza the correlation using a heatmap

import seaborn as sns
import matplotlib.pyplot as plt

df = spark.table("all_patient_details").toPandas()
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap for all_patient_details")
plt.show()