In [None]:
#Step 1: Loading Hospitalization Details

In [15]:
# import important library
import pandas as pd
import numpy as np
import os 
import mysql.connector
from dotenv import load_dotenv
from pathlib import Path
from tabulate import tabulate

In [16]:
# Read Dataset
hosp_details = pd.read_csv("Hospitalisation_Details.csv")

In [17]:
#Inspect data
hosp_details.head()

Unnamed: 0,c_id,yr,mth,date?,children?,charges?,host_tier,Ct_tier,st_id,Has_Children,Is_Frequent_Treatment
0,Id2335,1992,Jul,9,0,563.84,tier - 2,tier - 3,R1013,no,no
1,Id2334,1992,Nov,30,0,570.62,tier - 2,tier - 1,R1013,no,no
2,Id2333,1993,,30,0,600.0,tier - 2,tier - 1,R1013,no,no
3,Id2332,1992,Sep,13,0,604.54,tier - 3,tier - 3,R1013,no,no
4,Id2331,1998,Jul,27,0,637.26,tier - 3,tier - 3,R1013,no,no


In [18]:
#Step 2: Identifying Null Values in Hospitalization Details
null_values = hosp_details.isnull().sum()
null_values

c_id                     0
yr                       0
mth                      1
date?                    0
children?                0
charges?                 0
host_tier                0
Ct_tier                  0
st_id                    0
Has_Children             0
Is_Frequent_Treatment    0
dtype: int64

In [19]:
#remove the null values
remove_null_value = hosp_details.dropna(inplace = True)
remove_null_value 

In [20]:
#Step 3: Identifying Data Types in Hospitalization Details
data_types = hosp_details.dtypes
data_types

c_id                      object
yr                         int64
mth                       object
date?                      int64
children?                  int64
charges?                 float64
host_tier                 object
Ct_tier                   object
st_id                     object
Has_Children              object
Is_Frequent_Treatment     object
dtype: object

In [21]:
#Step 4: Identifying Duplicate Data in Hospitalization Details
duplicate_data = hosp_details.duplicated().sum()
duplicate_data

89

In [22]:
# Step 5: rename the columns 

hosp_details.rename(columns = {'c_id' : 'customer_id',
                               'yr' : 'year',
                               'mth' : 'month',
                               'date?' : 'date',
                               'children?' : 'children',
                               'charges?' : 'charges',
                               'host_tier' : 'hospital_tier',
                               'Ct_tier': 'city_tier',
                               'st_id': 'state_id'}, inplace=True)

In [23]:
hosp_details.head(2)

Unnamed: 0,customer_id,year,month,date,children,charges,hospital_tier,city_tier,state_id,Has_Children,Is_Frequent_Treatment
0,Id2335,1992,Jul,9,0,563.84,tier - 2,tier - 3,R1013,no,no
1,Id2334,1992,Nov,30,0,570.62,tier - 2,tier - 1,R1013,no,no


In [24]:
# Data Preprocessing and Cleaning for Hospitalization Details

# remove duplicates and unwanted columns

hosp_details.drop_duplicates(inplace = True)
hosp_details.drop(['Has_Children','Is_Frequent_Treatment'], axis = 1, inplace = True)
hosp_details.head(2)

In [25]:
hosp_details.to_csv('Hospitalisation_Details_Cleaned.csv', index=False, index_label='customer_id')

In [26]:
#Check the clean data
hosp = pd.read_csv('hospitalisation_details_cleaned.csv')
hosp.head()

Unnamed: 0,customer_id,year,month,date,children,charges,hospital_tier,city_tier,state_id
0,Id2335,1992,Jul,9,0,563.84,tier - 2,tier - 3,R1013
1,Id2334,1992,Nov,30,0,570.62,tier - 2,tier - 1,R1013
2,Id2332,1992,Sep,13,0,604.54,tier - 3,tier - 3,R1013
3,Id2331,1998,Jul,27,0,637.26,tier - 3,tier - 3,R1013
4,Id2330,2001,Nov,20,0,646.14,tier - 3,tier - 3,R1012


In [27]:
# Step 6: Loading Medical Examination Data

med_exam = pd.read_csv('Medical_Examinations.csv')
med_exam

Unnamed: 0,cid,b_m_i,HBA1C,h_Issues,any_transplant,cancer_hist,noofmajorsurgeries,smoker??,recovery_period
0,Id1,47.410,7.47,No,No,No,No major surgery,yes,
1,Id2,30.360,5.77,No,No,No,No major surgery,yes,
2,Id3,34.485,11.87,yes,No,No,2,yes,Moderate
3,Id4,38.095,6.05,No,No,No,No major surgery,yes,
4,Id5,35.530,5.45,No,No,No,No major surgery,yes,
...,...,...,...,...,...,...,...,...,...
2369,Id128,32.775,4.72,No,No,No,No major surgery,yes,
2370,Id129,34.200,5.91,yes,No,No,No major surgery,yes,
2371,Id130,30.200,9.58,No,No,No,No major surgery,yes,
2372,Id131,48.320,5.77,No,No,No,No major surgery,yes,


In [28]:
#Step 7: Identifying Null Values in Medical Examination Data
null_values = med_exam.isnull().sum()
null_values

# we see only recovery_period has nulls 

cid                      0
b_m_i                    0
HBA1C                    0
h_Issues                 0
any_transplant           0
cancer_hist              0
noofmajorsurgeries       0
smoker??                 0
recovery_period       1096
dtype: int64

In [30]:
# Replace all occurrences of "nomajorsurgeries" with 0 in the 'nomajorsurgeries' column

med_exam['noofmajorsurgeries'].replace('No major surgery', 0, inplace= True)
med_exam.head(2)

Unnamed: 0,cid,b_m_i,HBA1C,h_Issues,any_transplant,cancer_hist,noofmajorsurgeries,smoker??,recovery_period
0,Id1,47.41,7.47,No,No,No,0,yes,
1,Id2,30.36,5.77,No,No,No,0,yes,


In [31]:
#Step 8: Identifying Data Types in Medical Examination Data

data_tpes = med_exam.dtypes
data_types

c_id                      object
yr                         int64
mth                       object
date?                      int64
children?                  int64
charges?                 float64
host_tier                 object
Ct_tier                   object
st_id                     object
Has_Children              object
Is_Frequent_Treatment     object
dtype: object

In [32]:
#Step 9: Identifying Duplicate Data in Medical Examination Data
duplicates_data = med_exam.duplicated().sum()
duplicates_data

39

In [33]:
#Step 10: Data Preprocessing and Cleaning for Medical Examination Data
# reomve and drop the duplicates and unwated columns


med_exam.drop_duplicates(inplace = True)
med_exam.drop(['recovery_period'], axis = 1, inplace = True)

In [34]:
med_exam.head(2)

Unnamed: 0,cid,b_m_i,HBA1C,h_Issues,any_transplant,cancer_hist,noofmajorsurgeries,smoker??
0,Id1,47.41,7.47,No,No,No,0,yes
1,Id2,30.36,5.77,No,No,No,0,yes


In [35]:
med_exam.rename(columns = { 'cid' : 'customer_id',
                           'b_m_i' : 'BMI',
                           'h_Issues' : 'health_issues',
                           'cancer_hist' : 'cancer_history',
                           'noofmajorsurgeries' : 'numberofmajorsurgeries',
                           'smoker??' : 'smoker'}, inplace= True)

#--- Export the df as "medical_examinations_cleaned.csv" ---

med_exam.to_csv('medical_examinations_cleaned.csv', index = False, index_label = 'customer_id')
med_exam

Unnamed: 0,customer_id,BMI,HBA1C,health_issues,any_transplant,cancer_history,numberofmajorsurgeries,smoker
0,Id1,47.410,7.47,No,No,No,0,yes
1,Id2,30.360,5.77,No,No,No,0,yes
2,Id3,34.485,11.87,yes,No,No,2,yes
3,Id4,38.095,6.05,No,No,No,0,yes
4,Id5,35.530,5.45,No,No,No,0,yes
...,...,...,...,...,...,...,...,...
2330,Id2331,22.340,5.57,No,No,No,1,No
2331,Id2332,17.700,6.28,No,No,No,1,No
2332,Id2333,16.470,6.35,No,No,Yes,1,No
2333,Id2334,17.600,4.39,No,No,No,1,No


In [36]:
#Step 1: Data Download, Import, and Database Connection

In [37]:
# Fill SQL server details according

cwd = os.getcwd()
sql = str(Path(cwd).parents[2]) + "\\Downloads\\sql.env"
load_dotenv(sql)
conn = mysql.connector.connect(
    host=os.getenv("MYSQL_HOST"),
    user=os.getenv("MYSQL_USER"),
    password=os.getenv("MYSQL_PASSWORD"),
    database="Medical"
)

cursor = conn.cursor()
print("Successfully connected to MySQL!")

DatabaseError: 2017 (HY000): Can't open named pipe to host: .  pipe: MySQL (2)

In [None]:
def f(q):
    cursor.execute(q)
    rows = cursor.fetchall()
    headers = [col[0] for col in cursor.description]
    print(tabulate(rows, headers=headers, tablefmt="grid"))

In [None]:
f("SELECT * FROM hospitalisation_details LIMIT 5;")

In [None]:
#Step 2: Average Hospital Charges Analysis
f("SELECT AVG(charges) as avg_charges FROM hospitalisation_details;")

In [None]:
#Step 3: High Charges Analysis
f("select customer_id, year, charges  from hospitalisation_details where charges>700 limit 10;")

In [None]:
#Step 4: High BMI Patients Analysis
f("""SELECT n.name, h.year, h.charges
FROM hospitalisation_details AS h
JOIN medical_examinations AS m ON h.customer_id = m.customer_id
JOIN names AS n ON n.customer_id = m.customer_id
WHERE m.BMI > 35
limit 10;""")

In [None]:
#Step 5: Customers with Major Surgeries
f("""SELECT n.customer_id, n.name
FROM names AS n
JOIN medical_examinations AS m ON n.customer_id = m.customer_id
WHERE m.numberofmajorsurgeries >= 1
limit 10;""")

In [None]:
#Step 6: Average Charges by Hospital Tier in 2000
f("""select hospital_tier, avg(charges) from hospitalisation_details
where year = 2000
group by hospital_tier;""")

In [None]:
#Step 7: Smoking Patients with Transplants Analysis
f("""select m.customer_id, m.BMI, h.charges from medical_examinations as m
JOIN hospitalisation_details as h ON h.customer_id = m.customer_id
where smoker = 'yes' AND any_transplant = 'yes';""")

In [None]:
#Step 8: Patients with Major Surgeries or Cancer History
f("""select n.name from  names as n
JOIN medical_examinations as m ON m.customer_id = n.customer_id
where cancer_history = 'Yes' OR numberofmajorsurgeries >= 2
limit 10;""")

In [None]:
#Step 9: Customer with Most Major Surgeries
f("""select n.customer_id, n.name from  names as n
JOIN medical_examinations as m ON m.customer_id = n.customer_id
order by numberofmajorsurgeries desc
limit 1;""")

In [None]:
#Step 10: Customers with Major Surgeries and City Tiers
f("""SELECT n.customer_id, n.name, h.city_tier
FROM hospitalisation_details AS h
JOIN medical_examinations AS m ON h.customer_id = m.customer_id
JOIN names AS n ON n.customer_id = m.customer_id
where numberofmajorsurgeries > 0
limit 10;""")

In [None]:
#Step 11: Average BMI by City Tier in 1995
f("""select h.city_tier, AVG(m.BMI) as avg_bmi 
FROM hospitalisation_details AS h
JOIN medical_examinations AS m ON h.customer_id = m.customer_id
where year = 1995
group by h.city_tier""")

In [None]:
#Step 12: High BMI Customers with Health Issues
f("""SELECT n.customer_id, n.name, h.charges
FROM hospitalisation_details AS h
JOIN medical_examinations AS m ON h.customer_id = m.customer_id
JOIN names AS n ON n.customer_id = m.customer_id
where health_issues = 'yes' and m.BMI > 30
limit 10;""")

In [None]:
#Step 13: Customers with Highest Charges and City Tier by Year
f("""SELECT h.year, n.name, h.city_tier, max(h.charges) asmax_charges
FROM hospitalisation_details AS h
JOIN names AS n ON n.customer_id = h.customer_id
group by h.year, n.name, h.city_tier
Having max(h.charges) = (select max(charges) from hospitalisation_details where year = h.year)
limit 10;""")

In [66]:
#Step 14: Top 3 Customers with Highest Average Yearly Charges
f("""with Yearlycharges as ( 
    select customer_id, avg(charges) as avg_yearly_Charges 
    from hospitalisation_details group by customer_id, year)
Select n.name, y.avg_yearly_Charges from names n
JOIN Yearlycharges as y ON y.customer_id = n.customer_id
order by y.avg_yearly_Charges desc
limit 3;""")

In [None]:
#Step 15: Ranking Customers by Total Charges
f("""SELECT n.name, sum(h.charges) as total_charges, RANK() OVER (ORDER BY SUM(h.charges) desc) AS charges_rank
FROM hospitalisation_details AS h
JOIN names AS n ON n.customer_id = h.customer_id
group by n.name
order by charges_rank asc
limit 10;""")

In [None]:
#Step 16: Identifying Peak Year for Hospitalizations
f("""WITH YearlyHospitalizations AS (
    SELECT
        year,
        COUNT(*) AS num_hospitalizations
    FROM
        hospitalisation_details
    GROUP BY
        year
)
SELECT
    year,
    num_hospitalizations
FROM
    YearlyHospitalizations
WHERE
    num_hospitalizations = (SELECT MAX(num_hospitalizations) FROM YearlyHospitalizations);""")

In [None]:
#Step 17 :Yearly Trend of Smoking Patients
f("""
SELECT h.year, COUNT(*) AS smoker_count
FROM hospitalisation_details h
JOIN medical_examinations m ON h.customer_id = m.customer_id
WHERE m.smoker = 'yes'
GROUP BY h.year
ORDER BY h.year;
""")


In [None]:
#Step 18 :Repeat Hospitalized Customers
f("""
SELECT customer_id, COUNT(*) AS visit_count
FROM hospitalisation_details
GROUP BY customer_id
HAVING visit_count > 1
ORDER BY visit_count DESC
LIMIT 10;
""")


In [None]:
# Step 19 :BMI Category vs Average Charges
f("""
SELECT
  CASE
    WHEN m.BMI < 18.5 THEN 'Underweight'
    WHEN m.BMI BETWEEN 18.5 AND 24.9 THEN 'Normal'
    WHEN m.BMI BETWEEN 25 AND 29.9 THEN 'Overweight'
    WHEN m.BMI BETWEEN 30 AND 34.9 THEN 'Obese I'
    WHEN m.BMI BETWEEN 35 AND 39.9 THEN 'Obese II'
    ELSE 'Obese III'
  END AS BMI_Category,
  AVG(h.charges) AS avg_charges
FROM hospitalisation_details h
JOIN medical_examinations m ON h.customer_id = m.customer_id
GROUP BY BMI_Category
ORDER BY avg_charges DESC;
""")


In [None]:
# Step 20:Average Charges Over Time by Gender
f("""
SELECT m.gender, h.year, AVG(h.charges) AS avg_charges
FROM hospitalisation_details h
JOIN medical_examinations m ON h.customer_id = m.customer_id
GROUP BY m.gender, h.year
ORDER BY h.year, m.gender;
""")


In [None]:
cursor.close()
conn.close()
print("✅ SQL connection closed successfully!")