<h2 style="text-align:center;">Exploring Dataset</h2>

<h2>🔃Loading Datasets</h2>

In [1]:
import os
import sys

# Locate the project root: walk upward from the current working directory
# until a directory containing a 'src' folder is found.
current_dir = os.getcwd()
while not os.path.isdir(os.path.join(current_dir, 'src')):
    parent_dir = os.path.dirname(current_dir)
    # dirname() of a filesystem root returns the root itself, so an unchanged
    # path means every level (including the root) has been checked.
    if parent_dir == current_dir:
        raise FileNotFoundError("Could not find 'src' directory in any parent folders.")
    current_dir = parent_dir

# Make the project root the working directory and the first import location,
# so `from src...` imports and relative data paths resolve.
PROJECT_ROOT = current_dir
print(f"Setting project root: {PROJECT_ROOT}")
os.chdir(PROJECT_ROOT)
# Keep the cell idempotent: re-running it must not stack duplicate entries
# at the front of sys.path.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)


from src.data import loader, preprocessor
from src.visualization import exploration_visualized


Setting project root: c:\Users\HP\Desktop\Healthcare_test_results_classification-


In [2]:
  # Make sure loader.py is in your project directory

# Reuse the project root detected by the setup code (PROJECT_ROOT) instead of
# a machine-specific absolute path, so the notebook runs from any checkout.
project_root = PROJECT_ROOT
data_path = os.path.join(project_root, 'data', 'raw')

# Load the raw train/test CSV files through the project loader.
train_df, test_df = loader.load_data(
    train_path=os.path.join(data_path, 'students_train.csv'),
    test_path=os.path.join(data_path, 'students_test.csv')
)

# Preview the first rows of the training frame.
train_df.head()


Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,1,Bobby JacksOn,30,Male,B-,Cancer,1/31/2024,Matthew Smith,Sons and Miller,Blue Cross,18856.28131,328,Urgent,2/2/2024,Paracetamol,Normal
1,2,LesLie TErRy,62,Male,A+,Obesity,8/20/2019,Samantha Davies,Kim Inc,Medicare,33643.32729,265,Emergency,8/26/2019,Ibuprofen,Inconclusive
2,3,DaNnY sMitH,76,Female,A-,Obesity,9/22/2022,Tiffany Mitchell,Cook PLC,Aetna,27955.09608,205,Emergency,10/7/2022,Aspirin,Normal
3,4,andrEw waTtS,28,Female,O+,Diabetes,11/18/2020,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,12/18/2020,Ibuprofen,Abnormal
4,5,adrIENNE bEll,43,Female,AB+,Cancer,9/19/2022,Kathleen Hanna,White-White,Aetna,14238.31781,458,Urgent,10/9/2022,Penicillin,Abnormal


<h2>🔍Data Overview</h2>

In [3]:
# Run the project's overview helper once on the training frame and keep the
# result bound to both names this notebook defines.
data_overview = loader.get_data_overview(train_df)
overview = data_overview

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  50000 non-null  int64  
 1   Name                50000 non-null  object 
 2   Age                 50000 non-null  int64  
 3   Gender              50000 non-null  object 
 4   Blood Type          50000 non-null  object 
 5   Medical Condition   50000 non-null  object 
 6   Date of Admission   50000 non-null  object 
 7   Doctor              50000 non-null  object 
 8   Hospital            50000 non-null  object 
 9   Insurance Provider  50000 non-null  object 
 10  Billing Amount      50000 non-null  float64
 11  Room Number         50000 non-null  int64  
 12  Admission Type      50000 non-null  object 
 13  Discharge Date      50000 non-null  object 
 14  Medication          50000 non-null  object 
 15  Test Results        50000 non-null  object 
dtypes: f

<h2>Feature Types</h2>

In [4]:
# Partition the feature columns into categorical and numerical groups; the
# target column is passed so the helper can leave it out of the feature lists.
categorical_cols, numerical_cols = loader.classify_features(
    train_df,
    target_col="Test Results",
)


🔸 Found 11 categorical features (excluding target 'Test Results'):
['Name', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Admission Type', 'Discharge Date', 'Medication']

🔹 Found 4 numerical features:
['ID', 'Age', 'Billing Amount', 'Room Number']


<h2>📊Checking For Missing Values</h2>

In [5]:


# Analyze missing values in the training frame; the helper's result is kept
# in missing_df for optional plotting.
missing_df = preprocessor.analyze_missing_values(train_df)
# The missing-data plot is left disabled: the helper's printed output reports
# that the dataset has no missing values, so there is nothing to plot.
# exploration_visualized.plot_missing_data(missing_df)


❌ No missing values found in the dataset.


<h2>📊Checking For Outliers</h2>

In [6]:

# Flag outliers in the training frame using the IQR method, and ask the
# helper to return its per-column summary alongside the mask.
outliers, summary = preprocessor.detect_outliers(
    train_df,
    method='iqr',
    return_summary=True,
)
print(summary)  # per-column summary dict
outliers  # per-row boolean flags, rendered as the cell's output


{'ID': np.int64(0), 'Age': np.int64(0), 'Billing Amount': np.int64(0), 'Room Number': np.int64(0)}


Unnamed: 0,ID,Age,Billing Amount,Room Number
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
49995,False,False,False,False
49996,False,False,False,False
49997,False,False,False,False
49998,False,False,False,False


In [7]:
# Plot outliers for the numeric columns of the training frame; when none
# exist the helper only prints a message (as in the saved output).
exploration_visualized.plot_outliers_all(train_df)

❌ No outliers found in any numeric column.


In [8]:
# Render the project's distribution-analysis plots for the training frame.
# No return value is captured; the cell exists for its displayed figures.
exploration_visualized.plot_distribution_analysis(train_df)

<h2>📊Visualizing categorical feature relations with target</h2>

In [9]:
# Select the categorical columns suitable for plotting against the target.
cat_cols = preprocessor.get_plotable_categorical_features(
    train_df,
    target_col="Test Results",
)

# Build the figures for the selected columns; the helper returns an iterable
# of figure objects.
figs = exploration_visualized.plot_categorical_by_target(
    train_df,
    cat_cols,
    target_col="Test Results",
)

# Display each figure in turn.
for fig in figs:
    fig.show()


<h2>📈Monthly Test Results Trend</h2>

In [10]:
# Aggregate test-result counts per month from the training frame
# (grouping logic lives in the project helper — see src/data/preprocessor.py)
monthly_data = preprocessor.get_monthly_test_result_counts(train_df)

# Build the trend figure from the aggregated monthly counts
fig = exploration_visualized.plot_monthly_test_result_trends(monthly_data)

# Render the figure in the notebook output
fig.show()


<h2>Statistical Analysis</h2>

<h2>🔎Descriptive Statistics</h2>

In [11]:
# Summary statistics for the columns of train_df, shown as the cell output.
# NOTE(review): the saved output includes a 'Month' column that is not part
# of the 16-column info() listing printed right after loading, so an earlier
# call appears to have added it to train_df in place — confirm which helper
# mutates the frame and whether it should work on a copy instead.
preprocessor.compute_descriptive_statistics(train_df)

Unnamed: 0,ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,Month
count,50000.0,50000,50000.0,50000,50000,50000,50000,50000,50000,50000,50000.0,50000.0,50000,50000,50000,50000,50000
unique,,49992,,2,8,6,1827,40341,39876,5,,,3,1856,5,3,61
top,,TInA white,,Female,AB+,Arthritis,3/16/2024,Michael Smith,LLC Smith,Cigna,,,Elective,12/13/2021,Lipitor,Abnormal,2020-08
freq,,2,,25011,6291,8439,45,24,40,10091,,,16827,45,10032,16772,912
mean,25000.5,,51.58036,,,,,,,,25555.691557,301.03226,,,,,
std,14433.901067,,19.582194,,,,,,,,14215.932247,115.228819,,,,,
min,1.0,,18.0,,,,,,,,-2008.49214,101.0,,,,,
25%,12500.75,,35.0,,,,,,,,13239.40309,202.0,,,,,
50%,25000.5,,52.0,,,,,,,,25541.302835,302.0,,,,,
75%,37500.25,,68.0,,,,,,,,37853.996817,400.0,,,,,


<h2>📊Visualizing the Test Results Distribution</h2>

In [12]:

# Plot the distribution of the target column ("Test Results") in the
# training frame using the project's visualization helper.
exploration_visualized.plot_test_result_distribution(train_df, target_col="Test Results")

<h2>📊Correlation Matrix</h2>

In [13]:
# Collect the numeric columns for the correlation matrix. This rebinds
# numerical_cols, which was first assigned in the Feature Types cell.
numerical_cols = preprocessor.get_numerical_features(train_df)
exploration_visualized.plot_correlation_matrix(train_df, numerical_cols)