<h2 style="text-align:center;">Exploring Dataset</h2>

<h2>🔃Loading Datasets</h2>

In [1]:
import os
import sys

# Detect the project root by walking up from the current working directory
# until a directory that contains 'src' is found.
current_dir = os.getcwd()
while not os.path.isdir(os.path.join(current_dir, 'src')):
    parent_dir = os.path.dirname(current_dir)
    # dirname() of a filesystem root returns the root itself, so an unchanged
    # path means every level (including the root) has been checked.
    if parent_dir == current_dir:
        raise FileNotFoundError("Could not find 'src' directory in any parent folders.")
    current_dir = parent_dir

# Set project root, make it the working directory, and put it first on
# sys.path so that `src` is importable as a package.
PROJECT_ROOT = current_dir
print(f"Setting project root: {PROJECT_ROOT}")
os.chdir(PROJECT_ROOT)
# Guard the insert so re-running this cell does not stack duplicate entries.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)


from src.data import loader, preprocessor
from src.visualization import exploration_visualized


Setting project root: c:\Users\HP\Desktop\Healthcare_test_results_classification-


In [2]:
  # Make sure loader.py is in your project directory

# Reuse the project root detected in the setup cell rather than a hard-coded,
# machine-specific absolute path, so the notebook runs from any checkout.
project_root = PROJECT_ROOT
data_path = os.path.join(project_root, 'data', 'raw')

# Load the train and test CSV files through the project's loader module.
train_df, test_df = loader.load_data(
    train_path=os.path.join(data_path, 'students_train.csv'),
    test_path=os.path.join(data_path, 'students_test.csv'),
)


<h2>🔍Data Overview</h2>

In [3]:
# Print a structural summary of the training frame and keep whatever the
# loader helper returns for later inspection.
data_overview = loader.get_data_overview(train_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  50000 non-null  float64
 1   Name                50000 non-null  object 
 2   Age                 50000 non-null  float64
 3   Gender              50000 non-null  object 
 4   Blood Type          50000 non-null  object 
 5   Medical Condition   50000 non-null  object 
 6   Date of Admission   50000 non-null  object 
 7   Doctor              50000 non-null  object 
 8   Hospital            50000 non-null  object 
 9   Insurance Provider  50000 non-null  object 
 10  Billing Amount      50000 non-null  float64
 11  Room Number         50000 non-null  float64
 12  Admission Type      50000 non-null  object 
 13  Discharge Date      50000 non-null  object 
 14  Medication          50000 non-null  object 
 15  Test Results        50000 non-null  object 
dtypes: f

<h2>Feature Types</h2>

In [4]:
# Split the feature columns into categorical and numerical groups; the target
# column is passed explicitly so the helper can leave it out.
categorical_cols, numerical_cols = loader.classify_features(
    train_df,
    target_col="Test Results",
)


🔸 Found 11 categorical features (excluding target 'Test Results'):
['Name', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Admission Type', 'Discharge Date', 'Medication']

🔹 Found 4 numerical features:
['ID', 'Age', 'Billing Amount', 'Room Number']


<h2>📊Computing & Visualizing Missing Values</h2>

In [5]:


# Compute the missing-value summary for the training frame, then plot it.
# NOTE(review): the recorded output reports the same missing count for every
# column, which suggests entirely empty rows rather than scattered gaps — confirm.
missing_df = preprocessor.analyze_missing_values(train_df)
exploration_visualized.plot_missing_data(missing_df)


Missing Values Analysis:
                    Missing Values  Percentage (%)
ID                            5500            9.91
Name                          5500            9.91
Age                           5500            9.91
Gender                        5500            9.91
Blood Type                    5500            9.91
Medical Condition             5500            9.91
Date of Admission             5500            9.91
Doctor                        5500            9.91
Hospital                      5500            9.91
Insurance Provider            5500            9.91
Billing Amount                5500            9.91
Room Number                   5500            9.91
Admission Type                5500            9.91
Discharge Date                5500            9.91
Medication                    5500            9.91
Test Results                  5500            9.91


<h2>📊Visualizing Categorical Feature Relations with Target</h2>

In [6]:
# Ask the preprocessor which categorical columns are suitable for plotting,
# build the figures relating them to the target, and display each in turn.
cat_cols = preprocessor.get_plotable_categorical_features(
    train_df,
    target_col="Test Results",
)
figs = exploration_visualized.plot_categorical_by_target(
    train_df,
    cat_cols,
    target_col="Test Results",
)

for fig in figs:
    fig.show()


<h2>📈Monthly Test Results Trend</h2>

In [7]:
# Aggregate the training frame into monthly test-result counts. The helper
# lives in src.data.preprocessor; which date column it groups on is not
# visible here — confirm in that module.
monthly_data = preprocessor.get_monthly_test_result_counts(train_df)

# Build the trend figure from the aggregated counts.
fig = exploration_visualized.plot_monthly_test_result_trends(monthly_data)

# Display the figure.
fig.show()


<h2>📊Visualizing the Test Results Distribution</h2>

In [8]:
# Plot how the target classes are distributed in the training frame.
exploration_visualized.plot_test_result_distribution(
    train_df,
    target_col="Test Results",
)

<h2>📊Correlation Matrix For Numerical Features</h2>

In [9]:
# Plot the correlation matrix over the numerical columns reported by the
# preprocessor.
# NOTE(review): this rebinds `numerical_cols`, which the feature-type cell
# already assigns from loader.classify_features — confirm both helpers agree.
numerical_cols = preprocessor.get_numerical_features(train_df)
exploration_visualized.plot_correlation_matrix(train_df, numerical_cols)