In [1]:
# Quick Data Verification - Jupyter Notebook Version
# Cell 1: Import libraries and load data

import pandas as pd
import numpy as np
import os

# Load the data.
# The CSV location is kept in one named constant so that pointing the notebook
# at a different copy of the dataset is a single-line edit. The relative path
# is resolved against the kernel's current working directory.
DATA_PATH = '../../data/raw/asthma_disease_data.csv'
df = pd.read_csv(DATA_PATH)

# Reaching these lines means read_csv did not raise.
print("✅ Data loaded successfully!")
print(f"Dataset shape: {df.shape}")

# Cell 2: Basic information
# Headline figures for the loaded frame: row count, column count and the
# in-memory footprint converted from bytes to MB.
n_rows, n_cols = df.shape
# deep=True makes pandas measure the actual contents of object columns
# instead of only the size of the pointers to them.
mem_bytes = df.memory_usage(deep=True).sum()

print("📊 DATASET OVERVIEW")
print("=" * 40)
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")
print(f"Memory usage: {mem_bytes / (1024**2):.2f} MB")

# Cell 3: Column names and types
# Numbered listing (starting at 1) of every column, with the name
# left-aligned in a 25-character field followed by its dtype in parentheses.
print("\n📋 COLUMNS:")
for position, (name, dtype) in enumerate(df.dtypes.items(), start=1):
    print(f"{position:2d}. {name:<25} ({dtype!s})")

# Cell 4: First few rows
print("\n🔍 FIRST 5 ROWS:")
# Jupyter only auto-renders the value of a cell's *last* expression. This
# statement sits in the middle of the cell, so a bare `df.head()` would be
# evaluated and silently thrown away. display() renders the preview wherever
# the statement appears, the same way the missing-values table is shown.
display(df.head())

# Cell 5: Missing values
# Count nulls per column; if any column has gaps, show a table with the
# absolute count and the share of rows affected (percent, one decimal).
print("\n❓ MISSING VALUES:")
missing = df.isnull().sum()
# Keep only the columns that actually contain nulls; computed once and reused.
incomplete = missing[missing > 0]
if not incomplete.empty:
    missing_df = pd.DataFrame({
        'Column': incomplete.index,
        'Missing Count': incomplete.values,
        'Missing %': (incomplete / len(df) * 100).round(1)
    })
    display(missing_df)
else:
    # Null counts are never negative, so "no column above zero" is the same
    # condition as "the total is zero".
    print("✅ No missing values!")

# Cell 6: Look for target variable
print("\n🎯 LOOKING FOR TARGET VARIABLE:")
# Columns with few distinct values (10 or fewer) are likely categorical
# targets; print each one with the frequency of every value it takes.
for name, n_unique in df.nunique().items():
    if n_unique > 10:
        continue  # too many distinct values to be a plausible class label
    print(f"{name}: {n_unique} unique values")
    print(f"  Values: {df[name].value_counts().to_dict()}")
    print()

# Cell 7: Data types summary
print("🏷️  DATA TYPES:")
# This statement is not the last expression of the cell, so a bare
# `df.dtypes.value_counts()` would be computed and discarded without being
# shown. display() renders the per-dtype column counts explicitly.
display(df.dtypes.value_counts())

# Cell 8: Quick statistics for numerical columns
print("\n📈 NUMERICAL SUMMARY:")
# Intentionally left as a bare expression: as the final expression of the cell
# it is rendered by Jupyter as the cell's output.
df.describe()

✅ Data loaded successfully!
Dataset shape: (2392, 29)
📊 DATASET OVERVIEW
Rows: 2,392
Columns: 29
Memory usage: 0.66 MB

📋 COLUMNS:
 1. PatientID                 (int64)
 2. Age                       (int64)
 3. Gender                    (int64)
 4. Ethnicity                 (int64)
 5. EducationLevel            (int64)
 6. BMI                       (float64)
 7. Smoking                   (int64)
 8. PhysicalActivity          (float64)
 9. DietQuality               (float64)
10. SleepQuality              (float64)
11. PollutionExposure         (float64)
12. PollenExposure            (float64)
13. DustExposure              (float64)
14. PetAllergy                (int64)
15. FamilyHistoryAsthma       (int64)
16. HistoryOfAllergies        (int64)
17. Eczema                    (int64)
18. HayFever                  (int64)
19. GastroesophagealReflux    (int64)
20. LungFunctionFEV1          (float64)
21. LungFunctionFVC           (float64)
22. Wheezing                  (int64)
23. ShortnessOf

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,...,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,6229.5,42.13796,0.493311,0.669732,1.307274,27.244877,0.141722,5.051786,5.022867,7.019012,...,0.158027,2.548564,3.74127,0.596154,0.500418,0.503344,0.503344,0.602425,0.604933,0.051839
std,690.655244,21.606655,0.50006,0.98612,0.898242,7.201628,0.348838,2.903574,2.90998,1.732475,...,0.364842,0.861809,1.303689,0.49077,0.500104,0.500093,0.500093,0.489499,0.488967,0.221749
min,5034.0,5.0,0.0,0.0,0.0,15.031803,0.0,0.00174,0.003031,4.001437,...,0.0,1.000459,1.500045,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5631.75,23.0,0.0,0.0,1.0,20.968313,0.0,2.578333,2.432043,5.4985,...,0.0,1.824113,2.607489,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6229.5,42.0,0.0,0.0,1.0,27.052202,0.0,5.016881,5.115383,6.975839,...,0.0,2.553244,3.734982,1.0,1.0,1.0,1.0,1.0,1.0,0.0
75%,6827.25,61.0,1.0,1.0,2.0,33.555903,0.0,7.540234,7.544216,8.52695,...,0.0,3.292897,4.864121,1.0,1.0,1.0,1.0,1.0,1.0,0.0
max,7425.0,79.0,1.0,3.0,3.0,39.985611,1.0,9.995809,9.999904,9.996235,...,1.0,3.999719,5.999421,1.0,1.0,1.0,1.0,1.0,1.0,1.0
