# Day 6: Data Splitting & Integration Testing
Integrate the full data pipeline (cleaning, OCR, feature engineering) and perform a stratified train/validation/test split (70%/15%/15%).

In [5]:
import os
import sys

import pandas as pd

sys.path.append('..')  # Path to scripts folder

from scripts.pipeline import integrate_pipeline
from scripts.data_split import stratified_split, validate_splits, dataset_summary


In [6]:
# 1. Load cleaned dataset
# The path is relative to the notebook's working directory.
cleaned_csv_path = '../data/processed/heart_disease_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
print(f'Loaded dataset shape: {df.shape}')

# Bare last expression so the notebook renders the first rows as a table.
df.head()

Loaded dataset shape: (920, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63.0,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0.0
1,67.0,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,2.7,normal,2.0
2,67.0,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1.0
3,37.0,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0.0
4,41.0,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0.0


In [7]:
# 2. Integrate full pipeline
# Run the project's integrate_pipeline helper on the cleaned frame; the result
# is kept under a new name so `df` still holds the data as loaded.
# NOTE(review): the summary output further down shows trestbps/chol spanning
# exactly [0, 1] and thalch/oldpeak with mean ~0 and std ~1 over all rows,
# which suggests scaling is fit on the full dataset before the split.
# Confirm, and consider fitting scalers on the training split only.
df_processed = df.pipe(integrate_pipeline)
print(f'Processed dataset shape: {df_processed.shape}')

df_processed.head()

Processed dataset shape: (920, 23)


Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,num,risk_score,sex_Male,cp_atypical angina,...,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect,age_group_Middle-aged,age_group_Senior,age_group_Young
0,63.0,0.6875,0.528846,0.51832,1.329422,0.0,0.0,197.8,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,67.0,0.875,0.674451,-1.136384,0.565768,2.7,2.0,235.6,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,67.0,0.375,0.517857,-0.309032,1.615793,2.0,1.0,185.4,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,37.0,0.5,0.575549,1.976036,2.474904,0.0,0.0,202.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,41.0,0.5,0.449176,1.38507,0.470311,0.0,0.0,174.4,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


In [8]:
# 3. Stratified split (70%-15%-15%)
# 'num' is passed as the target column. No split ratios or random seed are
# passed here, so the 70/15/15 proportions rely on stratified_split's defaults.
# NOTE(review): confirm stratified_split uses a fixed seed; otherwise the
# datasets saved at the end of this notebook will differ between runs.
X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(df_processed, target_col='num')

In [9]:
# 4. Validate splits
# Hard invariants come first so that a broken split stops a "Run All" instead
# of only changing the printed report produced by validate_splits below.
for split_name, features, target in (
    ('train', X_train, y_train),
    ('validation', X_val, y_val),
    ('test', X_test, y_test),
):
    assert len(features) > 0, f'{split_name} split is empty'
    assert len(features) == len(target), (
        f'{split_name} split: {len(features)} feature rows vs {len(target)} targets'
    )

# The three splits together must account for every processed row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(df_processed), (
    'train/validation/test sizes do not add up to the processed dataset size'
)

# Prints the per-split target distribution and missing-value report.
validate_splits(X_train, X_val, X_test, y_train, y_val, y_test)

Train target distribution:
 num
0.0    0.447205
1.0    0.287267
2.0    0.118012
3.0    0.116460
4.0    0.031056
Name: proportion, dtype: float64
Validation target distribution:
 num
0.0    0.442029
1.0    0.289855
2.0    0.123188
3.0    0.115942
4.0    0.028986
Name: proportion, dtype: float64
Test target distribution:
 num
0.0    0.449275
1.0    0.289855
3.0    0.115942
2.0    0.115942
4.0    0.028986
Name: proportion, dtype: float64
Train set: No missing values
Validation set: No missing values
Test set: No missing values


In [10]:
# 5. Dataset summary
# Build the per-feature summary table for the processed dataset. The bare
# `summary` on the last line is what makes the notebook render the table.
summary = dataset_summary(df_processed)
summary

Unnamed: 0,Feature,Type,Missing,Unique,Mean,Std,Min,Max
age,age,float64,0,50,53.51087,9.424685,28.0,77.0
trestbps,trestbps,float64,0,93,0.5251984,0.214972,0.0,1.0
chol,chol,float64,0,229,0.458658,0.256404,0.0,1.0
thalch,thalch,float64,0,158,-2.471453e-16,1.000544,-2.791089,2.567001
oldpeak,oldpeak,float64,0,60,0.0,1.000544,-3.157046,2.952187
ca,ca,float64,0,11,0.7797826,0.673212,0.0,2.7
num,num,float64,0,5,0.9956522,1.142693,0.0,4.0
risk_score,risk_score,float64,0,545,177.2773,57.156151,60.3,303.8
sex_Male,sex_Male,float64,0,2,0.7891304,0.408148,0.0,1.0
cp_atypical angina,cp_atypical angina,float64,0,2,0.1891304,0.391825,0.0,1.0


In [11]:
# 6. Save final datasets
# Each split is written as one CSV holding the features plus a 'target' column.
FINAL_DIR = '../data/final'
os.makedirs(FINAL_DIR, exist_ok=True)


def _attach_target(features, target):
    """Return a copy of `features` with `target` added as a 'target' column.

    Args:
        features: DataFrame of feature columns for one split.
        target: Target values for the same rows (Series or array-like).

    Returns:
        A new DataFrame; `features` itself is not modified.

    Raises:
        ValueError: If the lengths differ, or if the 'target' column contains
            missing values after assignment.
    """
    if len(features) != len(target):
        raise ValueError(
            f'features has {len(features)} rows but target has {len(target)} values'
        )
    dataset = features.copy()
    dataset['target'] = target
    # Assigning a Series aligns on the index, so a mismatched index would
    # silently produce NaN targets. The summary output above reports no
    # missing values in 'num', so any NaN here signals misalignment.
    if dataset['target'].isna().any():
        raise ValueError("'target' contains missing values after assignment; "
                         'check that features and target share the same index')
    return dataset


# Keep the per-split names so later cells can still refer to them.
train_dataset = _attach_target(X_train, y_train)
val_dataset = _attach_target(X_val, y_val)
test_dataset = _attach_target(X_test, y_test)

for split_name, dataset in (
    ('train', train_dataset),
    ('validation', val_dataset),
    ('test', test_dataset),
):
    # index=False: the row index is not written to the CSV.
    dataset.to_csv(f'{FINAL_DIR}/{split_name}_dataset.csv', index=False)

print('✅ Final datasets saved')

✅ Final datasets saved
