In [1]:
import pandas as pd
from load2 import DatasetLoader
from eda2 import EDA
from ui2 import UserInterfaceCLI
from ml_models import MLModels
import os

In [2]:
# Load and clean data
# The loader reads the CSV and then runs its own cleaning step; `data` keeps
# the frame returned by load_data() and `cleaned_data` the one returned by
# clean_data(), which the rest of the notebook works from.
loader = DatasetLoader('data.csv')
data = loader.load_data()
cleaned_data = loader.clean_data()

# Inspect data
print('Data Info:')
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() appends a stray "None".
cleaned_data.info()
print('\nMissing Values:')
# Per-column count of missing values in the cleaned frame.
print(cleaned_data.isnull().sum())

Dataset loaded successfully.
Data cleaned, features computed, and scaled (excluding categorical columns).
Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172000 entries, 0 to 171999
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        172000 non-null  float64
 1   Age                       172000 non-null  float64
 2   Gender                    172000 non-null  float64
 3   Hypertension              172000 non-null  int64  
 4   Heart Disease             172000 non-null  int64  
 5   Ever Married              172000 non-null  int64  
 6   Work Type                 172000 non-null  int64  
 7   Residence Type            172000 non-null  int64  
 8   Average Glucose Level     172000 non-null  float64
 9   BMI                       172000 non-null  float64
 10  Smoking Status            172000 non-null  int64  
 11  Physical Activity         172000 non-nu

In [3]:
# Summary statistics for every column of the cleaned frame
# (include='all' asks describe() to cover all dtypes, not only numeric ones).
summary_all = cleaned_data.describe(include='all')
print(summary_all)

                 ID           Age        Gender   Hypertension  Heart Disease  \
count  1.720000e+05  1.720000e+05  1.720000e+05  172000.000000  172000.000000   
mean   1.057552e-17  8.030785e-17  5.056420e-17       0.150134       0.100279   
std    1.000003e+00  1.000003e+00  1.000003e+00       0.357203       0.300373   
min   -1.732041e+00 -1.708546e+00 -9.809222e-01       0.000000       0.000000   
25%   -8.660204e-01 -8.545062e-01 -9.809222e-01       0.000000       0.000000   
50%    0.000000e+00 -4.661911e-04  7.696341e-01       0.000000       0.000000   
75%    8.660204e-01  8.535738e-01  7.696341e-01       0.000000       0.000000   
max    1.732041e+00  1.707614e+00  2.520190e+00       1.000000       1.000000   

        Ever Married      Work Type  Residence Type  Average Glucose Level  \
count  172000.000000  172000.000000   172000.000000           1.720000e+05   
mean        0.300965       2.600151        0.500459           5.328244e-16   
std         0.458679       1.200596 

In [4]:
# Perform EDA
# Wrap the cleaned frame in the EDA helper (reused by later cells) and show
# the statistics table it produces under a heading.
eda = EDA(cleaned_data)
stats = eda.descriptive_statistics()
print('Descriptive Statistics:', stats, sep='\n')

Descriptive Statistics:
                 ID           Age        Gender   Hypertension  Heart Disease  \
count  1.720000e+05  1.720000e+05  1.720000e+05  172000.000000  172000.000000   
mean   1.057552e-17  8.030785e-17  5.056420e-17       0.150134       0.100279   
std    1.000003e+00  1.000003e+00  1.000003e+00       0.357203       0.300373   
min   -1.732041e+00 -1.708546e+00 -9.809222e-01       0.000000       0.000000   
25%   -8.660204e-01 -8.545062e-01 -9.809222e-01       0.000000       0.000000   
50%    0.000000e+00 -4.661911e-04  7.696341e-01       0.000000       0.000000   
75%    8.660204e-01  8.535738e-01  7.696341e-01       0.000000       0.000000   
max    1.732041e+00  1.707614e+00  2.520190e+00       1.000000       1.000000   

        Ever Married      Work Type  Residence Type  Average Glucose Level  \
count  172000.000000  172000.000000   172000.000000           1.720000e+05   
mean        0.300965       2.600151        0.500459           5.328244e-16   
std         

In [5]:
# Generate plots for key features
# Each entry pairs a column with the plot kind requested for it.
for column, plot_kind in (
    ('Age', 'histogram'),
    ('BMI', 'histogram'),
    ('Stroke Occurrence', 'bar'),
):
    eda.plot_distribution(column, plot_kind)

# Per the cell output, this reports the class counts for the target and
# applies SMOTE to balance them; the returned value is kept for later use.
balanced_data = eda.check_class_balance('Stroke Occurrence')

Class distribution for Stroke Occurrence (before SMOTE):
Stroke Occurrence
0    154904
1     17096
Name: count, dtype: int64
Applying SMOTE to balance Stroke Occurrence classes.
Balanced class distribution for Stroke Occurrence (after SMOTE):
Stroke Occurrence
0    154904
1    154904
Name: count, dtype: int64


In [6]:
# Train and evaluate models for each target
# For every target column: split the data, fit/evaluate the models, and print
# the comparison table returned by plot_model_comparison().
targets = ['Chronic Stress', 'Physical Activity', 'Income Level', 'Stroke Occurrence']
for i, target in enumerate(targets):
    X_train, X_test, y_train, y_test = loader.split_data(target_column=target)
    if i > 0:  # Blank line between consecutive targets' output
        print()
    if X_train is None:
        # A missing split used to be skipped without any trace, leaving a
        # silent gap in the results; say which target was dropped instead.
        print(f'Skipping {target}: split_data returned no training data.')
        continue
    ml = MLModels(X_train, X_test, y_train, y_test)
    ml.train_and_evaluate(target)
    comparison = ml.plot_model_comparison(target)
    print(f'Model Performance for {target}:')
    print(comparison)

Confusion matrix saved at: plots/Naive Bayes_Chronic Stress_confusion_matrix.png
Confusion matrix saved at: plots/Random Forest_Chronic Stress_confusion_matrix.png
Confusion matrix saved at: plots/XGBoost_Chronic Stress_confusion_matrix.png
Model comparison plot saved at: plots/Chronic Stress_model_comparison.png
Model Performance for Chronic Stress:
               Accuracy  Precision    Recall
Naive Bayes    0.751134   0.564202  0.751134
Random Forest  0.751134   0.564202  0.751134
XGBoost        0.749797   0.632575  0.749797

Confusion matrix saved at: plots/Naive Bayes_Physical Activity_confusion_matrix.png
Confusion matrix saved at: plots/Random Forest_Physical Activity_confusion_matrix.png
Confusion matrix saved at: plots/XGBoost_Physical Activity_confusion_matrix.png
Model comparison plot saved at: plots/Physical Activity_model_comparison.png
Model Performance for Physical Activity:
               Accuracy  Precision    Recall
Naive Bayes    0.400058   0.278640  0.400058
Random F

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [7]:
# Run CLI
# Hands the existing loader and EDA objects to the menu-driven interface.
# NOTE: as the captured output shows, run() prompts for a menu choice
# ("Enter your choice (1-3): "), so this cell waits for keyboard input and
# will stall an unattended "Restart & Run All".
ui = UserInterfaceCLI(loader, eda)
ui.run()


=== Stroke Data Analytics System ===
1. Load and Clean Dataset
2. Perform Exploratory Data Analysis (EDA)
3. Exit
Enter your choice (1-3): 1
Dataset loaded successfully.
Data cleaned, features computed, and scaled (excluding categorical columns).
Dataset loaded and cleaned successfully.

=== Stroke Data Analytics System ===
1. Load and Clean Dataset
2. Perform Exploratory Data Analysis (EDA)
3. Exit
Enter your choice (1-3): 2
Class distribution for Stroke Occurrence (before SMOTE):
Stroke Occurrence
0    154904
1     17096
Name: count, dtype: int64
Applying SMOTE to balance Stroke Occurrence classes.
Balanced class distribution for Stroke Occurrence (after SMOTE):
Stroke Occurrence
0    154904
1    154904
Name: count, dtype: int64
EDA completed. Plots saved in 'plots/' folder.

Descriptive Statistics:
                 ID           Age        Gender   Hypertension  Heart Disease  \
count  1.720000e+05  1.720000e+05  1.720000e+05  172000.000000  172000.000000   
mean   1.057552e-17  8.0