In [16]:
import pandas as pd

def load_data(path=None, df=None):
    if df is None:
        if path is None:
            raise ValueError("Either 'path' or 'df' must be provided")
        df = pd.read_csv(path)

    if 'stroke' not in df.columns:
        raise KeyError(f"Target column 'stroke' not found. Columns: {list(df.columns)}")

    X = df.drop(columns=['stroke'])
    
    y = df['stroke']

    return X, y


In [17]:
X, y = load_data(
    path="C:/Users/DELL/OneDrive/Desktop/anu course/datasets/heart.csv"
)


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def build_pipeline():
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])


In [22]:

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def preprocess_data(X):
    """Convert categorical columns to numeric and handle missing values"""
    X_processed = X.copy()
    
    # Handle missing values first
    imputer = SimpleImputer(strategy='mean')
    numeric_cols = X_processed.select_dtypes(include=['float64', 'int64']).columns
    X_processed[numeric_cols] = imputer.fit_transform(X_processed[numeric_cols])
    
    # Identify categorical columns
    categorical_cols = X_processed.select_dtypes(include=['object']).columns
    
    # Encode each categorical column
    for col in categorical_cols:
        le = LabelEncoder()
        X_processed[col] = le.fit_transform(X_processed[col])
    
    return X_processed

# Apply preprocessing
X_processed = preprocess_data(X)

In [25]:
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

def train_and_evaluate():
    X, y = load_data("C:/Users/DELL/OneDrive/Desktop/anu course/datasets/heart.csv")
    X = preprocess_data(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Create models directory if it doesn't exist
    model_dir = "C:/Users/DELL/OneDrive/Desktop/anu course/models"
    os.makedirs(model_dir, exist_ok=True)
    
    joblib.dump(pipe, os.path.join(model_dir, "heart_model.pkl"))
    return pipe

In [26]:
if __name__ == "__main__":
    train_and_evaluate()

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
