In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import requests
import os

# Function to download dataset
def download_dataset(url, file_path):
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print("Dataset downloaded successfully.")
    except Exception as e:
        print(f"Error downloading dataset: {e}")

# Function to load dataset
def load_dataset(file_path):
    try:
        data = pd.read_csv(file_path, encoding='latin1')  # Changed encoding to 'latin1'
        print("Dataset loaded successfully.")
        return data
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

# Preprocessing the dataset
def preprocess_data(data):
    try:
        # Encode binary columns with "Yes"/"No" values
        binary_columns = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
                          "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]
        for col in binary_columns:
            data[col] = data[col].apply(lambda x: 1 if x == "Yes" else 0)

        # Encode other categorical columns
        categorical_columns = ["Sex", "AgeCategory", "Race", "GenHealth"]
        for col in categorical_columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])

        # Ensure numerical columns are correctly formatted
        numerical_columns = ["BMI", "PhysicalHealth", "MentalHealth", "SleepTime"]
        for col in numerical_columns:
            data[col] = pd.to_numeric(data[col], errors='coerce')

        # Drop rows with missing values (if any)
        data = data.dropna()

        print("Data preprocessing completed.")
        return data
    except Exception as e:
        print(f"Error during data preprocessing: {e}")
        return None

# URL and file path for the dataset
DATASET_URL = "https://drive.google.com/uc?id=10xt30T9TW_XP97pgdutJkKMBoZ1WZgGr"
DATASET_PATH = "dataset.csv"

# Download the dataset if it doesn't already exist
if not os.path.exists(DATASET_PATH):
    download_dataset(DATASET_URL, DATASET_PATH)

# Load the dataset
data = load_dataset(DATASET_PATH)

if data is not None:
    try:
        # Preprocess the dataset
        data = preprocess_data(data)

        if data is not None:
            # Define features (X) and target (y)
            X = data.drop("HeartDisease", axis=1)  # Use 'HeartDisease' as the target column
            y = data["HeartDisease"]

            # Standardize numerical features
            scaler = StandardScaler()
            X = scaler.fit_transform(X)

            # Split the dataset into training and testing sets (top half for testing, bottom half for training)
            split_index = len(X) // 2
            X_train, X_test = X[split_index:], X[:split_index]
            y_train, y_test = y[split_index:], y[:split_index]

            # Train the Logistic Regression model
            model = LogisticRegression()
            model.fit(X_train, y_train)

            # Save the model
            joblib.dump(model, "logistic_regression_model.pkl")
            print("Model trained and saved successfully.")

            # Evaluate the model
            y_pred = model.predict(X_test)
            print("Accuracy:", accuracy_score(y_test, y_pred))
            print("Classification Report:\n", classification_report(y_test, y_pred))

        else:
            print("Data preprocessing failed.")

    except Exception as e:
        print(f"Error during model training or evaluation: {e}")
else:
    print("Unable to proceed without a valid dataset.")

Dataset downloaded successfully.
Dataset loaded successfully.
Data preprocessing completed.
Model trained and saved successfully.
Accuracy: 0.9141634927484569
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95    146082
           1       0.52      0.09      0.16     13815

    accuracy                           0.91    159897
   macro avg       0.72      0.54      0.56    159897
weighted avg       0.89      0.91      0.89    159897

