# In [None]:
import sqlite3
import pandas as pd
import logging
import sys
import seaborn as sns
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Configure logging
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Step 1: Load Dataset from Kaggle
# The CSV is read from a path relative to the current working directory.
# Any failure to obtain a usable DataFrame is logged and ends the script with
# exit status 1 instead of surfacing as an unhandled traceback.
logging.info("Loading dataset from Kaggle store...")
kaggle_data_path = "healthcare-dataset-stroke-data.csv"
try:
    df = pd.read_csv(kaggle_data_path)
except FileNotFoundError:
    logging.error(f"File {kaggle_data_path} not found. Please check the file path.")
    sys.exit(1)
except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # The file exists but is empty or is not well-formed CSV.
    logging.error(f"File {kaggle_data_path} could not be parsed as CSV: {exc}")
    sys.exit(1)
else:
    logging.info("Dataset loaded successfully.")

# Data Preprocessing
# Validate that every column used later in the script is present, keep only
# those columns, and drop any row that has a missing value in one of them.
required_columns = ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    logging.error(f"Missing columns in dataset: {missing_columns}")
    sys.exit(1)

df = df[required_columns].dropna()
if df.empty:
    # Nothing left to analyse, store, or train on; stop here rather than
    # plotting and persisting an empty table and failing later.
    logging.error("No rows left after dropping rows with missing values.")
    sys.exit(1)
logging.info("Data loaded and preprocessed successfully.")

# Data Exploration
# Draw the pairwise correlation of every retained column as an annotated heatmap.
logging.info("Performing data analysis and visualization...")
plt.figure(figsize=(8, 6))
correlations = df.corr()
heatmap_ax = sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

# Step 2: Create SQLite Database & Load Data
# Open a connection to the SQLite file and write the DataFrame to the
# patient_data table, replacing the table if it already exists.
db_url = 'sqlite:///healthcare.db'
engine = create_engine(db_url)
conn = engine.connect()
df.to_sql(
    name='patient_data',
    con=conn,
    index=False,
    if_exists='replace',
)
logging.info("Data stored in SQLite database.")

# Step 3: Query Data Using SQL
# Read the whole patient_data table back through the open connection; the
# result replaces df for the rest of the script.
query = "SELECT * FROM patient_data"
df = pd.read_sql(sql=query, con=conn)
logging.info("Sample data queried successfully.")
preview = df.head()
print(preview)

# Step 4: Train Machine Learning Model
# Features are the five columns listed below; the target is the 'stroke' column.
feature_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
X = df[feature_columns]
y = df['stroke']
if y.nunique() < 2:
    logging.error("Insufficient class labels in target variable. Model training aborted.")
    # sys.exit() skips the rest of the script, including the final conn.close(),
    # so release the database connection here before aborting.
    conn.close()
    sys.exit(1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified - if 'stroke' is heavily imbalanced,
# consider stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
logging.info(f'Model Accuracy: {accuracy * 100:.2f}%')

# Detailed Model Evaluation
# Per-class precision / recall / F1 for the held-out predictions, printed to stdout.
logging.info("Classification Report:")
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

# Confusion Matrix
# Pin the label set to every class present in the full target. Without an
# explicit `labels`, confusion_matrix uses only the classes that appear in
# y_test or y_pred, so a class missing from both would shrink the matrix and
# it would no longer line up with the hard-coded tick labels below.
class_labels = sorted(y.unique())
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 5))
# NOTE(review): the tick labels assume a binary target whose smaller label
# means "no stroke" - confirm against the dataset's encoding.
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Stroke', 'Stroke'],
    yticklabels=['No Stroke', 'Stroke'],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Step 5: Additional Data Visualization
# Scatter of age against average glucose level, with points coloured by the
# 'stroke' column.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(
    data=df,
    x='age',
    y='avg_glucose_level',
    hue='stroke',
    palette='coolwarm',
    alpha=0.6,
)
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Avg Glucose Level')
scatter_ax.set_title('Age vs. Glucose Level (Stroke Indicator)')
plt.show()

# Step 6: Close Database Connection
# conn.close() hands the connection back to the engine; engine.dispose() then
# closes whatever the engine's pool is still holding, so no handle to the
# SQLite file stays open after the script (or notebook cell) finishes.
conn.close()
engine.dispose()
logging.info("Database connection closed.")
