In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 640.0 kB/s eta 0:03:16
   ---------------------------------------- 0.1/124.9 MB 650.2 kB/s eta 0:03:13
   ---------------------------------------- 0.1/124.9 MB 737.3 kB/s eta 0:02:50
   ---------------------------------------- 0.2/124.9 MB 952.6 kB/s eta 0:02:11
   ---------------------------------------- 0.3/124.9 MB 1.2 MB/s eta 0:01:47
   ---------------------------------------- 0.4/124.9 MB 1.4 MB/s eta 0:01:30
   ---------------------------------------- 0.5/124.9 MB 1.5 MB/s eta 0:01:23
   ---------------------------------------- 0.6/124.9 MB 1.5 MB/s eta 0:01:21
   ---------------------------------------- 0.8/124.9 MB 1.8 MB/s eta 0:01:11
   ---------------------------------------- 0.8/124.9 MB 1.8 MB/s eta 0

In [9]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, SVC
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [10]:
# Read the raw CSV (relative path) into a DataFrame.
heart_csv_path = "../data/heart.csv"
df = pd.read_csv(heart_csv_path)

In [11]:
# Write the DataFrame to table "heart_table" in the SQLite file "heart_data.db".
# if_exists="replace" overwrites an existing table of that name; index=False
# keeps the DataFrame index out of the table.
conn = sqlite3.connect("heart_data.db")
df.to_sql(name="heart_table", con=conn, if_exists="replace", index=False)
print("DataFrame successfully uploaded to SQLite database.")

DataFrame successfully uploaded to SQLite database.


In [12]:
# Read every row and column of heart_table back into a DataFrame
# through the open SQLite connection.
query = "SELECT * FROM heart_table"
data = pd.read_sql_query(sql=query, con=conn)


In [13]:
# Close the connection
# The query result has already been read into the `data` DataFrame, so the
# following visible cells work from `data` rather than from the database.
conn.close()

In [14]:
# Separate the label column from the predictors:
# y is the "target" column, x is every remaining column.
y = data["target"]
x = data.drop(columns=["target"])


In [15]:
# Partition the feature columns by dtype: object-typed ("O") columns are
# treated as categorical, all other dtypes as numerical.
cat_columns = x.select_dtypes(include=["O"]).columns
num_columns = x.select_dtypes(exclude=["O"]).columns

In [16]:
# Display the labels of the columns selected as numerical features.
num_columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [17]:
# Display the labels of the columns selected as categorical (object dtype) features.
cat_columns

Index([], dtype='object')

In [18]:
# Build the preprocessing objects.
# Numerical columns: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Only the numerical pipeline is registered; cat_columns is not used here.
preprocessor = ColumnTransformer(
    transformers=[('num_pipeline', num_pipeline, num_columns)]
)


In [None]:
# Split data
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# train_test_split returns (x_train, x_test, y_train, y_test) in that order.
# The third value must be bound to y_train, because the model-fitting loop
# calls grid_search.fit(x_train, y_train).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [20]:
# Display the training feature matrix returned by train_test_split.
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
132,42,1,1,120,295,0,1,162,0,0.0,2,0,2
202,58,1,0,150,270,0,0,111,1,0.8,2,0,3
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2
176,60,1,0,117,230,1,1,160,1,1.4,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3


In [21]:
 # Display the held-out test feature matrix returned by train_test_split.
 x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,69,1,2,140,254,0,0,146,0,2.0,1,3,3
104,50,1,2,129,196,0,1,163,0,0.0,2,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3


In [22]:
 # Display the test feature matrix.
 # NOTE(review): this repeats the preceding x_test display; possibly
 # y_train or y_test was intended here — confirm.
 x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,69,1,2,140,254,0,0,146,0,2.0,1,3,3
104,50,1,2,129,196,0,1,163,0,0.0,2,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3


In [23]:
# Preprocess training and testing data
# The preprocessor is fitted on the training split only, and the fitted
# transform is then applied to the test split. Both results are wrapped back
# into DataFrames labelled with the transformer's output feature names.
# NOTE: x_train / x_test are overwritten with the transformed frames, and no
# index is passed, so the original row labels are not carried over.
train_features = preprocessor.fit_transform(x_train)
feature_names = preprocessor.get_feature_names_out()
x_train = pd.DataFrame(train_features, columns=feature_names)
x_test = pd.DataFrame(preprocessor.transform(x_test), columns=feature_names)


In [24]:
# Define models and hyperparameters
# Each entry maps a display name to an unfitted estimator ('model') and the
# hyperparameter grid ('params') searched for it by GridSearchCV.
# RandomForestRegressor is seeded so repeated runs give the same scores; the
# seed matches the one used for the train/test split.
# NOTE(review): 'SVC' is the only classifier in this dict; all other entries
# are regressors — confirm that mixing them is intentional.
models = {
    'Lasso': {'model': Lasso(), 'params': {'alpha': [0.01, 0.1, 1, 10]}},
    'Ridge': {'model': Ridge(), 'params': {'alpha': [0.01, 0.1, 1, 10]}},
    'ElasticNet': {'model': ElasticNet(), 'params': {'alpha': [0.01, 0.1, 1], 'l1_ratio': [0.1, 0.5, 0.9]}},
    'RandomForestRegressor': {'model': RandomForestRegressor(random_state=42), 'params': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}},
    'XGBRegressor': {'model': XGBRegressor(), 'params': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}},
    'SVR': {'model': SVR(), 'params': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}},
    'SVC': {'model': SVC(), 'params': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}}
}


In [25]:
# Evaluate models
# Accumulator for the (model_name, score) tuples appended by the
# model-evaluation loop.
performance_scores = []

In [26]:

def evaluate_model(true, pred, metric):
    """Score predictions with either regression or classification metrics.

    Args:
        true: Ground-truth target values, passed as-is to the metric functions.
        pred: Predicted values, passed as-is to the metric functions.
        metric: ``'r2'`` selects the regression metrics; any other value
            selects the classification metrics.

    Returns:
        A 6-tuple ``(score, mse, mae, precision, recall, f1)``.
        For ``'r2'``, ``score`` is the R2 score and the last three items are
        ``None``. Otherwise ``score`` is the accuracy and ``mse`` / ``mae``
        are ``None``.
    """
    if metric != 'r2':
        # Classification path: taken for every metric name other than 'r2'.
        return (
            accuracy_score(true, pred),
            None,
            None,
            precision_score(true, pred),
            recall_score(true, pred),
            f1_score(true, pred),
        )

    # Regression path.
    return (
        r2_score(true, pred),
        mean_squared_error(true, pred),
        mean_absolute_error(true, pred),
        None,
        None,
        None,
    )


In [27]:
# Iterate over each model
# Start from an empty list so that re-running this cell does not append
# duplicate entries to the results of an earlier run.
performance_scores = []

for model_name, model_info in models.items():
    model = model_info['model']
    params = model_info['params']

    # 'SVC' is scored as a classifier (accuracy); every other entry is scored
    # as a regressor (R2). The same name drives both the grid-search scoring
    # and the final evaluation.
    metric = 'accuracy' if model_name == 'SVC' else 'r2'

    # Perform GridSearchCV (5-fold cross-validation, all available cores)
    grid_search = GridSearchCV(model, params, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Best estimator found by the search, and its predictions on the test split
    best_estimator = grid_search.best_estimator_
    y_pred = best_estimator.predict(x_test)

    # Evaluate the model
    score, mse, mae, precision, recall, f1 = evaluate_model(y_test, y_pred, metric)

    # Print model performance
    print("Model:", model_name)
    print("Best Parameters:", grid_search.best_params_)
    print(f"{metric.upper()} Score:", score)
    if metric == 'r2':
        print("Mean Squared Error:", mse)
        print("Mean Absolute Error:", mae)
    else:
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)
    print("=" * 40)

    # Append performance score
    performance_scores.append((model_name, score))

# Find the best model
# best_model is the (model_name, score) tuple with the highest score.
# NOTE(review): scores are R2 for the regressors but accuracy for 'SVC', so
# this max() compares values measured on different scales, and all scores are
# computed on the test split — confirm this selection rule is intended.
best_model = max(performance_scores, key=lambda entry: entry[1])
print("Best Model:", best_model[0])
print("Best Score:", best_model[1])

Model: Lasso
Best Parameters: {'alpha': 0.01}
R2 Score: 0.5386879372054784
Mean Squared Error: 0.115049071290867
Mean Absolute Error: 0.26588157375790894
Model: Ridge
Best Parameters: {'alpha': 10}
R2 Score: 0.5370224251215296
Mean Squared Error: 0.11546444221639895
Mean Absolute Error: 0.26596160803685587
Model: ElasticNet
Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.1}
R2 Score: 0.5407601679967751
Mean Squared Error: 0.11453226662160518
Mean Absolute Error: 0.2652731049532109
Model: RandomForestRegressor
Best Parameters: {'max_depth': None, 'n_estimators': 200}
R2 Score: 0.532479040948276
Mean Squared Error: 0.11659754098360654
Mean Absolute Error: 0.23852459016393443
Model: XGBRegressor
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
R2 Score: 0.4533717789193713
Mean Squared Error: 0.13632652221521727
Mean Absolute Error: 0.2272484287463098
Model: SVR
Best Parameters: {'C': 1, 'kernel': 'rbf'}
R2 Score: 0.5460184658351592
Mean Squared Error: 0.11322087172936637
Mean Abs