In [None]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np
# Import Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')
# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)
# Import Scikit Learn Libraries for Machine Learning Model Building
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,learning_curve,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor

# Multicolinearity test and treatment libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from collections import OrderedDict

In [40]:
# def data ingestion function
def data_ingestion():
    try:
        data = pd.read_csv(r'C:\Heart_Attack_Risk_Model\data\raw\cardiovascular_risk_dataset.csv')
        logging.info('Data Ingestion Successful')
        return data
    except Exception as e:
        logging.error(f'Data Ingestion Failed: {e}')
        return data

In [52]:
# data preprocessing

def data_exploration(data):
    stats = []

    numerical_col = data.select_dtypes(exclude = 'object').columns

    for i in numerical_col:
        Q1 = data[i].quantile(0.25)
        Q3 = data[i].quantile(0.75)
        IQR = Q3 - Q1
        LW = Q1 - 1.5 * IQR
        UW = Q3 + 1.5 * IQR
        outlier_flag = "Has Outliers" if data[(data[i] < LW) | (data[i] > UW)].shape[0] > 0 else "No Outliers"

        numerical_stats = OrderedDict({
            "Feature": i,
            "Minimum": data[i].min(),
            "Maximum": data[i].max(),
            "Mean": data[i].mean(),
            "Median": data[i].median(),
            "Mode": data[i].mode().iloc[0] if not data[i].mode().empty else np.nan,
            "25%": Q1,
            "75%": Q3,
            "IQR": IQR,
            "Standard Deviation": data[i].std(),
            "Skewness": data[i].skew(),
            "Kurtosis": data[i].kurt(),
            "Outlier Comment": outlier_flag
        })

        stats.append(numerical_stats)

        report = pd.DataFrame(stats)
    return report


In [63]:
# use train test split
def data_preprocessing(data):
    X = data.drop('heart_disease_risk_score', axis = 1)
    y = data['heart_disease_risk_score']

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                         test_size = 0.2,
                                                           random_state = 42)
    
    # use lable encoding for categorical features
    categorical_cols = X_train.select_dtypes(include = 'object').columns
    label_encoder = LabelEncoder()
    for col in categorical_cols:
        X_train[col] = label_encoder.fit_transform(X_train[col])
        X_test[col] = label_encoder.transform(X_test[col])
    return X_train, X_test, y_train, y_test


def model_building(X_train, y_train,X_test, y_test):
    models = {
        'Linear Regression': LinearRegression(),
        'Lasso Regression': Lasso(),
        'Ridge Regression': Ridge(),
        'Decision Tree Regressor': DecisionTreeRegressor(),
        'Support Vector Regressor': SVR(),
        'K-Neighbors Regressor': KNeighborsRegressor(),
        'Random Forest Regressor': RandomForestRegressor(),
        'AdaBoost Regressor': AdaBoostRegressor(),
        'Gradient Boosting Regressor': GradientBoostingRegressor(),
        'XGBoost Regressor': XGBRegressor()
    }
    return models

def model_evaluation(models, X_train, y_train, X_test, y_test):

    model_performance = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        mae = mean_absolute_error(y_test, model.predict(X_test))
        mse = mean_squared_error(y_test, model.predict(X_test))
        r2_score = r2_score(y_test, model.predict(X_test))
        model_performance[name] = {
            'MAE': mae,
            'MSE': mse,
            'R2 Score': r2_score
        }

    return model_performance

In [67]:
from sklearn.metrics import r2_score
def main():
    data = data_ingestion()
    report = data_exploration(data)
    print(report)
    X_train, X_test, y_train, y_test = data_preprocessing(data)
    models = model_building(X_train, y_train,X_test, y_test)
    model_performance = model_evaluation(models, X_train, y_train, X_test, y_test)
    print(model_performance)
if __name__ == '__main__':
    main()

                             Feature  Minimum  Maximum         Mean  Median  \
0                         Patient_ID      1.0   5500.0  2750.500000  2750.5   
1                                age     18.0     90.0    53.872000    54.0   
2                                bmi     15.0     40.9    28.170818    28.4   
3                        systolic_bp    108.0    192.0   147.248182   147.0   
4                       diastolic_bp     64.0    120.0    95.756727    96.0   
5                  cholesterol_mg_dl    147.0    331.0   239.684182   240.0   
6                 resting_heart_rate     48.0     92.0    74.075091    74.0   
7                        daily_steps    500.0  16793.0  5902.929455  5460.0   
8                       stress_level      1.0     10.0     4.907091     5.0   
9   physical_activity_hours_per_week      0.0     12.9     3.299364     2.6   
10                       sleep_hours      4.0     10.0     6.869364     6.9   
11                diet_quality_score      1.0     10

UnboundLocalError: cannot access local variable 'r2_score' where it is not associated with a value