In [None]:
# import data manipulation libraries
import numpy as np
import pandas as pd

# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libraries for regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# import machine learning libraries for classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from the libraries imported above — consider narrowing it.
warnings.filterwarnings("ignore")

import logging
# Write INFO-and-above records to 'model_regression.log' in the working directory.
# filemode='w' truncates the file whenever this cell runs, and force=True removes
# any handlers already attached to the root logger so re-running the cell
# does not produce duplicated log lines.
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    filemode='w',
                    filename='model_regression.log',
                    force=True)
from sklearn.impute import SimpleImputer


In [None]:
# Data ingestion

def data_ingestion(path=r'C:\Heart_Attack_Risk_Model\data\raw\cardiovascular_risk_dataset.csv'):
    """Load the raw cardiovascular-risk CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the machine-specific path the
        notebook was originally written against, so ``data_ingestion()``
        keeps working unchanged; pass another path to run elsewhere.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised (for example ``FileNotFoundError``)
        is logged with its traceback and re-raised. Previously the error was
        swallowed and the function failed on ``return df`` with an unrelated
        ``UnboundLocalError``.
    """
    try:
        # load the data
        df = pd.read_csv(path)
    except Exception:
        # logging.exception records the message at ERROR level plus the traceback
        logging.exception("Data ingestion failed.")
        raise
    logging.info("Data ingestion successful.")
    return df


# Load the raw dataset once; the cells below inspect this DataFrame.
df = data_ingestion()    

In [None]:
# Preview the first five rows to sanity-check the loaded columns and values
df.head()

In [None]:
# Count the missing values in each column of the dataset
df.isnull().sum()

In [None]:
# Count the rows that exactly repeat an earlier row (first occurrences are not counted)
df.duplicated().sum()

In [None]:
# Statistical summary of the dataset (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# data preprocessing

def data_preprocessing(df):
    """Split the DataFrame's column names into numerical and categorical groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose columns should be grouped by dtype.

    Returns
    -------
    tuple of (list, list)
        ``(numerical_cols, categorical_cols)`` in the DataFrame's column order.
        The function previously computed both groups and discarded them,
        implicitly returning ``None``.
    """
    # 'number' covers every numeric dtype, not only int64/float64
    numerical_cols = df.select_dtypes(include='number').columns.tolist()
    # string-like columns are stored as object; pandas categoricals count as well
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    return numerical_cols, categorical_cols

    

In [None]:
# List the column names available for selecting features and the target
df.columns

In [None]:
# data building

def data_building(df, target_col='heart_disease_risk_score', test_size=0.2, random_state=42):
    """Split the dataset into train/test sets and build the preprocessing pipelines.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset containing the feature columns and the target column.
    target_col : str, optional
        Name of the column to predict. Defaults to
        ``'heart_disease_risk_score'``, the value that used to be hard-coded.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2).
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible
        (default 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, numerical_pipeline,
        categorical_pipeline)``. Both pipelines are returned unfitted and are
        not yet bound to any columns; the caller decides which columns each
        one is applied to.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in the dataset.")

    # define the target variable and features
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )

    # numerical columns: fill gaps with the median, then standardise
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # categorical columns: fill gaps with the most frequent value, then one-hot
    # encode; handle_unknown='ignore' keeps categories unseen during fit from
    # raising at transform time
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    return X_train, X_test, y_train, y_test, numerical_pipeline, categorical_pipeline

In [None]:
# model evaluation
def model_evaluation(X_train, X_test, y_train, y_test,numerical_pipeline,categorical_pipeline):
    """Fit every candidate model behind the preprocessing pipelines and score it.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices; columns are routed to a pipeline by dtype.
    y_train, y_test : array-like
        Target values for the training and test rows.
    numerical_pipeline : sklearn.pipeline.Pipeline
        Unfitted preprocessing applied to the numeric feature columns.
    categorical_pipeline : sklearn.pipeline.Pipeline
        Unfitted preprocessing applied to all remaining feature columns.

    Returns
    -------
    dict
        Maps each model name to its metrics: ``{'r2', 'mae', 'mse'}`` for the
        regressor, ``{'accuracy'}`` for the classifiers, or ``{'error': ...}``
        when scikit-learn rejected the model for this target.

    Notes
    -----
    The previous version ignored both pipelines, scored ``LinearRegression``
    with ``accuracy_score`` (which raises ``ValueError`` on continuous
    predictions) and returned ``None``.

    NOTE(review): the target is named ``heart_disease_risk_score`` and the log
    file is ``model_regression.log``, so the target is presumably continuous.
    If so, every classifier below is reported under ``'error'`` and should be
    replaced by its regressor counterpart — confirm against the data.
    """
    # Route each feature column to the matching preprocessing pipeline by dtype.
    numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
    categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()
    preprocessor = ColumnTransformer(transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ])

    # Regressors and classifiers are kept apart because they need different metrics.
    regressors = {
        'Linear Regression': LinearRegression()
    }
    # random_state=42 makes the stochastic learners reproducible across runs
    classifiers = {
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Support Vector Machine': SVC(),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }

    results = {}
    for model_name, model in {**regressors, **classifiers}.items():
        # The preprocessor is refit inside every pipeline fit, so its statistics
        # always come from the training rows only.
        estimator = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        try:
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)
            if model_name in regressors:
                metrics = {
                    'r2': r2_score(y_test, y_pred),
                    'mae': mean_absolute_error(y_test, y_pred),
                    'mse': mean_squared_error(y_test, y_pred)
                }
            else:
                metrics = {'accuracy': accuracy_score(y_test, y_pred)}
        except ValueError as exc:
            # scikit-learn raises ValueError when estimator and target type do
            # not match; record it so one model cannot abort the comparison
            logging.error(f'{model_name} failed: {exc}')
            results[model_name] = {'error': str(exc)}
            continue

        if 'accuracy' in metrics:
            accuracy = metrics['accuracy']
            logging.info(f'{model_name} Accuracy: {accuracy}')
        else:
            logging.info(
                f"{model_name} R2: {metrics['r2']}, "
                f"MAE: {metrics['mae']}, MSE: {metrics['mse']}"
            )
        results[model_name] = metrics

    return results

In [None]:
# Build the train/test split and the preprocessing pipelines first: the names
# passed to model_evaluation are not defined anywhere else in the notebook,
# so a fresh kernel would otherwise stop here with a NameError.
X_train, X_test, y_train, y_test, numerical_pipeline, categorical_pipeline = data_building(df)

results = model_evaluation(X_train, X_test, y_train, y_test, numerical_pipeline, categorical_pipeline)
results
