In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score




In [2]:
from google.colab import files
uploaded = files.upload()

Saving CVD_cleaned(in).csv to CVD_cleaned(in).csv


In [6]:
data = pd.read_csv('CVD_cleaned(in).csv')

In [7]:
print(data.head())
print(data.columns)

  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \
0           Poor  Within the past 2 years       No            No          No   
1      Very Good     Within the past year       No           Yes          No   
2      Very Good     Within the past year      Yes            No          No   
3           Poor     Within the past year      Yes           Yes          No   
4           Good     Within the past year       No            No          No   

  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \
0           No         No       No       Yes  Female        70-74   
1           No         No      Yes        No  Female        70-74   
2           No         No      Yes        No  Female        60-64   
3           No         No      Yes        No    Male        75-79   
4           No         No       No        No    Male          80+   

   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \
0          150        32.66  

In [8]:
categorical_cols = ['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',
                    'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Age_Category', 'Smoking_History']
numerical_cols = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption',
                  'Green_Vegetables_Consumption', 'FriedPotato_Consumption']


In [9]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

In [10]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [12]:
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [13]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [14]:
X = data.drop('Heart_Disease', axis=1)
y = data['Heart_Disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [15]:
pipeline.fit(X_train, y_train)

In [16]:
y_pred = pipeline.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9181169157047806
              precision    recall  f1-score   support

          No       0.92      0.99      0.96     56783
         Yes       0.44      0.05      0.09      4988

    accuracy                           0.92     61771
   macro avg       0.68      0.52      0.52     61771
weighted avg       0.88      0.92      0.89     61771



In [18]:

new_data = {
    'General_Health': 'Good',
    'Checkup': 'Yes',
    'Exercise': 'Regular',
    'Skin_Cancer': 'No',
    'Other_Cancer': 'No',
    'Depression': 'No',
    'Diabetes': 'No',
    'Arthritis': 'No',
    'Sex': 'Male',
    'Age_Category': '35-44',
    'Smoking_History': 'Never',
    'Height_(cm)': 175,
    'Weight_(kg)': 70,
    'BMI': 22.9,
    'Alcohol_Consumption': 2,
    'Fruit_Consumption': 3,
    'Green_Vegetables_Consumption': 2,
    'FriedPotato_Consumption': 1
}

input_data = pd.DataFrame([new_data])

prediction = pipeline.predict(input_data)

if prediction[0] == 1:
    print("The patient is predicted to have cardiovascular risk.")
else:
    print("The patient is predicted to not have cardiovascular risk.")


The patient is predicted to not have cardiovascular risk.
