In [None]:
Lenac Liju      :->     Personalized Health Status Prediction System Using Machine Learning

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load your dataset
df = pd.read_csv(r'C:\Users\lenac\OneDrive\Documents\data\ml&ai\major\CVD.csv')
df = df.head(1000)  # Use the first 1000 rows for analysis

In [3]:
# Define features and target
X = df.drop('General_Health', axis=1)
y = df['General_Health']

In [4]:
# Check class distribution
print("Class distribution:\n", y.value_counts())

Class distribution:
 General_Health
Good         341
Very Good    269
Fair         211
Excellent    107
Poor          72
Name: count, dtype: int64


In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Define categorical and numerical columns
categorical_cols = [
    'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer',
    'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Age_Category', 'Smoking_History'
]
numerical_cols = [
    'Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption',
    'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption'
]

In [9]:
categorical_cols , numerical_cols

(['Checkup',
  'Exercise',
  'Heart_Disease',
  'Skin_Cancer',
  'Other_Cancer',
  'Depression',
  'Diabetes',
  'Arthritis',
  'Sex',
  'Age_Category',
  'Smoking_History'],
 ['Height_(cm)',
  'Weight_(kg)',
  'BMI',
  'Alcohol_Consumption',
  'Fruit_Consumption',
  'Green_Vegetables_Consumption',
  'FriedPotato_Consumption'])

In [18]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

In [19]:
preprocessor

In [20]:
# Define the model with class balancing
model = RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced')

In [21]:
model

In [23]:
# Create the pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

In [24]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [25]:
# Evaluate on the test set
y_pred = pipeline.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 4  0  7  0  7]
 [ 0 13 13  1  4]
 [ 5 14 42  1 19]
 [ 0  6  6  1  1]
 [ 7  6 15  3 25]]
Classification Report:
               precision    recall  f1-score   support

   Excellent       0.25      0.22      0.24        18
        Fair       0.33      0.42      0.37        31
        Good       0.51      0.52      0.51        81
        Poor       0.17      0.07      0.10        14
   Very Good       0.45      0.45      0.45        56

    accuracy                           0.42       200
   macro avg       0.34      0.34      0.33       200
weighted avg       0.42      0.42      0.42       200



In [26]:
# Define the function to preprocess user input
def preprocess_user_input(user_input, preprocessor):
    input_df = pd.DataFrame([user_input])
    # Ensure the columns match the training data
    missing_cols = set(preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out()) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = [0] * len(input_df)
    input_preprocessed = preprocessor.transform(input_df)
    return input_preprocessed

In [27]:
# Define function to get user input
def get_user_input():
    # Collect user input
    checkup = input("Checkup (e.g., 'Within the past year'): ")
    exercise = input("Exercise (e.g., 'Yes' or 'No'): ")
    heart_disease = input("Heart Disease (e.g., 'Yes' or 'No'): ")
    skin_cancer = input("Skin Cancer (e.g., 'Yes' or 'No'): ")
    other_cancer = input("Other Cancer (e.g., 'Yes' or 'No'): ")
    depression = input("Depression (e.g., 'Yes' or 'No'): ")
    diabetes = input("Diabetes (e.g., 'Yes' or 'No'): ")
    arthritis = input("Arthritis (e.g., 'Yes' or 'No'): ")
    sex = input("Sex (e.g., 'Male' or 'Female'): ")
    age_category = input("Age Category (e.g., '70-74'): ")
    
    try:
        height_cm = float(input("Height (in cm): "))
        weight_kg = float(input("Weight (in kg): "))
        bmi = float(input("BMI: "))
    except ValueError:
        print("Error: Height, weight, and BMI should be numeric.")
        return None

    try:
        smoking_history = input("Smoking History (e.g., 'Yes' or 'No'): ")
        alcohol_consumption = float(input("Alcohol Consumption: "))
        fruit_consumption = float(input("Fruit Consumption: "))
        green_vegetables_consumption = float(input("Green Vegetables Consumption: "))
        fried_potato_consumption = float(input("Fried Potato Consumption: "))
    except ValueError:
        print("Error: Consumption values should be numeric.")
        return None

    return {
        'Checkup': checkup,
        'Exercise': exercise,
        'Heart_Disease': heart_disease,
        'Skin_Cancer': skin_cancer,
        'Other_Cancer': other_cancer,
        'Depression': depression,
        'Diabetes': diabetes,
        'Arthritis': arthritis,
        'Sex': sex,
        'Age_Category': age_category,
        'Height_(cm)': height_cm,
        'Weight_(kg)': weight_kg,
        'BMI': bmi,
        'Smoking_History': smoking_history,
        'Alcohol_Consumption': alcohol_consumption,
        'Fruit_Consumption': fruit_consumption,
        'Green_Vegetables_Consumption': green_vegetables_consumption,
        'FriedPotato_Consumption': fried_potato_consumption
    }


In [None]:
# Get user input
user_input = get_user_input()
if user_input:
    try:
        # Preprocess the user input
        user_input_processed = preprocess_user_input(user_input, pipeline.named_steps['preprocessor'])
        
        # Predict using the model
        prediction = pipeline.named_steps['classifier'].predict(user_input_processed)
        prediction_proba = pipeline.named_steps['classifier'].predict_proba(user_input_processed)

        print("\nPredicted Health Status:", prediction[0])
        print("Prediction Probabilities:", prediction_proba[0])
    except Exception as e:
        print(f"Error during prediction: {e}")
else:
    print("Invalid input provided. Please check the values and try again.")
