In [1]:
import pandas as pd
import numpy as np

In [2]:
df1=pd.read_csv("Final_Data.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 54 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             20000 non-null  float64
 1   Gender                          20000 non-null  object 
 2   Weight (kg)                     20000 non-null  float64
 3   Height (m)                      20000 non-null  float64
 4   Max_BPM                         20000 non-null  float64
 5   Avg_BPM                         20000 non-null  float64
 6   Resting_BPM                     20000 non-null  float64
 7   Session_Duration (hours)        20000 non-null  float64
 8   Calories_Burned                 20000 non-null  float64
 9   Workout_Type                    20000 non-null  object 
 10  Fat_Percentage                  20000 non-null  float64
 11  Water_Intake (liters)           20000 non-null  float64
 12  Workout_Frequency (days/week)   

In [4]:
columns=["Age","Gender","Weight (kg)","Height (m)","BMI","Water_Intake (liters)","Workout_Frequency (days/week)","Daily meals frequency","Calories"]

In [5]:
df=df1[columns].copy()

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            20000 non-null  float64
 1   Gender                         20000 non-null  object 
 2   Weight (kg)                    20000 non-null  float64
 3   Height (m)                     20000 non-null  float64
 4   BMI                            20000 non-null  float64
 5   Water_Intake (liters)          20000 non-null  float64
 6   Workout_Frequency (days/week)  20000 non-null  float64
 7   Daily meals frequency          20000 non-null  float64
 8   Calories                       20000 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 1.4+ MB


In [7]:
df["activity_score"]=df["Workout_Frequency (days/week)"] * 10
df["hydration_score"]=df["Water_Intake (liters)"] * 10
df["meal_frequency_score"]=df["Daily meals frequency"]*10



In [8]:
def categorize_bmi_age(row):
    # For those who are childrens / Teenager.
    age = row['Age']
    bmi = row['BMI']
    if age < 19:
        if bmi < 14:
            return "Underweight"
        elif bmi < 20:
            return "Normal"
        elif bmi < 25:
            return "Overweight"
        else:
            return "Obese"

    # For those who are Adults
    elif age <= 65:
        if bmi < 18.5:
            return "Underweight"
        elif bmi < 25:
            return "Normal"
        elif bmi < 30:
            return "Overweight"
        else:
            return "Obese"

    # Elderly (65+)
    else:
        if bmi < 22:
            return "Underweight"
        elif bmi < 28:
            return "Normal"
        elif bmi < 32:
            return "Overweight"
        else:
            return "Obese"
df['BMI_Category'] = df[['Age', 'BMI']].apply(categorize_bmi_age, axis=1)


In [9]:
bmi_score_map = {
    "Normal": 30,
    "Overweight": 20,
    "Underweight": 15,
    "Obese": 10
}

df["BMI_score"] = df["BMI_Category"].map(bmi_score_map)


In [10]:
# encode the categorical features
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
columns=["BMI_Category","Gender"]

for c in columns:
        df[c]=le.fit_transform(df[c])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            20000 non-null  float64
 1   Gender                         20000 non-null  int64  
 2   Weight (kg)                    20000 non-null  float64
 3   Height (m)                     20000 non-null  float64
 4   BMI                            20000 non-null  float64
 5   Water_Intake (liters)          20000 non-null  float64
 6   Workout_Frequency (days/week)  20000 non-null  float64
 7   Daily meals frequency          20000 non-null  float64
 8   Calories                       20000 non-null  int64  
 9   activity_score                 20000 non-null  float64
 10  hydration_score                20000 non-null  float64
 11  meal_frequency_score           20000 non-null  float64
 12  BMI_Category                   20000 non-null 

In [12]:
def calculate_bmr(weight_kg, height_cm, age, gender):
    if gender == 1:
        bmr = (10 * weight_kg) + (6.25 * height_cm) - (5 * age) + 5
    elif gender == 0:
        bmr = (10 * weight_kg) + (6.25 * height_cm) - (5 * age) - 161
    else:
        raise ValueError("Gender must be 'male' or 'female'")

    return bmr

In [13]:
def activity_multiplier(workout_days_per_week):
    if workout_days_per_week == 0:
        return 1.2
    elif 1 <= workout_days_per_week <= 2:
        return 1.375
    elif 3 <= workout_days_per_week <= 4:
        return 1.55
    elif 5 <= workout_days_per_week <= 6:
        return 1.725
    else:  # 7 days
        return 1.9


In [14]:
def calculate_tdee(weight_kg, height_cm, age, gender, workout_days_per_week):
    bmr = calculate_bmr(weight_kg, height_cm, age, gender)
    multiplier = activity_multiplier(workout_days_per_week)
    tdee = bmr * multiplier
    return round(tdee, 2)


In [15]:
df["TDEE"] = df.apply(
    lambda x: calculate_tdee(
        weight_kg=x["Weight (kg)"],
        height_cm=x["Height (m)"] * 100,
        age=x["Age"],
        gender=x["Gender"],
        workout_days_per_week=x["Workout_Frequency (days/week)"]
    ),
    axis=1
)


In [16]:
df["cal_balance"] = df["Calories"] - df["TDEE"]


In [17]:
def calorie_balance_score(cal_balance):
    abs_balance = abs(cal_balance)

    if abs_balance <= 200:
        return 30
    elif abs_balance <= 400:
        return 25
    elif abs_balance <= 600:
        return 20
    elif abs_balance <= 800:
        return 10
    else:
        return 5


In [18]:
df["calorie_score"] = df["cal_balance"].apply(calorie_balance_score)


np.float64(1.46)

In [19]:
df["wellness_score"]=(

    df["BMI_score"] + df["calorie_score"]+ df["meal_frequency_score"] +df["hydration_score"]+df["activity_score"]
)/10

In [20]:
df["wellness_score"].max()


np.float64(18.0)

In [21]:
def classify_wellness(score):
    if score <= 11:
        return  0 #"Unhealthy"
    elif score <= 14:
        return 1 #"At Risk"
    else:
        return 2 #"Healthy"


In [22]:
df["wellness_label"]=df["wellness_score"].apply(classify_wellness)


In [23]:
col=["Age",
     "Gender",
     "Weight (kg)",
     "Height (m)",
     "BMI",
     "Water_Intake (liters)",
     "Workout_Frequency (days/week)",
     "Daily meals frequency",
     "Calories",
    'TDEE',
    'cal_balance']
X=df[col]
y=df["wellness_label"]

In [24]:
y.value_counts()

wellness_label
1    11541
2     4353
0     4106
Name: count, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
    # ,stratify=y
)

In [26]:
numeric_features=[
     "Age",
     "Weight (kg)",
     "Height (m)",
     "BMI",
     "Water_Intake (liters)",
     "Workout_Frequency (days/week)",
     "Daily meals frequency",
     "Calories",
    'TDEE',
    'cal_balance']
categorical_features=["Gender"]

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'
)

In [28]:
X_train_new = preprocessor.fit_transform(X_train)
X_test_new = preprocessor.transform(X_test)


In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_num = scaler.fit_transform(X_train[numeric_features])
X_test_num = scaler.transform(X_test[numeric_features])


In [30]:
X_train_new = np.hstack([
    X_train_num,
    X_train[categorical_features].values
])

X_test_new = np.hstack([
    X_test_num,
    X_test[categorical_features].values
])

In [31]:
from sklearn.naive_bayes import GaussianNB

def build_naive_bayes():
    model=GaussianNB()
    model.fit(X_train,y_train)
    return model

In [32]:
from sklearn.linear_model import LogisticRegressionCV
def build_logistic_regression():
    model=LogisticRegressionCV(max_iter=3000)
    model.fit(X_train_new,y_train)
    return model


In [33]:
from sklearn.neighbors import KNeighborsClassifier

def build_KNN():
    model=KNeighborsClassifier()
    model.fit(X_train,y_train)
    return model

In [34]:
# 4. SVM
from  sklearn.svm import SVC

def build_SVM():
    model=SVC()
    model.fit(X_train,y_train)
    return model

In [35]:
from sklearn.tree import DecisionTreeClassifier

def build_decision_tree_model():
    model = DecisionTreeClassifier(max_depth=10)
    model.fit(X_train,y_train)
    return model

In [36]:
from sklearn.ensemble import RandomForestClassifier

def build_Random_forest():
# create an empty model
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train,y_train)
    return model

In [37]:
from sklearn.ensemble import AdaBoostClassifier

def build_adaboost_model():
    model = AdaBoostClassifier(n_estimators=50)
    model.fit(X_train,y_train)
    return model

In [38]:
from xgboost import XGBClassifier

def build_XGBoost_model():
    model=XGBClassifier(n_estimators=300)
    model.fit(X_train,y_train)
    return model

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report

def format_result(result):
    return f"{result * 100:.2f}"

def evaluate_model(model_info):
    model = model_info['model']
    name = model_info['name']
    print(type(model))

    # predict the test set
    y_pred = model.predict(X_test)
    
    #predict the accuracy for training set
    y_pred_train=model.predict(X_train)
    
    accuracy = format_result(accuracy_score(y_test, y_pred))
    precision = format_result(precision_score(y_test, y_pred,average="weighted"))
    recall = format_result(recall_score(y_test, y_pred,average="weighted"))
    f1 = format_result(f1_score(y_test, y_pred,average="weighted"))
    report = classification_report(y_test, y_pred)

    return accuracy,precision, recall, f1,report

In [40]:
models = [
    {"name": "KNN", "model": build_KNN()},
    {"name": "Logistic_Regression", "model": build_logistic_regression()},
    {"name": "Naive Bayes", "model":  build_naive_bayes()},
    {"name": "SVM", "model": build_SVM()},
    {"name": "Decision Tree", "model": build_decision_tree_model()},
    {"name": "Random Forest", "model":  build_Random_forest()},
    {"name": "Adaboost Model", "model":  build_adaboost_model()},
    {"name": "XGB Model", "model": build_XGBoost_model()}
]

In [41]:
evaluation_result = []
for model in models:
    report,accuracy, precision, recall, f1 = evaluate_model(model)
    evaluation_result.append({
       "model name": model['name'],
        "accuracy": accuracy, 
        "precision": precision, 
        "recall": recall,
        "f1 score": f1,
        "report" : report

    })
    

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>
<class 'sklearn.naive_bayes.GaussianNB'>
<class 'sklearn.svm._classes.SVC'>


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
<class 'xgboost.sklearn.XGBClassifier'>


In [42]:
evaluation_result_df = pd.DataFrame(evaluation_result)
evaluation_result_df

Unnamed: 0,model name,accuracy,precision,recall,f1 score,report
0,KNN,60.71,61.6,60.84,precision recall f1-score ...,61.6
1,Logistic_Regression,4.83,21.98,7.92,precision recall f1-score ...,21.98
2,Naive Bayes,77.48,77.08,76.47,precision recall f1-score ...,77.08
3,SVM,49.79,60.27,48.47,precision recall f1-score ...,60.27
4,Decision Tree,92.75,92.75,92.73,precision recall f1-score ...,92.75
5,Random Forest,96.42,96.42,96.41,precision recall f1-score ...,96.42
6,Adaboost Model,80.25,76.85,74.44,precision recall f1-score ...,76.85
7,XGB Model,98.12,98.12,98.12,precision recall f1-score ...,98.12


In [43]:
model= build_XGBoost_model()

In [45]:
import pickle

In [46]:
with open("Ml_model.pkl", "wb") as file:
    pickle.dump(model, file)