Task 3: Predict Patient Health Categories 

1. For the first step of preprocessing the dataset, I imported the libraries needed for this task and loaded the dataset into df1. The variable df holds the same data with the blood pressure column removed.

In [None]:
import pandas as pd   # This import will be used for data manipulation
import numpy as np   # This import will be used for working with arrays
import seaborn as sns   # This import will be used for data visualisation and dataset loading 
import matplotlib.pyplot as plt   # This import will be used for plotting
from sklearn.model_selection import train_test_split   # This import will be used for splittig data into train and test sets 
from sklearn.preprocessing import StandardScaler   # This import will be used for Scaling Features  

# Read the raw records from disk; df1 keeps the unmodified data.
df1 = pd.read_csv("test_data.csv")

# Blood pressure is recorded as systolic/diastolic rather than as a single
# number, so that column is left out of the working frame.
df = df1.drop(columns="vitals_blood_pressure")

# Show the first few rows of the dataset as a quick sanity check.
print("Dataset preview:\n", df.head())

2. To get an overview of the dataset, I used .info() and .describe() to inspect its structure and summary statistics.

In [None]:
# Basic information about the dataset: column names, data types and non-null counts.
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

# Statistical summary of the numerical columns
# (count, mean, std, min, 25%, 50%, 75%, max).
print(df.describe())

3. To be able to use numerical data only, any string data needs to be encoded first - in this case the sleep_quality column.  

In [None]:
# Integer code for each sleep quality category, ordered from worst to best.
sleep_quality_codes = {'poor':0, 'fair':1, 'good':2, 'excellent':3}

# Replace the text categories with their integer codes.
# Any value that is not a key of the mapping becomes NaN (pandas Series.map behaviour).
df['sleep_quality'] = df['sleep_quality'].map(sleep_quality_codes)

print(df)

4. The heart rate and temperature columns hold several readings per record, separated by the '|' symbol. Below, I split each entry on '|' and replaced it with the mean of its readings, so that every data point is a single number that is easier to work with.

In [None]:

def seperate_string(val):
    """Return the mean of the '|'-separated readings held in ``val``.

    Args:
        val: Either a string such as ``"70|72|75"`` or a single scalar reading.

    Returns:
        The mean of the readings, or ``np.nan`` when ``val`` cannot be
        interpreted as numbers (e.g. ``None`` or non-numeric text).
    """
    # A string may hold several readings; anything else is treated as one reading
    # so that a cell already parsed as a number is kept instead of becoming NaN.
    readings = val.split('|') if isinstance(val, str) else [val]
    try:
        return np.mean([float(x) for x in readings])
    except (TypeError, ValueError):
        # Only conversion failures mean "no usable reading"; other errors surface.
        return np.nan

# Collapse the multi-reading entries of each vitals column into their mean value.
for vitals_column in ('vitals_heart_rate', 'vitals_temperature'):
    df[vitals_column] = df[vitals_column].apply(seperate_string)

print("\nData with mean:")
print(df.head())

5. Now that the data has only numerical values and a single value for each data point, the next step was to label the data based on thresholds. The three labels are:
 - Good: Stable vitals, adequate sleep, and healthy activity
 - Moderate: Minor deviations in health indicators
 - Poor: Significant abnormalities, insufficient sleep, or low activity

Based on these labels, I created a points-based system to label each row of the data consistently.

In [None]:
def label_health(row):
    """Score one record on four indicators and return its health level.

    Each indicator contributes 2 points (good), 1 point (moderate) or 0 points:
      - vitals_heart_rate:   <= 70 -> 2, <= 75 -> 1
      - sleep_quality:       >= 2  -> 2, == 1  -> 1
      - sleep_interruptions: == 0  -> 2, 1 or 2 -> 1
      - activity_steps:      >= 6000 -> 2, >= 4000 -> 1

    Returns:
        2 (good) for a total of at least 8, 1 (moderate) for at least 5,
        otherwise 0 (poor). The maximum total is 8, so a "good" label needs
        the top score on every indicator.
    """
    def tier(is_good, is_moderate):
        # The good check takes priority, mirroring an if/elif chain.
        if is_good:
            return 2
        return 1 if is_moderate else 0

    heart_rate = row['vitals_heart_rate']
    quality = row['sleep_quality']
    interruptions = row['sleep_interruptions']
    steps = row['activity_steps']

    total = (
        tier(heart_rate <= 70, heart_rate <= 75)
        + tier(quality >= 2, quality == 1)
        + tier(interruptions == 0, interruptions == 1 or interruptions == 2)
        + tier(steps >= 6000, steps >= 4000)
    )

    if total >= 8:
        return 2   # Good health level
    if total >= 5:
        return 1   # Moderate health level
    return 0       # Poor health level
    
# Score every row and store the resulting class (0 = poor, 1 = moderate, 2 = good).
row_labels = df.apply(label_health, axis=1)
df['health_label'] = row_labels

print(df.head())

6. To prepare the data for machine learning models, I standardised the numerical features so that they are all on a comparable scale.

In [None]:
# Numerical columns to standardise; listed once and reused on both sides of the assignment.
columns_to_scale = [
    'vitals_heart_rate',
    'vitals_temperature',
    'sleep_duration_hours',
    'sleep_interruptions',
    'activity_steps',
    'activity_active_minutes',
    'activity_sedentary_hours',
    'nutrition_calories',
    'nutrition_water_oz',
    'nutrition_macros_carbs_g',
    'nutrition_macros_protein_g',
    'nutrition_macros_fat_g',
]

# Initialise the scaler.
scaler = StandardScaler()

# Scale the listed columns in place to achieve better accuracy when training models.
# NOTE(review): the scaler is fitted on the whole frame before the train/test split
# performed later in the file, so test rows influence the scaling statistics.
# Consider fitting on the training split only.
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

print(df)

7. Next, I split the data into training and testing sets, ready for training a classification model.

In [None]:
# The data is separated into input features (X) and the target (y).

# Columns given to the model as features.
feature_columns = [
    "vitals_heart_rate",
    "sleep_duration_hours",
    "sleep_interruptions",
    "activity_steps",
    "activity_active_minutes",
]
X = df[feature_columns]
y = df["health_label"]   # Target variable

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shape of each split (70% training, 30% testing).
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

8. Because this is a classification task, I chose Logistic Regression as the model to train. Below is the function that lays out the report, which shows scores such as:
 - precision 
 - recall
 - f1-score
 - support 
 Once the model is trained, I will print out the classification report to show the outcome. 

In [None]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report   #Imports for all important report metrics for classification tasks 

def print_score(clf, X_train, y_train, X_test, y_test, train = True):
    """Print the accuracy score and classification report for one data split.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.
        train: When truthy the training split is reported; otherwise the
            testing split is reported.
    """
    # Select the split once so the reporting code is written a single time.
    # A plain else (rather than `train == False`) guarantees that every call
    # produces a report instead of silently printing nothing.
    if train:
        header = "Train - Test Result:\n===================================================================="
        features, targets = X_train, y_train
    else:
        header = "Test - Test Result:\n===================================================================="
        features, targets = X_test, y_test

    pred = clf.predict(features)
    # output_dict=True lets the report be shown as a DataFrame;
    # zero_division=0 is passed through to classification_report.
    clf_report = pd.DataFrame(classification_report(targets, pred, output_dict = True, zero_division=0))

    print(header)
    print(f"Accuracy Score: {accuracy_score(targets, pred) * 100:.2f}%")
    print("____________________________________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")   # Shows values such as the F1-score, precision, recall and so on.


9. Below is the code that uses the training and testing sets to train the logistic regression model using an import from sklearn.linear_model. 

In [None]:
from sklearn.linear_model import LogisticRegression   # Import to train the logistic regression model

# Build the logistic regression model with the liblinear solver, then fit it on the training split.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

# The same four data arguments are passed to both report calls.
split_args = (X_train, y_train, X_test, y_test)

print_score(lr_clf, *split_args, train=True)   # Outcome on the training set
print("\n")                                    # Two blank lines between the reports
print_score(lr_clf, *split_args, train=False)  # Outcome on the testing set

10. To further show the outcome of the logistic regression model, below is the code to output the confusion matrix of the testing set. 

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train = True):
    """Print the confusion matrix for one data split.

    NOTE(review): this definition reuses the name ``print_score``, so once it
    runs it replaces the report-printing function defined earlier in the file.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.
        train: When truthy the training split is used; otherwise the testing
            split is used.
    """
    # Select the split once; a plain else (rather than `train == False`)
    # guarantees that every call prints a matrix.
    if train:
        features, targets = X_train, y_train
    else:
        features, targets = X_test, y_test

    pred = clf.predict(features)
    print("____________________________________________________________________")
    print(f"Confusion Matrix:\n {confusion_matrix(targets, pred)}\n")

In [None]:
# Heading, followed by the confusion matrix of the logistic regression model on the testing set.
print("\nLogistic Regression:")
print_score(
    lr_clf,
    X_train,
    y_train,
    X_test,
    y_test,
    train=False,
)