<a href="https://colab.research.google.com/github/Ambrgna/CS-5530-Assignment-1/blob/main/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

import numpy as np
import pandas as pd

Read Raw Data

In [3]:
# Location of the raw frailty CSV that this notebook loads.
url = "https://raw.githubusercontent.com/Ambrgna/CS-5530-Assignment-1/refs/heads/main/frailty_project/data_raw/raw_frailty_data.csv"
# Not read in this cell; presumably an alternate download link for the same data -- TODO confirm.
url2 = "https://drive.google.com/uc?export=download&id=1EcLvBbcmlzkkLjLnXrC91YiQhvPLIMwG"

# Load the raw table once; later cells derive their own copies from `df`.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=10)

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


Unit Standardization

In [9]:
# Conversion factors: 0.0254 metres per inch, 0.45359237 kilograms per pound.
M_PER_INCH = 0.0254
KG_PER_LB = 0.45359237

# Build a new frame (the raw `df` is left untouched): convert both columns,
# round to 2 decimals, then relabel them with their new units.
df_standardization = df.assign(
    Height=(df["Height"] * M_PER_INCH).round(2),
    Weight=(df["Weight"] * KG_PER_LB).round(2),
).rename(columns={"Height": "Height (m)", "Weight": "Weight (kg)"})

df_standardization.head(10)

Unnamed: 0,Height (m),Weight (kg),Age,Grip strength,Frailty
0,1.67,50.8,30,30,N
1,1.82,61.69,19,31,N
2,1.76,69.4,45,29,N
3,1.73,64.41,22,28,Y
4,1.72,65.32,29,24,Y
5,1.74,55.79,50,26,N
6,1.77,63.96,51,22,Y
7,1.78,61.69,23,20,Y
8,1.72,50.8,17,19,N
9,1.7,54.43,39,31,N


Feature Engineering

Add BMI and AgeGroup

In [10]:
# Start from a copy so the unit-standardised frame stays unchanged and this
# cell can be re-run safely.
df_feature = df_standardization.copy()

# BMI = weight (kg) / height (m)^2, rounded to 2 decimals.
# NOTE: the inputs were already rounded to 2 decimals when they were converted,
# so this value can differ slightly from a BMI computed on unrounded measurements.
bmi = (df_feature["Weight (kg)"] / (df_feature["Height (m)"] ** 2)).round(2)

# Place BMI immediately to the right of the weight column.
weight_index = df_feature.columns.get_loc("Weight (kg)")
df_feature.insert(weight_index + 1, "BMI", bmi)

# Age buckets. np.select returns the choice of the FIRST condition that is
# True, so each bucket only needs its upper bound. Compared with explicit
# closed ranges (30..45, 46..60) this gives identical labels for whole-number
# ages but leaves no gap for fractional ages such as 45.5, which would
# otherwise fall through to "Unknown".
age = df_feature["Age"]
conditions = [
    age < 30,
    age <= 45,
    age <= 60,
    age > 60,
]

choices = ["<30", "30-45", "46-60", ">60"]

age_index = df_feature.columns.get_loc("Age")

# Missing ages (NaN) satisfy none of the comparisons and get the default label.
agegroup = np.select(conditions, choices, default="Unknown")
df_feature.insert(age_index + 1, "AgeGroup (categorical)", agegroup)

# Relabel the age column with its unit (returns a new frame; no in-place mutation).
df_feature = df_feature.rename(columns={"Age": "Age (yr)"})

df_feature.head(10)

Unnamed: 0,Height (m),Weight (kg),BMI,Age (yr),AgeGroup (categorical),Grip strength,Frailty
0,1.67,50.8,18.22,30,30-45,30,N
1,1.82,61.69,18.62,19,<30,31,N
2,1.76,69.4,22.4,45,30-45,29,N
3,1.73,64.41,21.52,22,<30,28,Y
4,1.72,65.32,22.08,29,<30,24,Y
5,1.74,55.79,18.43,50,46-60,26,N
6,1.77,63.96,20.42,51,46-60,22,Y
7,1.78,61.69,19.47,23,<30,20,Y
8,1.72,50.8,17.17,17,<30,19,N
9,1.7,54.43,18.83,39,30-45,31,N


Categorical → Numeric Encoding

In [23]:
# Encode on a copy so df_feature keeps its original string labels.
df_binary = df_feature.copy()

# Frailty flag: "Y" -> 1, "N" -> 0, stored as int8.
frailty_codes = {"Y": 1, "N": 0}
df_binary["Frailty"] = df_binary["Frailty"].map(frailty_codes).astype("int8")

# Declare the full list of age buckets up front so that every bucket gets its
# own indicator column, including buckets that no row falls into.
all_age_groups = ["<30", "30-45", "46-60", ">60"]
age_group_column = "AgeGroup (categorical)"
df_binary[age_group_column] = pd.Categorical(
    df_binary[age_group_column],
    categories=all_age_groups,
)

# One-hot encode the age bucket into int8 columns named AgeGroup_<bucket>.
df_clean = pd.get_dummies(
    df_binary,
    columns=[age_group_column],
    prefix="AgeGroup",
    dtype="int8",
)

df_clean.head(10)

Unnamed: 0,Height (m),Weight (kg),BMI,Age (yr),Grip strength,Frailty,AgeGroup_<30,AgeGroup_30-45,AgeGroup_46-60,AgeGroup_>60
0,1.67,50.8,18.22,30,30,0,0,1,0,0
1,1.82,61.69,18.62,19,31,0,1,0,0,0
2,1.76,69.4,22.4,45,29,0,0,1,0,0
3,1.73,64.41,21.52,22,28,1,1,0,0,0
4,1.72,65.32,22.08,29,24,1,1,0,0,0
5,1.74,55.79,18.43,50,26,0,0,0,1,0
6,1.77,63.96,20.42,51,22,1,0,0,1,0
7,1.78,61.69,19.47,23,20,1,1,0,0,0
8,1.72,50.8,17.17,17,19,0,1,0,0,0
9,1.7,54.43,18.83,39,31,0,0,1,0,0


EDA & Reporting

In [27]:
# describe() summarises numeric columns only by default; keep the mean, the
# median (reported by describe() under the label "50%") and the standard deviation.
numeric_summary = (
    df_clean.describe()
    .T[["mean", "50%", "std"]]
    .rename(columns={"50%": "median"})
)

# Save to Markdown file
report_path = "reports/findings.md"

# Create the report's parent folder if it doesn't exist. The folder is derived
# from report_path so the two cannot drift apart; the guard covers a bare
# filename, for which dirname() returns "" and makedirs("") would raise.
report_dir = os.path.dirname(report_path)
if report_dir:
    os.makedirs(report_dir, exist_ok=True)

# Explicit UTF-8 so the report is written identically on every platform
# instead of depending on the locale's default encoding.
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# Findings Report\n\n")
    f.write("## Summary Statistics (Numeric Columns)\n\n")
    # to_markdown() relies on the optional `tabulate` package being installed.
    f.write(numeric_summary.to_markdown())

numeric_summary.head(10)


Unnamed: 0,mean,median,std
Height (m),1.741,1.735,0.043063
Weight (kg),59.829,61.69,6.457045
BMI,19.716,19.15,1.793911
Age (yr),32.5,29.5,12.860361
Grip strength,26.0,27.0,4.521553
Frailty,0.4,0.0,0.516398
AgeGroup_<30,0.5,0.5,0.527046
AgeGroup_30-45,0.3,0.0,0.483046
AgeGroup_46-60,0.2,0.0,0.421637
AgeGroup_>60,0.0,0.0,0.0
