<a href="https://colab.research.google.com/github/Aishwaryagoud27/wellbot/blob/main/wellbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw health & fitness records into a DataFrame.
file_path = "/content/health_fitness_dataset.csv"
df = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,stress_level,daily_steps,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,3,7128,1.5,19.6,69.5,110.7,72.9,,Never,0.04
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,7,7925,1.8,19.6,69.5,110.7,72.9,,Never,0.07
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,7,7557,2.7,19.6,69.5,110.7,72.9,,Never,0.09
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,8,11120,2.6,19.6,69.5,110.7,72.9,,Never,0.21
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1,5406,1.5,19.6,69.5,110.7,72.9,,Never,0.33


In [None]:
# Display basic information about the loaded dataset.
print("Dataset Info:\n")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nShape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377907 entries, 0 to 377906
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   participant_id            377907 non-null  int64  
 1   date                      377907 non-null  object 
 2   age                       377907 non-null  int64  
 3   gender                    377907 non-null  object 
 4   height_cm                 377907 non-null  float64
 5   weight_kg                 377907 non-null  float64
 6   activity_type             377907 non-null  object 
 7   duration_minutes          377907 non-null  int64  
 8   intensity                 377907 non-null  object 
 9   calories_burned           377907 non-null  float64
 10  avg_heart_rate            377907 non-null  int64  
 11  hours_sleep               377907 non-null  float64
 12  stress_level              377907 non-null  int64  
 13  daily_steps               377

In [None]:
# Flag every row that exactly repeats an earlier one, then report how many there are.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("\nDuplicate Rows:", duplicates)

# Keep only the first occurrence of each row.
df = df.drop_duplicates()


Duplicate Rows: 0


In [None]:
# Handle missing values (if any).

# Numeric columns: fill gaps with the column median.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical (object) columns: fill gaps with the most frequent value.
# NOTE(review): this applies to every text column, including `date` and
# `health_condition`. If a missing health_condition means "no condition"
# (TODO confirm against the data source), mode imputation relabels those
# participants with the most common condition; an explicit category may be
# more appropriate.
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    modes = df[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no most-frequent value to use,
        # so leave it untouched instead of failing on modes[0].
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Verify again
print("\nMissing Values After Cleaning: \n", df.isnull().sum())


Missing Values After Cleaning: 
 participant_id              0
date                        0
age                         0
gender                      0
height_cm                   0
weight_kg                   0
activity_type               0
duration_minutes            0
intensity                   0
calories_burned             0
avg_heart_rate              0
hours_sleep                 0
stress_level                0
daily_steps                 0
hydration_level             0
bmi                         0
resting_heart_rate          0
blood_pressure_systolic     0
blood_pressure_diastolic    0
health_condition            0
smoking_status              0
fitness_level               0
dtype: int64


In [None]:
# Remove outliers (IQR method).
# All fences are computed once, from the frame as it stands before filtering,
# and applied in a single pass. Filtering column by column inside a loop would
# recompute the quartiles on already-trimmed data, so the surviving rows would
# depend on the order of the columns.
# participant_id is left out: it looks like an identifier rather than a
# measurement (TODO confirm), so IQR fences on it carry no meaning.
outlier_cols = num_cols.difference(["participant_id"])
measurements = df[outlier_cols]
q1 = measurements.quantile(0.25)
q3 = measurements.quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# A row survives only if every measurement lies inside its column's fences.
# Comparisons against NaN are False, so rows with missing values are dropped too.
within_fences = (
    measurements.ge(lower, axis="columns")
    & measurements.le(upper, axis="columns")
)
df = df[within_fences.all(axis="columns")]
print("\nShape after removing outliers:", df.shape)

# Preview a few rows rather than rendering the whole frame.
df.head()


Shape after removing outliers: (351446, 22)


Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,stress_level,daily_steps,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,3,7128,1.5,19.6,69.5,110.7,72.9,Hypertension,Never,0.04
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,7,7925,1.8,19.6,69.5,110.7,72.9,Hypertension,Never,0.07
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,7,7557,2.7,19.6,69.5,110.7,72.9,Hypertension,Never,0.09
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,8,11120,2.6,19.6,69.5,110.7,72.9,Hypertension,Never,0.21
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1,5406,1.5,19.6,69.5,110.7,72.9,Hypertension,Never,0.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377902,1648,2024-06-26,64,F,160.9,91.7,Basketball,24,Low,4.9,...,3,8285,1.8,21.7,69.0,114.1,100.7,Hypertension,Never,11.35
377903,1648,2024-06-27,64,F,160.9,91.9,Basketball,104,Medium,25.4,...,1,9595,2.9,21.7,69.0,114.1,100.7,Hypertension,Never,11.48
377904,1648,2024-06-28,64,F,160.9,92.2,Basketball,30,Medium,7.4,...,5,10913,1.8,21.7,69.0,114.1,100.7,Hypertension,Never,11.51
377905,1648,2024-06-30,64,F,160.9,92.4,Swimming,95,Medium,20.4,...,5,11342,3.0,21.7,69.0,114.1,100.7,Hypertension,Never,11.63


In [None]:
# Encode categorical variables (if needed).
# drop_first=True drops one indicator per encoded column.
# NOTE(review): get_dummies expands every object column, and `date` is still an
# object column at this point, so each distinct date becomes its own indicator
# column. Consider parsing or excluding `date` before encoding -- TODO confirm
# the intended feature set.
df_encoded = pd.get_dummies(df, drop_first=True)

# Show the encoded frame (not the un-encoded `df`) so the result can be inspected.
df_encoded.head()

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,stress_level,daily_steps,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,3,7128,1.5,19.6,69.5,110.7,72.9,Hypertension,Never,0.04
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,7,7925,1.8,19.6,69.5,110.7,72.9,Hypertension,Never,0.07
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,7,7557,2.7,19.6,69.5,110.7,72.9,Hypertension,Never,0.09
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,8,11120,2.6,19.6,69.5,110.7,72.9,Hypertension,Never,0.21
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1,5406,1.5,19.6,69.5,110.7,72.9,Hypertension,Never,0.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377902,1648,2024-06-26,64,F,160.9,91.7,Basketball,24,Low,4.9,...,3,8285,1.8,21.7,69.0,114.1,100.7,Hypertension,Never,11.35
377903,1648,2024-06-27,64,F,160.9,91.9,Basketball,104,Medium,25.4,...,1,9595,2.9,21.7,69.0,114.1,100.7,Hypertension,Never,11.48
377904,1648,2024-06-28,64,F,160.9,92.2,Basketball,30,Medium,7.4,...,5,10913,1.8,21.7,69.0,114.1,100.7,Hypertension,Never,11.51
377905,1648,2024-06-30,64,F,160.9,92.4,Swimming,95,Medium,20.4,...,5,11342,3.0,21.7,69.0,114.1,100.7,Hypertension,Never,11.63


In [None]:
# Basic statistics and correlations

summary_stats = df.describe()
print("\nDescriptive Statistics:\n", summary_stats)

# Coerce every column to numeric (values that cannot be parsed become NaN),
# then discard the columns that ended up entirely NaN before correlating.
df_cleaned = df.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
correlations = df_cleaned.corr()
print("\nCorrelation Matrix: \n", correlations)


Descriptive Statistics:
        participant_id            age      height_cm      weight_kg  \
count   351446.000000  351446.000000  351446.000000  351446.000000   
mean       824.228411      41.647283     168.530259      93.995985   
std        474.721947      13.282376       9.071039      21.785558   
min          1.000000      18.000000     146.400000      45.300000   
25%        415.000000      31.000000     161.700000      77.700000   
50%        823.000000      42.000000     168.300000      93.800000   
75%       1234.000000      53.000000     175.200000     109.300000   
max       1648.000000      64.000000     194.300000     158.800000   

       duration_minutes  calories_burned  avg_heart_rate    hours_sleep  \
count     351446.000000    351446.000000   351446.000000  351446.000000   
mean          69.145157        14.526095      131.078467       7.046903   
std           28.887549         8.598385       17.315550       0.940449   
min           20.000000         0.800000   

In [None]:
# Save cleaned data.
# The file name is defined once so the path written and the path reported
# cannot drift apart.
output_path = "Cleaned_health_fitness_Dataset.csv"
df.to_csv(output_path, index=False)

# Quote the file name on both sides in the confirmation message.
print(f"\n Data cleaning completed and saved as '{output_path}'")


 Data cleaning completed and saved as 'Cleaned_health_fitness_Dataset.csv
