# **Data Creation**

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Banner printed once, before the synthetic dataset is generated below.
print("Data Generation:")

# Hour-of-day schedule as (start_hour, end_hour_exclusive, activity).
# Hours not covered by any window (20:00-23:59) fall back to "Resting".
_ACTIVITY_SCHEDULE = (
    (0, 6, "Sleeping"),
    (6, 9, "Walking"),
    (9, 12, "Resting"),
    (12, 14, "Running"),
    (14, 18, "Walking"),
    (18, 20, "Workout"),
)

# Mean heart rate (bpm) around which each activity's samples are drawn.
_BASE_HEART_RATE = {
    "Sleeping": 55,
    "Resting": 65,
    "Walking": 85,
    "Running": 130,
    "Workout": 120,
}

# Half-open [low, high) step range drawn for ONE sample of each activity.
# The ranges are per sample and are not rescaled when `interval` changes.
_STEP_RANGE = {
    "Sleeping": (0, 10),
    "Resting": (0, 10),
    "Walking": (30, 70),
    "Running": (100, 150),
    "Workout": (60, 120),
}

_COLUMNS = [
    "timestamp",
    "heart_rate_bpm",
    "sleep_hours",
    "step_count",
    "calories_burned",
    "activity_type",
]


def _activity_for_hour(hour):
    """Return the scheduled activity label for an hour of the day (0-23)."""
    for start, end, activity in _ACTIVITY_SCHEDULE:
        if start <= hour < end:
            return activity
    return "Resting"


def fitpulse_data(days=7, interval=5, seed=None, start_time=None,
                  output_path="Sample_data.csv") -> pd.DataFrame:
    """Generate a synthetic fitness-tracker dataset and optionally save it as CSV.

    One row is produced every ``interval`` minutes for ``days`` days. The
    activity of each row is decided by the hour of day; heart rate, step count
    and calories are then simulated from that activity.

    Calling ``fitpulse_data()`` with no arguments keeps the original behavior:
    7 days at 5-minute resolution starting at today's midnight, written to
    ``Sample_data.csv``.

    Parameters
    ----------
    days : int, default 7
        Number of days to simulate. Must be positive.
    interval : int, default 5
        Minutes between consecutive samples. Must be positive.
    seed : int or None, default None
        Seed for the random generator. Pass an int to get the same dataset on
        every run; ``None`` draws fresh entropy each time.
    start_time : datetime or None, default None
        Timestamp of the first sample. ``None`` means today at 00:00:00.
    output_path : str or None, default "Sample_data.csv"
        Where to write the CSV. ``None`` skips writing the file.

    Returns
    -------
    pandas.DataFrame
        Columns: timestamp (str, ``%Y-%m-%d %H:%M:%S``), heart_rate_bpm,
        sleep_hours, step_count, calories_burned, activity_type.

    Raises
    ------
    ValueError
        If ``days`` or ``interval`` is not positive.
    """
    if days <= 0:
        raise ValueError(f"days must be positive, got {days}")
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval}")

    if start_time is None:
        start_time = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    # A local generator keeps the data reproducible when a seed is given and
    # avoids touching numpy's global random state.
    rng = np.random.default_rng(seed)

    records = []
    sleep_minutes = 0  # length of the current uninterrupted sleep stretch

    for minute in range(0, days * 24 * 60, interval):
        timestamp = start_time + timedelta(minutes=minute)
        activity = _activity_for_hour(timestamp.hour)

        # Heart rate: activity baseline plus Gaussian noise, limited to 45-180 bpm.
        raw_hr = _BASE_HEART_RATE[activity] + float(rng.normal(0, 4))
        heart_rate = int(round(max(45, min(180, raw_hr))))

        # Sleep duration accumulates only while sleeping and resets otherwise.
        # Counting whole minutes avoids the drift of repeatedly adding a
        # fractional number of hours.
        if activity == "Sleeping":
            sleep_minutes += interval
            sleep_hours = round(sleep_minutes / 60, 2)
        else:
            sleep_minutes = 0
            sleep_hours = 0.0

        low, high = _STEP_RANGE[activity]
        steps = int(rng.integers(low, high))

        # Calories use the recorded (clamped, rounded) heart rate so the column
        # can be reproduced from the other columns in the saved file.
        calories = round(steps * 0.04 + heart_rate * 0.01, 2)

        records.append({
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "heart_rate_bpm": heart_rate,
            "sleep_hours": sleep_hours,
            "step_count": steps,
            "calories_burned": calories,
            "activity_type": activity,
        })

    df = pd.DataFrame(records, columns=_COLUMNS)

    if output_path is not None:
        df.to_csv(output_path, index=False)
    print("SAMPLE DATASET FOR THIS PROJECT")

    return df

# Build the dataset (this also writes the CSV) and show its first five rows.
data = fitpulse_data()
print(data.head(5))


Data Generation:
SAMPLE DATASET FOR THIS PROJECT
             timestamp  heart_rate_bpm  sleep_hours  step_count  \
0  2025-11-24 00:00:00              59         0.08           5   
1  2025-11-24 00:05:00              53         0.17           9   
2  2025-11-24 00:10:00              53         0.25           6   
3  2025-11-24 00:15:00              57         0.33           6   
4  2025-11-24 00:20:00              49         0.42           9   

   calories_burned activity_type  
0             0.79      Sleeping  
1             0.89      Sleeping  
2             0.77      Sleeping  
3             0.81      Sleeping  
4             0.85      Sleeping  


# **Data Preprocessing:**

## 1. Data Import and Validation:

In [None]:
# Load the dataset written by the data-creation step
df = pd.read_csv("Sample_data.csv")

# Validate basic info.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("=== Dataset Info ===")
df.info()

print("\n=== First 5 rows ===")
print(df.head())

print("\n=== Column names ===")
print(df.columns)

# Report fully identical rows, then drop them (the first occurrence is kept)
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")
df = df.drop_duplicates()


=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        2016 non-null   object 
 1   heart_rate_bpm   2016 non-null   int64  
 2   sleep_hours      2016 non-null   float64
 3   step_count       2016 non-null   int64  
 4   calories_burned  2016 non-null   float64
 5   activity_type    2016 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 94.6+ KB
None

=== First 5 rows ===
             timestamp  heart_rate_bpm  sleep_hours  step_count  \
0  2025-11-24 00:00:00              59         0.08           5   
1  2025-11-24 00:05:00              53         0.17           9   
2  2025-11-24 00:10:00              53         0.25           6   
3  2025-11-24 00:15:00              57         0.33           6   
4  2025-11-24 00:20:00              49         0.42           9   

   calories_

## 2. Timestamp Normalization:

In [None]:
# Parse the 'timestamp' strings into datetimes, then put the rows in
# chronological order with a fresh 0..n-1 index.
df = (
    df
    .assign(timestamp=pd.to_datetime(df['timestamp']))
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Sanity check: first and last instants covered by the data
print(f"\nTimestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}")



Timestamp range: 2025-11-24 00:00:00 to 2025-11-30 23:55:00


## 3. Missing Value Handling:

In [None]:
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Forward-fill gaps: each missing reading takes the last observed value.
# .ffill() replaces the deprecated fillna(method='ffill') spelling, which
# emits a FutureWarning on current pandas.
# Note: forward fill cannot fill a gap in the very first row (nothing precedes it).
numeric_cols = ['heart_rate_bpm', 'sleep_hours', 'step_count', 'calories_burned']
df[numeric_cols] = df[numeric_cols].ffill()

# For categorical column 'activity_type', fill with previous value
df['activity_type'] = df['activity_type'].ffill()

# Verify
print("\nMissing values after handling:")
print(df.isnull().sum())



Missing values per column:
timestamp          0
heart_rate_bpm     0
sleep_hours        0
step_count         0
calories_burned    0
activity_type      0
dtype: int64

Missing values after handling:
timestamp          0
heart_rate_bpm     0
sleep_hours        0
step_count         0
calories_burned    0
activity_type      0
dtype: int64


  df[numeric_cols] = df[numeric_cols].fillna(method='ffill')
  df['activity_type'] = df['activity_type'].fillna(method='ffill')


## 4. Outlier Detection:

In [None]:
# Realistic (lower, upper) limits per column; None leaves that side unbounded.
# Values outside the limits are pulled back to the nearest limit.
bounds = {
    'heart_rate_bpm': (40, 180),
    'step_count': (0, 300),
    'calories_burned': (0, None),
    'sleep_hours': (0, 12),
}
for column, (low, high) in bounds.items():
    df[column] = df[column].clip(lower=low, upper=high)

# Show the resulting min/max of every clipped column as a check
print("\nData ranges after outlier handling:")
range_columns = ['heart_rate_bpm', 'step_count', 'calories_burned', 'sleep_hours']
print(df[range_columns].agg(['min', 'max']))



Data ranges after outlier handling:
     heart_rate_bpm  step_count  calories_burned  sleep_hours
min              45           0             0.42          0.0
max             140         149             7.24          6.0


## 5. Resampling:

In [None]:
# Timestamp as index (required by resample).
# Guarded so the cell can be re-run: once 'timestamp' is the index it is no
# longer a column, and an unconditional set_index would raise KeyError.
if 'timestamp' in df.columns:
    df = df.set_index('timestamp')

numeric_cols = ['heart_rate_bpm', 'sleep_hours', 'step_count', 'calories_burned']

# Resample numeric columns: hourly mean of the samples in each hour
hourly_numeric = df[numeric_cols].resample('1h').mean()


def _most_frequent(values):
    """Return the most frequent value of a Series, or None when it is empty."""
    modes = values.mode()
    # An hour with no samples (a gap in the data) has no mode; indexing the
    # empty result would raise, so report the bin as missing instead.
    return modes.iloc[0] if not modes.empty else None


# Resample categorical column using mode
hourly_activity = df['activity_type'].resample('1h').agg(_most_frequent)

# Combine numeric + categorical columns
hourly_df = pd.concat([hourly_numeric, hourly_activity], axis=1)

print("\nResampled data:")
print(hourly_df.head())



Resampled data:
                     heart_rate_bpm  sleep_hours  step_count  calories_burned  \
timestamp                                                                       
2025-11-24 00:00:00       54.583333     0.541667    5.583333         0.769167   
2025-11-24 01:00:00       52.666667     1.541667    5.916667         0.763333   
2025-11-24 02:00:00       52.333333     2.541667    5.166667         0.730000   
2025-11-24 03:00:00       52.583333     3.541667    4.083333         0.689167   
2025-11-24 04:00:00       53.500000     4.541667    4.250000         0.705000   

                    activity_type  
timestamp                          
2025-11-24 00:00:00      Sleeping  
2025-11-24 01:00:00      Sleeping  
2025-11-24 02:00:00      Sleeping  
2025-11-24 03:00:00      Sleeping  
2025-11-24 04:00:00      Sleeping  


## 6. Data Quality Assessment:

In [None]:
# Check total records
print(f"\nTotal records: {len(df)}")

# Check for duplicates again.
# duplicated() compares column values only and ignores the index. When
# 'timestamp' is the index, rows recorded at different times but with equal
# readings would be counted as duplicates, so bring it back as a column first.
records = df.reset_index() if df.index.name == 'timestamp' else df
print(f"Duplicate rows: {records.duplicated().sum()}")

# Basic statistics
print("\nNumeric column statistics:")
print(df.describe())

# Check distribution of activity types
print("\nActivity type counts:")
print(df['activity_type'].value_counts())

# Final check: report whether any missing values remain
if df.isnull().sum().sum() == 0:
    print("\nNo missing values left — Data quality looks good!")
else:
    print("\nWarning: There are still missing values!")



Total records: 2016
Duplicate rows: 665

Numeric column statistics:
       heart_rate_bpm  sleep_hours   step_count  calories_burned
count     2016.000000  2016.000000  2016.000000      2016.000000
mean        78.343254     0.760417    34.687004         2.170898
std         24.258456     1.576654    39.561665         1.811921
min         45.000000     0.000000     0.000000         0.420000
25%         60.000000     0.000000     4.000000         0.770000
50%         69.000000     0.000000     9.000000         0.970000
75%         87.000000     0.020000    58.000000         3.180000
max        140.000000     6.000000   149.000000         7.240000

Activity type counts:
activity_type
Walking     588
Resting     588
Sleeping    504
Running     168
Workout     168
Name: count, dtype: int64

No missing values left — Data quality looks good!
