# SleepSense: Sleep Quality Analysis and Prediction

This Jupyter notebook analyzes wearable sensor data to predict sleep quality using **Pandas**, **NumPy**, and **scikit-learn**. It includes:
- Generating a sample dataset with 1000 rows.
- Preprocessing data (cleaning and feature engineering).
- Statistical analysis (correlations and summaries).
- Predicting sleep quality using a Logistic Regression classifier (scikit-learn).

## Dataset
The generated dataset (`data/sleep_data.csv`) contains:
- `timestamp`: Date and time of record (datetime).
- `heart_rate`: Heart rate in beats per minute (float, 50-100).
- `sleep_duration`: Sleep duration in hours (float, 4-9).
- `body_movement`: Body movement intensity (float, 0-30).
- `sleep_quality`: Sleep quality label (string: good, average, poor).

## Outputs
- Dataset previews (raw and cleaned).
- Correlation matrix of numeric features.
- Statistical summary by sleep quality.
- Predicted sleep quality with accuracy.

In [21]:
# Import required libraries
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Seed NumPy's global random generator for reproducibility.
# generate_data() draws every synthetic value through np.random.*, so this must
# run before it for the dataset to be identical from run to run.
np.random.seed(42)

# Step 1: Generate sample dataset with realistic patterns
def generate_data(n=1000, file_path='data/sleep_data.csv'):
    """Generate a synthetic wearable-sensor dataset and save it to CSV.

    Values are drawn from NumPy's global random generator (np.random.*), so the
    result is reproducible only if the caller seeds that generator beforehand.

    Args:
        n: Number of hourly rows to generate (default 1000).
        file_path: Destination CSV path (default 'data/sleep_data.csv'). The
            parent directory is created if it does not exist.

    Returns:
        DataFrame with columns timestamp, sleep_duration, sleep_quality,
        heart_rate and body_movement (in that order).
    """
    out_dir = os.path.dirname(file_path)
    if out_dir:  # a bare filename has no directory component to create
        os.makedirs(out_dir, exist_ok=True)

    data = {
        # Hourly timestamps. An explicit offset object is used instead of the
        # string alias 'H', which newer pandas releases deprecate (FutureWarning).
        'timestamp': pd.date_range(start='2023-01-01', periods=n, freq=pd.offsets.Hour()),
        'sleep_duration': np.random.uniform(4, 9, n),  # Sleep duration: 4-9 hours
    }
    df = pd.DataFrame(data)

    # Draw the label first; heart_rate and body_movement are then sampled from
    # label-specific distributions so the features carry signal about the label.
    df['sleep_quality'] = np.random.choice(['good', 'average', 'poor'], n, p=[0.4, 0.4, 0.2])
    is_good = df['sleep_quality'] == 'good'
    is_average = df['sleep_quality'] == 'average'

    # A full-length candidate array is drawn for every class and np.where picks
    # per row. The draw order (good, average, poor) is kept fixed so a given
    # seed always yields the same dataset.
    hr_good = np.random.normal(55, 5, n).clip(50, 70)
    hr_average = np.random.normal(65, 7, n).clip(50, 80)
    hr_poor = np.random.normal(75, 8, n).clip(60, 100)
    df['heart_rate'] = np.where(is_good, hr_good, np.where(is_average, hr_average, hr_poor))

    move_good = np.random.uniform(0, 10, n)
    move_average = np.random.uniform(5, 20, n)
    move_poor = np.random.uniform(15, 30, n)
    df['body_movement'] = np.where(is_good, move_good, np.where(is_average, move_average, move_poor))

    df.to_csv(file_path, index=False)  # Save dataset to CSV
    print(f"Dataset saved to '{file_path}'.")
    return df


In [22]:
# Step 2: Load and preprocess data
def load_data(file_path='data/sleep_data.csv'):
    """Read the sleep dataset from a CSV file and parse its timestamps.

    Args:
        file_path: Path of the CSV to read (default 'data/sleep_data.csv').

    Returns:
        DataFrame with the file's columns, where 'timestamp' has been
        converted with pd.to_datetime.
    """
    raw = pd.read_csv(file_path)
    # read_csv is not asked to parse dates here, so convert the column explicitly.
    parsed_timestamps = pd.to_datetime(raw['timestamp'])
    return raw.assign(timestamp=parsed_timestamps)

def clean_data(df):
    """Fill missing numeric values and drop rows outside plausible ranges.

    The input frame is left untouched; all work happens on a copy.

    Steps:
        1. Missing values in heart_rate, sleep_duration and body_movement are
           replaced by that column's mean (computed before any row is removed).
        2. Rows are kept only if heart_rate is in [40, 120], sleep_duration is
           in [0, 12] and body_movement is non-negative.

    Args:
        df: DataFrame containing at least the three numeric columns above.

    Returns:
        A new DataFrame with the surviving rows; original index labels are kept.
    """
    numeric_cols = ['heart_rate', 'sleep_duration', 'body_movement']

    # Copy first: assigning the filled columns directly onto `df` would silently
    # modify the caller's frame.
    cleaned = df.copy()
    cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].mean())

    # Series.between is inclusive on both ends, matching the >= / <= bounds.
    keep = (
        cleaned['heart_rate'].between(40, 120)
        & cleaned['sleep_duration'].between(0, 12)
        & (cleaned['body_movement'] >= 0)
    )

    # Return an independent frame so that later column assignments on the result
    # do not trigger SettingWithCopyWarning.
    return cleaned.loc[keep].copy()

def add_features(df):
    """Add the daily average heart rate and a z-scored sleep duration.

    The input frame is not modified. Calling the function again on its own
    output simply recomputes the two feature columns.

    New columns:
        daily_avg_heart_rate: mean heart_rate over all rows that share the same
            calendar day of 'timestamp'.
        norm_sleep_duration: (sleep_duration - mean) / std over the whole frame
            (std is pandas' default sample standard deviation).

    Args:
        df: DataFrame with a datetime 'timestamp' column plus 'heart_rate' and
            'sleep_duration'.

    Returns:
        A new DataFrame with the two columns appended and a fresh 0..n-1 index.
    """
    # Group on the timestamp truncated to midnight. Using an external key with
    # transform avoids writing a temporary 'date' column onto the caller's frame
    # and avoids a merge.
    day = df['timestamp'].dt.normalize()
    daily_avg_hr = df.groupby(day)['heart_rate'].transform('mean')

    sleep = df['sleep_duration']
    norm_sleep = (sleep - sleep.mean()) / sleep.std()

    enriched = df.assign(
        daily_avg_heart_rate=daily_avg_hr,
        norm_sleep_duration=norm_sleep,
    )
    # Downstream code sees a default RangeIndex regardless of the input's index.
    return enriched.reset_index(drop=True)

In [23]:
# Step 3: Analyze data
def calculate_correlations(df):
    """Return the pairwise correlation matrix of the numeric feature columns.

    Uses DataFrame.corr() with its default method (Pearson) on heart_rate,
    sleep_duration, body_movement, daily_avg_heart_rate and
    norm_sleep_duration; any other columns in `df` are ignored.
    """
    columns_of_interest = [
        'heart_rate',
        'sleep_duration',
        'body_movement',
        'daily_avg_heart_rate',
        'norm_sleep_duration',
    ]
    return df.loc[:, columns_of_interest].corr()

def summarize_by_quality(df):
    """Report mean and standard deviation of each sensor column per sleep-quality label.

    Returns a DataFrame indexed by sleep_quality whose columns are a two-level
    (column, statistic) index over heart_rate, sleep_duration and
    body_movement, with all values rounded to 2 decimal places.
    """
    measured = ('heart_rate', 'sleep_duration', 'body_movement')
    # Same pair of statistics for every measured column.
    spec = {column: ['mean', 'std'] for column in measured}
    per_quality = df.groupby('sleep_quality').agg(spec)
    return per_quality.round(2)


In [24]:
# Step 4: Prepare data for modeling
def prepare_data_for_model(df):
    """Build the standardized feature matrix and label vector for modeling.

    Args:
        df: DataFrame holding the five feature columns and 'sleep_quality'.

    Returns:
        (X_scaled, y, scaler): the StandardScaler-transformed feature array,
        the 'sleep_quality' Series, and the fitted scaler so the same
        transformation can be applied to other data later.

    NOTE(review): the scaler is fitted on every row passed in. When the caller
    splits into train/test afterwards, test rows have already influenced the
    scaling statistics.
    """
    feature_names = [
        'heart_rate',
        'sleep_duration',
        'body_movement',
        'daily_avg_heart_rate',
        'norm_sleep_duration',
    ]
    feature_frame = df[feature_names]
    labels = df['sleep_quality']

    # Fit first, then transform with the fitted scaler.
    scaler = StandardScaler().fit(feature_frame)
    scaled = scaler.transform(feature_frame)
    return scaled, labels, scaler


In [25]:
# Step 5: Train and predict with Logistic Regression
def train_predict_model(X, y):
    """Fit a Logistic Regression classifier and score it on a held-out split.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of X.

    Returns:
        (model, accuracy, y_pred, y_test): the fitted classifier, its accuracy
        on the held-out 20% of rows, the predictions for those rows, and their
        true labels.
    """
    # 80/20 split; the fixed random_state keeps the partition reproducible.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so construction and training chain.
    classifier = LogisticRegression(max_iter=1000, random_state=42).fit(X_fit, y_fit)

    holdout_predictions = classifier.predict(X_holdout)
    holdout_accuracy = accuracy_score(y_holdout, holdout_predictions)
    return classifier, holdout_accuracy, holdout_predictions, y_holdout


In [26]:
# Step 6: Apply predictions to entire dataset
def apply_predictions(df, model, scaler):
    """Predict sleep quality for every row and return it as a new column.

    The input frame is not modified; a new DataFrame is returned with a
    'predicted_quality' column added (or replaced, if it already exists).

    Args:
        df: DataFrame containing the five feature columns used for training.
        model: Fitted estimator exposing predict(X).
        scaler: Fitted transformer exposing transform(X); it should be the one
            used to scale the training features.

    Returns:
        Copy of `df` with the 'predicted_quality' column.
    """
    features = ['heart_rate', 'sleep_duration', 'body_movement', 'daily_avg_heart_rate', 'norm_sleep_duration']
    X_scaled = scaler.transform(df[features])
    # assign() builds a new frame instead of writing the column onto the caller's.
    return df.assign(predicted_quality=model.predict(X_scaled))


In [27]:
# Execute the pipeline
# Generate the synthetic dataset (generate_data also writes it to
# data/sleep_data.csv), then rebind `df` to the copy read back from disk so the
# rest of the notebook works on exactly what the CSV contains.
df = generate_data()
df = load_data()  # Reload from the CSV; timestamps are re-parsed by load_data
print('Raw Data:')
df.head()


Dataset saved to 'data/sleep_data.csv'.
Raw Data:


  'timestamp': pd.date_range(start='2023-01-01', periods=n, freq='H'),  # Hourly timestamps


Unnamed: 0,timestamp,sleep_duration,sleep_quality,heart_rate,body_movement
0,2023-01-01 00:00:00,5.872701,good,50.610087,2.571524
1,2023-01-01 01:00:00,8.753572,average,67.727299,7.470963
2,2023-01-01 02:00:00,7.65997,poor,60.279378,20.01323
3,2023-01-01 03:00:00,6.993292,average,68.742404,9.308865
4,2023-01-01 04:00:00,4.780093,poor,80.124343,20.115718


In [28]:
# Preprocess data: fill/filter the raw values, then derive the extra features.
# `df` is rebound at each step, so re-running this cell on its own feeds the
# already-processed frame back in; re-run from the load cell for a clean result.
df = clean_data(df)
df = add_features(df)
print('\nCleaned Data with New Features:')
df.head()


Cleaned Data with New Features:


Unnamed: 0,timestamp,sleep_duration,sleep_quality,heart_rate,body_movement,daily_avg_heart_rate,norm_sleep_duration
0,2023-01-01 00:00:00,5.872701,good,50.610087,2.571524,64.514891,-0.396103
1,2023-01-01 01:00:00,8.753572,average,67.727299,7.470963,64.514891,1.576169
2,2023-01-01 02:00:00,7.65997,poor,60.279378,20.01323,64.514891,0.827479
3,2023-01-01 03:00:00,6.993292,average,68.742404,9.308865,64.514891,0.371065
4,2023-01-01 04:00:00,4.780093,poor,80.124343,20.115718,64.514891,-1.144112


In [29]:
# Analyze data: correlation matrix of the numeric features.
# The call is the cell's last expression so the matrix is shown as rich output.
print('\nCorrelation Matrix:')
calculate_correlations(df)



Correlation Matrix:


Unnamed: 0,heart_rate,sleep_duration,body_movement,daily_avg_heart_rate,norm_sleep_duration
heart_rate,1.0,-0.019828,0.641109,0.189257,-0.019828
sleep_duration,-0.019828,1.0,-0.000923,-0.060248,1.0
body_movement,0.641109,-0.000923,1.0,0.095921,-0.000923
daily_avg_heart_rate,0.189257,-0.060248,0.095921,1.0,-0.060248
norm_sleep_duration,-0.019828,1.0,-0.000923,-0.060248,1.0


In [30]:
# Mean and standard deviation of each sensor column, per sleep-quality label.
print('\nSummary by Sleep Quality:')
print(summarize_by_quality(df))




Summary by Sleep Quality:
              heart_rate       sleep_duration       body_movement      
                    mean   std           mean   std          mean   std
sleep_quality                                                          
average            65.50  7.12           6.53  1.47         12.48  4.32
good               55.33  4.36           6.42  1.46          5.05  2.83
poor               74.76  7.79           6.37  1.44         22.50  4.17


In [31]:
# Prepare data and train model
# prepare_data_for_model scales the features over all rows; train_predict_model
# then holds out 20% of them and reports accuracy on that held-out part.
X, y, scaler = prepare_data_for_model(df)
model, accuracy, y_pred, y_test = train_predict_model(X, y)
print(f'\nModel Accuracy on Test Set: {accuracy:.2%}')



Model Accuracy on Test Set: 85.00%


In [32]:
# Apply predictions to entire dataset (training and held-out rows alike),
# reusing the scaler fitted during preparation, and preview true vs. predicted labels.
df = apply_predictions(df, model, scaler)
print('\nData with Predicted Sleep Quality:')
print(df[['timestamp', 'sleep_quality', 'predicted_quality']].head())



Data with Predicted Sleep Quality:
            timestamp sleep_quality predicted_quality
0 2023-01-01 00:00:00          good              good
1 2023-01-01 01:00:00       average           average
2 2023-01-01 02:00:00          poor           average
3 2023-01-01 03:00:00       average           average
4 2023-01-01 04:00:00          poor              poor


In [33]:
# Calculate overall prediction accuracy: share of rows whose predicted label
# matches the true label.
# NOTE: this covers every row, including the 80% the model was trained on, so it
# is not a held-out estimate; the test-set accuracy printed earlier is the
# figure to use for judging generalization.
overall_accuracy = (df['sleep_quality'] == df['predicted_quality']).mean()
print(f'\nOverall Prediction Accuracy: {overall_accuracy:.2%}')


Overall Prediction Accuracy: 87.00%
