In [None]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive so files stored there can be read by path
# (the dataset loaded later lives under /content/drive/MyDrive/...).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np


# Archival dataset preparation
# ----------------------------
# Loads the raw sleep/lifestyle CSV, keeps only rows that meet the inclusion
# criteria, derives the binary IBS_C_Risk target from 'Sleep Disorder', and
# writes the result to a CSV for later merging with the primary dataset.

# 1. Initial Dataset Overview

# Raw sleep and lifestyle dataset (Kaggle export) stored on the mounted Drive
file_path = '/content/drive/MyDrive/MS THESIS/Sleep_health_and_lifestyle_dataset.csv'
df = pd.read_csv(file_path)

divider = "-" * 40

print("ARCHIVAL DATASET CLEANING PROCESS")
# Report the size of the raw table before any filtering
print(f"Initial Dataset Shape: {df.shape}")
print(f"Total starting records: {len(df)}")
print(divider)

# 2. Data Cleaning and Filtering (EDA)

# Step A: inclusion criteria -- occupation is one of the listed sedentary
# roles AND age lies in the closed interval [18, 54].
sedentary_roles = ['Software Engineer', 'Engineer', 'Accountant', 'Manager']
in_sedentary_role = df['Occupation'].isin(sedentary_roles)
in_age_range = df['Age'].between(18, 54)  # both endpoints inclusive
df = df[in_sedentary_role & in_age_range]

# Step B: report, then drop, rows with a missing value in any critical column.
critical_cols = ['Daily Steps', 'Stress Level', 'Heart Rate', 'Sleep Duration']
print("Checking for missing values in critical PERI columns:")
print(df[critical_cols].isna().sum())
df = df.dropna(subset=critical_cols)

# Step C: count fully duplicated rows. The count is only reported; no rows
# are removed here.
# NOTE(review): duplicated() without `subset` compares every column, so a
# unique per-row ID column (if the file has one) would force this count to 0
# -- confirm this is the intended check.
duplicate_count = df.duplicated().sum()
print(f"Checking for Duplicate Rows: {duplicate_count}")

# Step D: create the target variable by mapping 'Sleep Disorder' to IBS-C risk.
# Missing entries are relabelled 'None'; 'None' -> 0 (healthy), any other
# label -> 1 (at risk).
disorder_label = df['Sleep Disorder'].fillna('None')
df = df.assign(**{
    'Sleep Disorder': disorder_label,
    'IBS_C_Risk': disorder_label.ne('None').astype('int64'),
})

# 3. Cleaned Dataset Summary
print("CLEANING COMPLETE.")
print(divider)

# Final size of the table that goes forward to modelling
print(f"Final Cleaned Archival Shape: {df.shape}")
print(f"Final records available for Model Training (N): {len(df)}")

# Persist the cleaned table (without the index) for the later merge step.
# NOTE(review): the file name hard-codes n105; nothing here verifies that the
# cleaned frame actually has 105 rows -- confirm if the input data changes.
df.to_csv('cleaned_archival_n105.csv', index=False)
print("\nSuccess: Cleaned file saved as 'cleaned_archival_n105.csv'")

ARCHIVAL DATASET CLEANING PROCESS
Initial Dataset Shape: (374, 13)
Total starting records: 374
----------------------------------------
Checking for missing values in critical PERI columns:
Daily Steps       0
Stress Level      0
Heart Rate        0
Sleep Duration    0
dtype: int64
Checking for Duplicate Rows: 0
CLEANING COMPLETE.
----------------------------------------
Final Cleaned Archival Shape: (105, 14)
Final records available for Model Training (N): 105

Success: Cleaned file saved as 'cleaned_archival_n105.csv'
