In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [119]:
# Load the data
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


# Inspect the dataset

In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [121]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


# Handling missing values

In [122]:
# Fill missing values or drop rows/columns
# Check for missing values
print(df.isnull().sum())

Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64


In [123]:
# Calculate the percentage of missing values for each column
missing_percentage = df['Sleep Disorder'].isnull().mean() * 100

# Display the missing percentages
print(missing_percentage)

58.55614973262032


In [124]:
# Create a new category 'Unknown' for missing values
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('Unknown')

print("\nDistribution after adding 'Unknown' category:")
print(df['Sleep Disorder'].value_counts())


Distribution after adding 'Unknown' category:
Sleep Disorder
Unknown        219
Sleep Apnea     78
Insomnia        77
Name: count, dtype: int64


In [125]:
# Evaluate the impact on the dataset
print("\nPercentage of missing values handled:")
print(df['Sleep Disorder'].isnull().mean() * 100)


Percentage of missing values handled:
0.0


In [126]:
print(df.isnull().sum())

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64


In [127]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,Unknown
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,Unknown
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,Unknown
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [128]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df_1 = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [129]:
df_1.head()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Gender_Male,Occupation_Doctor,...,Blood Pressure_131/86,Blood Pressure_132/87,Blood Pressure_135/88,Blood Pressure_135/90,Blood Pressure_139/91,Blood Pressure_140/90,Blood Pressure_140/95,Blood Pressure_142/92,Sleep Disorder_Sleep Apnea,Sleep Disorder_Unknown
0,1,27,6.1,6,42,6,77,4200,True,False,...,False,False,False,False,False,False,False,False,False,True
1,2,28,6.2,6,60,8,75,10000,True,True,...,False,False,False,False,False,False,False,False,False,True
2,3,28,6.2,6,60,8,75,10000,True,True,...,False,False,False,False,False,False,False,False,False,True
3,4,28,5.9,4,30,8,85,3000,True,False,...,False,False,False,False,False,True,False,False,True,False
4,5,28,5.9,4,30,8,85,3000,True,False,...,False,False,False,False,False,True,False,False,True,False


In [130]:
# Identify numeric columns 
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(numeric_cols)

Index(['Person ID', 'Age', 'Sleep Duration', 'Quality of Sleep',
       'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps'],
      dtype='object')


In [131]:
scaler = StandardScaler()

In [132]:
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [133]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,-1.727426,Male,-1.753096,Software Engineer,-1.298887,-1.09828,-0.825418,0.347021,Overweight,126/83,1.654719,-1.619584,Unknown
1,-1.718163,Male,-1.637643,Doctor,-1.173036,-1.09828,0.039844,1.475592,Normal,125/80,1.170474,1.970077,Unknown
2,-1.708901,Male,-1.637643,Doctor,-1.173036,-1.09828,0.039844,1.475592,Normal,125/80,1.170474,1.970077,Unknown
3,-1.699639,Male,-1.637643,Sales Representative,-1.550588,-2.771424,-1.40226,1.475592,Obese,140/90,3.591698,-2.362273,Sleep Apnea
4,-1.690376,Male,-1.637643,Sales Representative,-1.550588,-2.771424,-1.40226,1.475592,Obese,140/90,3.591698,-2.362273,Sleep Apnea


In [134]:
# identifying target(dependent) and feature(independent) vaariable
X = df.drop('Sleep Disorder', axis=1)  # Features: all columns except 'Sleep Disorder'
y = df['Sleep Disorder']  # Target: 'Sleep Disorder' column

In [137]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the preprocessing
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

X_train.head()
y_train.head()

Training features shape: (299, 12)
Testing features shape: (75, 12)
Training target shape: (299,)
Testing target shape: (75,)


192       Insomnia
75         Unknown
84         Unknown
362    Sleep Apnea
16     Sleep Apnea
Name: Sleep Disorder, dtype: object