In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [3]:
df = pd.read_csv('/content/archive (14).zip')

In [4]:
df.shape

(374, 13)

In [5]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [7]:
df.isnull().sum()

Unnamed: 0,0
Person ID,0
Gender,0
Age,0
Occupation,0
Sleep Duration,0
Quality of Sleep,0
Physical Activity Level,0
Stress Level,0
BMI Category,0
Blood Pressure,0


In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


In [10]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [11]:
df = df.drop(columns=["Person ID"])

# Encode categorical variables
le_gender = LabelEncoder()
df["Gender"] = le_gender.fit_transform(df["Gender"])  # Male/Female

In [12]:
le_occ = LabelEncoder()
df["Occupation"] = le_occ.fit_transform(df["Occupation"])

le_bmi = LabelEncoder()
df["BMI Category"] = le_bmi.fit_transform(df["BMI Category"])

# Convert BP "126/83" → numeric average
df["Blood Pressure"] = df["Blood Pressure"].apply(lambda x: np.mean(list(map(int, x.split("/")))))


In [13]:
labeled_df = df[df["Sleep Disorder"].notna()]
unlabeled_df = df[df["Sleep Disorder"].isna()]

X = labeled_df.drop(columns=["Sleep Disorder"])
y = labeled_df["Sleep Disorder"]


In [14]:
le_target = LabelEncoder()
y = le_target.fit_transform(y)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [18]:
print("Train Accuracy:", clf.score(X_train, y_train))
print("Test Accuracy:", clf.score(X_test, y_test))

Train Accuracy: 0.9596774193548387
Test Accuracy: 0.8387096774193549


In [19]:
# predictiong on the missing dataset part
unlabeled_features = scaler.transform(unlabeled_df.drop(columns=["Sleep Disorder"]))
predicted_labels = clf.predict(unlabeled_features)

In [20]:
unlabeled_df.loc[:, "Sleep Disorder"] = le_target.inverse_transform(predicted_labels)

In [21]:
final_df = pd.concat([labeled_df, unlabeled_df]).sort_index()

print(final_df.head())

   Gender  Age  Occupation  Sleep Duration  Quality of Sleep  \
0       1   27           9             6.1                 6   
1       1   28           1             6.2                 6   
2       1   28           1             6.2                 6   
3       1   28           6             5.9                 4   
4       1   28           6             5.9                 4   

   Physical Activity Level  Stress Level  BMI Category  Blood Pressure  \
0                       42             6             3           104.5   
1                       60             8             0           102.5   
2                       60             8             0           102.5   
3                       30             8             2           115.0   
4                       30             8             2           115.0   

   Heart Rate  Daily Steps Sleep Disorder  
0          77         4200       Insomnia  
1          75        10000    Sleep Apnea  
2          75        10000    Sleep Ap