# SLEEP QUALITY ANALYSIS 


### 1. Data Collection
The data comes from the "Sleep Health and Lifestyle Dataset" available on Kaggle. The dataset contains health and lifestyle factors related to sleep quality, including age, gender, occupation, sleep duration, quality of sleep, physical activity level, stress level, BMI category, blood pressure, heart rate, daily steps, and sleep disorder.



### 2. Data Cleaning
Checked for missing values in the dataset using `isnull().sum()`. The missing entries in the 'Sleep Disorder' column were filled with a placeholder value using the `fillna()` function in Pandas, so that those rows are treated as having no sleep disorder.




### 3. Feature Engineering
Split the 'Blood Pressure' column into 'systolic_pressure' and 'diastolic_pressure' for easier analysis.

Converted the 'Gender' column to numerical values using dummy variables.

Created new columns 'BMI_overweight' and 'BMI_Normal' based on the 'BMI Category' values.

Created a binary 'Sleep Disorder Status' column: 1 if the recorded sleep disorder is Sleep Apnea or Insomnia, 0 otherwise.



### 4. Data visualization

Analyzed the correlations between the features using a correlation matrix.

Visualized the relationship between each feature and sleep quality using scatter plots with regression lines.


### 5. Model Training and Selection
Split the dataset into training and testing sets using `train_test_split` from sklearn.

Tried multiple regression models including Linear Regression and Lasso Regression.

Compared the performance of these models using R-squared.




In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the sleep/lifestyle CSV (path is relative to the working directory)
# and preview the first rows.
DATASET_CSV = "Sleep_health_and_lifestyle_dataset.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

## DATA CLEANING

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# Print a per-column summary (dtype and non-null count) of the DataFrame.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Fill the missing 'Sleep Disorder' entries with the string "None" rather than
# the integer 0, so the column keeps a single (string) type instead of mixing
# strings and ints. The later 'Sleep Disorder Status' flag only tests for
# 'Sleep Apnea' / 'Insomnia', so it is unaffected by the placeholder.
# NOTE(review): the gaps presumably come from pandas parsing the CSV's literal
# "None" label as missing — verify against the raw file.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna("None")

## CATEGORICAL DATA ANALYSIS

In [None]:
# Preview the first rows after the missing-value fill.
df.head()

In [None]:
# 'Blood Pressure' holds text of the form "<systolic>/<diastolic>".
# Split it once on '/', then store each half as its own integer column.
bp_parts = df['Blood Pressure'].str.split('/')
df['systolic_pressure'] = bp_parts.str[0].astype(int)
df['diastolic_pressure'] = bp_parts.str[1].astype(int)
df

In [None]:
# One-hot encode 'Gender' into 0/1 integer indicator columns (one per distinct
# value) and append them to df as extra columns.
# NOTE(review): every indicator column is kept, so together they always sum
# to 1 — worth keeping in mind when reading linear-model coefficients.
gender_dummies_df = pd.get_dummies(df['Gender']).astype(int)
df = pd.concat((df, gender_dummies_df), axis='columns')
df.head()

In [None]:
# List the distinct 'BMI Category' labels before grouping them into flags.
df['BMI Category'].unique()

In [None]:
# Collapse 'BMI Category' into two 0/1 indicator columns:
#   BMI_overweight -> 1 for 'Overweight' or 'Obese'
#   BMI_Normal     -> 1 for 'Normal' or 'Normal Weight'
bmi_category = df['BMI Category']
df['BMI_overweight'] = bmi_category.isin(['Overweight', 'Obese']).astype(int)
df['BMI_Normal'] = bmi_category.isin(['Normal', 'Normal Weight']).astype(int)
df.head()

# Feature columns intended for modelling, in order:
# ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level',
#  'Heart Rate', 'Daily Steps', 'systolic_pressure', 'diastolic_pressure',
#  'Female', 'Male', 'BMI_overweight', 'BMI_Normal',
#  'Sleep Disorder Status']

In [None]:
# List the distinct 'Sleep Disorder' values before deriving the binary flag.
df['Sleep Disorder'].unique()

In [None]:
# Binary flag: 1 when the recorded disorder is 'Sleep Apnea' or 'Insomnia',
# 0 for every other value.
has_disorder = df['Sleep Disorder'].isin(['Sleep Apnea', 'Insomnia'])
df['Sleep Disorder Status'] = has_disorder.astype(int)

In [None]:
# List the distinct 'Occupation' values as a plain Python list.
df['Occupation'].unique().tolist()

## DATA VISUALIZATION

In [None]:
# Build the modelling/plotting frame: a separate DataFrame without the
# identifier column and the raw columns that were already re-encoded
# (or, for 'Occupation', are not used). df itself is left untouched.
non_feature_columns = [
    'Gender',
    'Person ID',
    'BMI Category',
    'Blood Pressure',
    'Sleep Disorder',
    'Occupation',
]
df1 = df.drop(columns=non_feature_columns)
df1.head()

In [None]:
# Show the (rows, columns) dimensions of the reduced frame.
df1.shape

In [None]:
# Count the missing values per column in the reduced frame.
df1.isnull().sum()

In [None]:
# Print a per-column summary (dtype and non-null count) of the reduced frame.
df1.info()

In [None]:
# Plotting libraries used by the visualization cells.
import seaborn as sns
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Grid of pairwise plots across the columns of df1.
sns.pairplot(df1)
plt.show()

In [None]:
# Pairwise correlation matrix of the df1 columns, drawn as an annotated heatmap.
# The colour scale is pinned to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(12, 10))
cor = df1.corr()
sns.heatmap(
    cor,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Quality of Sleep against Sleep Duration: scatter points plus a fitted
# regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='Sleep Duration',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs Sleep Duration')
ax.set_xlabel('Sleep Duration')
ax.set_ylabel('Quality of Sleep')
plt.show()


In [None]:
# Quality of Sleep against Stress Level: scatter points plus a fitted
# regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='Stress Level',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs Stress level')
ax.set_xlabel('Stress Level')
ax.set_ylabel('Quality of Sleep')
plt.show()


In [None]:
# Quality of Sleep against Physical Activity Level: scatter points plus a
# fitted regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='Physical Activity Level',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs Physical Activity Level')
ax.set_xlabel('Physical Activity Level')
ax.set_ylabel('Quality of Sleep')
plt.show()

In [None]:
# Quality of Sleep against systolic pressure: scatter points plus a fitted
# regression line. Uses the same styling as the other feature plots
# (semi-transparent points, red line), and the x-axis label now spells the
# column name correctly.
plt.figure(figsize=(8, 5))
sns.regplot(x='systolic_pressure', y='Quality of Sleep', data=df, scatter_kws={'alpha':0.6}, line_kws={'color':'red'})
plt.title('Quality of Sleep vs systolic_pressure')
plt.xlabel('systolic_pressure')
plt.ylabel('Quality of Sleep')
plt.show()

In [None]:
# Quality of Sleep against diastolic pressure: scatter points plus a fitted
# regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='diastolic_pressure',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs diastolic_pressure')
ax.set_xlabel('diastolic_pressure')
ax.set_ylabel('Quality of Sleep')
plt.show()

In [None]:
# Quality of Sleep against Daily Steps: scatter points plus a fitted
# regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='Daily Steps',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs Daily Steps')
ax.set_xlabel('Daily Steps')
ax.set_ylabel('Quality of Sleep')
plt.show()

In [None]:
# Quality of Sleep against the 0/1 BMI_overweight flag: scatter points plus a
# fitted regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='BMI_overweight',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs BMI_overweight')
ax.set_xlabel('BMI_overweight')
ax.set_ylabel('Quality of Sleep')
plt.show()

In [None]:
# Quality of Sleep against the 0/1 BMI_Normal flag: scatter points plus a
# fitted regression line (semi-transparent points, red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='BMI_Normal',
    y='Quality of Sleep',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Quality of Sleep vs BMI_Normal')
ax.set_xlabel('BMI_Normal')
ax.set_ylabel('Quality of Sleep')
plt.show()

In [None]:
# Annotated heatmap of the df1 correlation matrix, using seaborn's default
# colormap and colour range.
fig, ax = plt.subplots(figsize=(10, 8))
correlations = df1.corr()
sns.heatmap(correlations, annot=True, fmt=".2f", ax=ax)
plt.show()

## MODEL TRAINING AND MODEL SELECTION

In [None]:
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Target is 'Quality of Sleep'; every other df1 column is used as a feature.
target_column = 'Quality of Sleep'
Y = df1[target_column]
X = df1.drop(columns=[target_column])

# Show the feature names (and their order) the models will be trained on.
X.columns

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit ordinary linear regression on the training split, then report its
# score (R-squared) on the held-out test split.
linear_reg = LinearRegression().fit(X_train, Y_train)
linear_reg.score(X_test, Y_test)


In [None]:
# Predict 'Quality of Sleep' for one hand-written sample.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column labels (X.columns). This avoids scikit-learn's
# "X does not have valid feature names" warning and ties each value to a
# named feature instead of a bare position.
# Values must be listed in X.columns order — presumably Age, Sleep Duration,
# Physical Activity Level, Stress Level, Heart Rate, Daily Steps,
# systolic_pressure, diastolic_pressure, Female, Male, BMI_overweight,
# BMI_Normal, Sleep Disorder Status; confirm against the X.columns output.
sample = pd.DataFrame(
    [[53, 8, 30, 3, 65, 5000, 125, 80, 1, 0, 0, 1, 0]],
    columns=X.columns,
)
linear_reg.predict(sample)

In [None]:
# Fit Lasso regression (default hyper-parameters) on the training split and
# report its score (R-squared) on the held-out test split, for comparison
# with the linear model.
# NOTE(review): the features are not scaled before fitting, which affects how
# strongly the default penalty shrinks each coefficient — confirm this is
# intended.
lasso_reg = Lasso().fit(X_train, Y_train)
lasso_reg.score(X_test, Y_test)

In [None]:
import joblib

# Persist the fitted linear model to disk, then read it back into clf2.
# The path is relative to the working directory.
MODEL_PATH = "model.pkl"
joblib.dump(linear_reg, MODEL_PATH)
clf2 = joblib.load(MODEL_PATH)

In [None]:
# Show the first row of the test features (a one-row DataFrame slice).
X_test[0:1]

In [None]:
# The hand-written sample feature vector (same values as the one passed to
# linear_reg.predict), echoed for reference.
[53, 8, 30, 3, 65, 5000, 125, 80, 1, 0, 0,1,0]