In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Global plot look: ggplot theme, with a 12x8 inch default figure size.
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (12, 8)

In [None]:
# Load the sleep-efficiency dataset from a CSV given by relative path.
df = pd.read_csv(filepath_or_buffer="Sleep_Efficiency.csv")

In [None]:
# Preview the first 50 rows of the loaded frame.
df.head(50)

In [None]:
# Report the share of missing values in every column as a real percentage.
# isnull().mean() yields a 0-1 fraction, so it must be scaled by 100 before
# being printed with a "%" suffix (otherwise 4% missing would read "0.04%").
missing_pct = df.isnull().mean() * 100
for col, pct in missing_pct.items():
    print(f"{col} - {round(pct, 2)}%")

In [None]:
"""
Since only a small share of values is missing, we can drop the rows that contain missing values
without materially affecting our analysis.
"""
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Drop fully duplicated rows (if any), printing the row count on either side.
rows_before = len(df["ID"])
print(f"before: {rows_before}")
df = df.drop_duplicates()
rows_after = len(df["ID"])
print(f"after: {rows_after}")

In [None]:
# Show the dtype of every column in the frame.
df.dtypes

In [None]:
# Convert float columns to int64 where an integer type is more appropriate.
int_columns = (
    "Awakenings",
    "Caffeine consumption",
    "Alcohol consumption",
    "Exercise frequency",
)
for column in int_columns:
    df[column] = df[column].astype("int64")

In [None]:
# Display the frame as it stands after the cleaning and dtype-conversion cells.
df

In [None]:
# Do females sleep more efficiently than males?
# Average the sleep efficiency within each gender group.
gender = df["Sleep efficiency"].groupby(df["Gender"]).mean()
gender

In [None]:
# Turn the Series into a one-column DataFrame whose column is named "Efficiency".
gender = gender.rename("Efficiency").to_frame()
gender

In [None]:
# Move the "Gender" index into a regular column so it can serve as a plot axis.
gender = gender.reset_index(level="Gender")
gender

In [None]:
# Bar chart of the mean sleep efficiency per gender.
sns.barplot(data=gender, x="Gender", y="Efficiency", color="red")

In [None]:
# Sleep efficiency by age: mean per age, arranged from least to most efficient.
age = (
    df.groupby(["Age"])["Sleep efficiency"]
    .mean()
    .reset_index()
    .sort_values("Sleep efficiency")
)
bar_order = age.sort_values("Sleep efficiency")["Age"]

plt.xticks(rotation=90)  # turn the x tick labels sideways
sns.barplot(data=age, x="Age", y="Sleep efficiency", order=bar_order, color="red")


In [None]:
# Effect of smoking on sleep quality: mean sleep efficiency per smoking status.
smk = df["Sleep efficiency"].groupby(df["Smoking status"]).mean()
smk

In [None]:
# Reshape the grouped means into a two-column frame for plotting.
smk = smk.reset_index()
# Observation: smoking does seem to have an effect, but not the effect we expected!
sns.barplot(data=smk, x="Smoking status", y="Sleep efficiency", color="red")

In [None]:
# Does the amount of deep sleep have any effect on sleep efficiency?
deep_sleep_pct = df["Deep sleep percentage"]
sleep_efficiency = df["Sleep efficiency"]
plt.scatter(deep_sleep_pct, sleep_efficiency, color="red")
plt.show()

In [None]:
# Same relationship with a fitted regression line: red points, blue fit.
sns.regplot(
    data=df,
    x="Deep sleep percentage",
    y="Sleep efficiency",
    scatter_kws=dict(color="red"),
    line_kws=dict(color="blue"),
)

In [None]:
# Now let's look at the correlations to find the variable most associated with sleep quality.
# Keep only numeric/bool columns first: the frame appears to contain text columns
# (e.g. "Gender", "Smoking status"), and on pandas >= 2.0 DataFrame.corr() raises
# ValueError on those instead of silently dropping them as older versions did.
df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Use a heatmap to make the correlation table easier to read.
# Correlate numeric/bool columns only: the frame appears to contain text columns
# (e.g. "Gender", "Smoking status"), and on pandas >= 2.0 DataFrame.corr() raises
# ValueError on those instead of silently dropping them as older versions did.
correlation = df.select_dtypes(include=["number", "bool"]).corr()
plt.title("Correlation Metrics")
sns.heatmap(correlation, annot=True)

In [None]:
"""
As we can see, deep sleep percentage correlates strongly with sleep efficiency,
while age, gender and smoking don't have any measurable effect!
On the other hand, if we look at the heatmap, we can see that alcohol consumption has some effect
on the light sleep percentage.
"""