In [67]:
# importando pandas para fazer a análise exploratória
import pandas as pd

In [68]:
# Load the sleep-health CSV into a DataFrame and preview its first 10 rows.
csv_path = 'Sleep_health_and_lifestyle_dataset.csv'
dataframe = pd.read_csv(csv_path)
dataframe.head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42.0,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60.0,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,,6,60.0,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30.0,8,Obese,140/90,85,3000,Insomnia
6,7,Male,29,Teacher,6.3,6,40.0,7,Obese,140/90,82,3500,Insomnia
7,8,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,70,8000,
8,9,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,70,8000,
9,10,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,70,8000,


In [69]:
# Number of rows in the raw dataset.
dataframe.shape[0]

382

In [70]:
# Summary statistics for the numeric columns. A "count" below the total row
# count marks a column that contains missing values.
dataframe.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,382.0,382.0,375.0,382.0,377.0,382.0,382.0,382.0
mean,189.109948,42.314136,7.1368,7.319372,59.230769,5.379581,70.157068,6825.91623
std,108.171287,8.680733,0.798113,1.200434,20.934734,1.782063,4.12806,1624.414637
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,96.25,36.0,6.4,6.0,45.0,4.0,68.0,5600.0
50%,189.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,283.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


In [71]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
dataframe.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level    float64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
dtype: object

In [72]:
# Convert the text columns below from "object" to "category".
categorical_columns = ["Gender", "Occupation", "BMI Category", "Sleep Disorder"]
for column_name in categorical_columns:
    dataframe[column_name] = dataframe[column_name].astype("category")

print(dataframe.dtypes)

Person ID                     int64
Gender                     category
Age                           int64
Occupation                 category
Sleep Duration              float64
Quality of Sleep              int64
Physical Activity Level     float64
Stress Level                  int64
BMI Category               category
Blood Pressure               object
Heart Rate                    int64
Daily Steps                   int64
Sleep Disorder             category
dtype: object


## Conclusões sobre a definição de tipos

Ao revisar os tipos de dados no DataFrame, observamos que os tipos estão bem ajustados para a maioria dos campos, mas havia oportunidades de otimização. Identificadores e métricas quantitativas como "Person ID", "Age" e "Quality of Sleep" estão corretamente como int64. "Sleep Duration" está como float64, adequado para valores que necessitam de precisão decimal. Já as colunas "Gender", "Occupation", "BMI Category" e "Sleep Disorder", originalmente classificadas como object, foram convertidas para o tipo categórico na célula acima; "Blood Pressure" permanece como object e também poderia ser convertida. A conversão para categórico não só economiza memória, o que é especialmente útil em grandes datasets, como também facilita a realização de análises estatísticas que dependem de agrupamento ou comparação de categorias.

# Tratamento de valores faltantes

In [73]:
# Build an all-numeric copy of the data to feed the imputation step.
predict_dataframe = dataframe.copy(deep=True)

# "Blood Pressure" is still an object column here, so make it categorical first.
predict_dataframe["Blood Pressure"] = predict_dataframe["Blood Pressure"].astype("category")

# Replace each categorical column with its integer category codes.
# Missing values receive the code -1 (see "Sleep Disorder" in the preview),
# so they are no longer reported as nulls after this step.
columns_to_encode = ["Gender", "Occupation", "BMI Category", "Sleep Disorder", "Blood Pressure"]
for column_name in columns_to_encode:
    predict_dataframe[column_name] = predict_dataframe[column_name].cat.codes

predict_dataframe.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,1,27,9,6.1,6,42.0,6,3,11,77,4200,-1
1,2,1,28,1,6.2,6,60.0,8,0,9,75,10000,-1
2,3,1,28,1,,6,60.0,8,0,9,75,10000,-1
3,4,1,28,6,5.9,4,30.0,8,2,22,85,3000,1
4,5,1,28,6,5.9,4,30.0,8,2,22,85,3000,1


In [77]:
# Count the missing values in each column before imputation.
missing_per_column = predict_dataframe.isna().sum()
print(missing_per_column)

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             7
Quality of Sleep           0
Physical Activity Level    5
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64


In [78]:
from fancyimpute import KNN

# Fill the remaining missing entries with a KNN imputer using k=3.
# NOTE(review): the imputer receives every column of predict_dataframe unscaled,
# including "Person ID" and the integer category codes; large-valued columns
# such as "Daily Steps" presumably weigh most in the neighbour search. Confirm
# this is intended.
knn_imputer = KNN(k=3)
Sleep_knn = knn_imputer.fit_transform(predict_dataframe)

Imputing row 1/382 with 0 missing, elapsed time: 0.018
Imputing row 101/382 with 0 missing, elapsed time: 0.018
Imputing row 201/382 with 0 missing, elapsed time: 0.019
Imputing row 301/382 with 0 missing, elapsed time: 0.019


In [79]:
# Wrap the imputed values back into a DataFrame.
# Column labels come from predict_dataframe (the frame that was imputed) rather
# than a hand-typed list, so labels and data cannot drift out of sync.
# Passing the object to the constructor without positional slicing also lets
# this cell be re-run after Sleep_knn has already become a DataFrame.
Sleep_knn = pd.DataFrame(data=Sleep_knn, columns=predict_dataframe.columns)

In [81]:
# Preview the first 10 imputed rows (all columns are floats after imputation).
Sleep_knn.head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1.0,1.0,27.0,9.0,6.1,6.0,42.0,6.0,3.0,11.0,77.0,4200.0,-1.0
1,2.0,1.0,28.0,1.0,6.2,6.0,60.0,8.0,0.0,9.0,75.0,10000.0,-1.0
2,3.0,1.0,28.0,1.0,6.199996,6.0,60.0,8.0,0.0,9.0,75.0,10000.0,-1.0
3,4.0,1.0,28.0,6.0,5.9,4.0,30.0,8.0,2.0,22.0,85.0,3000.0,1.0
4,5.0,1.0,28.0,6.0,5.9,4.0,30.0,8.0,2.0,22.0,85.0,3000.0,1.0
5,6.0,1.0,28.0,9.0,5.9,4.0,30.0,8.0,2.0,22.0,85.0,3000.0,0.0
6,7.0,1.0,29.0,10.0,6.3,6.0,40.0,7.0,2.0,22.0,82.0,3500.0,0.0
7,8.0,1.0,29.0,1.0,7.8,7.0,75.0,6.0,0.0,6.0,70.0,8000.0,-1.0
8,9.0,1.0,29.0,1.0,7.8,7.0,75.0,6.0,0.0,6.0,70.0,8000.0,-1.0
9,10.0,1.0,29.0,1.0,7.8,7.0,75.0,6.0,0.0,6.0,70.0,8000.0,-1.0


In [83]:
# Confirm that no missing values remain after imputation.
remaining_missing = Sleep_knn.isna().sum()
print(remaining_missing)

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64


# Lidando com entries/rows duplicadas

In [91]:
# Row count before removing duplicates.
Sleep_knn.shape[0]

382

In [92]:
# Largest Person ID present; a value below the row count means some IDs repeat.
person_ids = Sleep_knn['Person ID']
max_person_id = person_ids.max()
print(max_person_id)


374.0


In [93]:
# Show up to 10 rows that exactly repeat an earlier row, ordered by Person ID.
duplicate_mask = Sleep_knn.duplicated()
Sleep_knn.loc[duplicate_mask].sort_values(by="Person ID").head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
134,134.0,0.0,38.0,0.0,7.1,8.0,60.0,4.0,0.0,0.0,68.0,7000.0,-1.0
171,170.0,1.0,41.0,3.0,7.7,8.0,90.0,5.0,0.0,15.0,70.0,8000.0,-1.0
201,199.0,1.0,43.0,7.0,6.5,6.0,45.0,7.0,3.0,15.0,72.0,6000.0,0.0
290,287.0,0.0,50.0,5.0,6.0,6.0,90.0,8.0,3.0,23.0,75.0,10000.0,1.0
291,287.0,0.0,50.0,5.0,6.0,6.0,90.0,8.0,3.0,23.0,75.0,10000.0,1.0
337,332.0,0.0,53.0,2.0,8.4,9.0,30.0,3.0,0.0,9.0,65.0,5000.0,-1.0
338,332.0,0.0,53.0,2.0,8.4,9.0,30.0,3.0,0.0,9.0,65.0,5000.0,-1.0
381,374.0,0.0,59.0,5.0,8.1,9.0,75.0,3.0,3.0,23.0,68.0,7000.0,1.0


In [94]:
# Keep the first occurrence of each fully identical row and drop the repeats.
# The result is rebound explicitly instead of using inplace=True, so the cell
# reads as a plain transformation of Sleep_knn. The original index labels of
# the surviving rows are kept (no reset).
Sleep_knn = Sleep_knn.drop_duplicates(keep='first')

In [95]:
# Row count after removing duplicates.
Sleep_knn.shape[0]

374

# Normalização

In [96]:
# Work on an independent copy so that normalization never alters Sleep_knn.
Normalized_dataset = Sleep_knn.copy(deep=True)
# Preview the copy: a bare assignment displays nothing, yet a table is recorded
# under this cell, so the display expression is made explicit.
Normalized_dataset.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1.0,1.0,27.0,9.0,6.1,6.0,42.0,6.0,3.0,11.0,77.0,4200.0,-1.0
1,2.0,1.0,28.0,1.0,6.2,6.0,60.0,8.0,0.0,9.0,75.0,10000.0,-1.0
2,3.0,1.0,28.0,1.0,6.199996,6.0,60.0,8.0,0.0,9.0,75.0,10000.0,-1.0
3,4.0,1.0,28.0,6.0,5.9,4.0,30.0,8.0,2.0,22.0,85.0,3000.0,1.0
4,5.0,1.0,28.0,6.0,5.9,4.0,30.0,8.0,2.0,22.0,85.0,3000.0,1.0


### Escalas diferentes

Nesse caso, `Daily Steps` dominaria o cálculo, pois seus valores (de 3000 a 10000) estão em uma escala muito maior que a das demais colunas; por isso, precisamos normalizar a entrada.