In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
import io

In [None]:
# Load the raw sleep-health dataset from the Colab sample_data folder.
df = pd.read_csv(
    filepath_or_buffer='/content/sample_data/Sleep_health_and_lifestyle_dataset.csv'
)

# Limpeza e Tratamento Inicial

Substituímos os valores faltantes de Sleep Disorder por "Normal" para indicar que não há distúrbio.

Padronizamos a classe do BMI.

In [None]:
# Remove the row identifier when present; it carries no predictive signal.
df = df.drop(columns=['Person ID'], errors='ignore')

# Missing Sleep Disorder values become 'Normal', and the duplicate BMI label
# 'Normal Weight' is merged into 'Normal'.
df = df.assign(**{
    'Sleep Disorder': df['Sleep Disorder'].fillna('Normal'),
    'BMI Category': df['BMI Category'].replace('Normal Weight', 'Normal'),
})

# Feature Engineering: Pressão Arterial
Começamos dividindo a variável textual em dois valores numéricos: pressão sistólica e diastólica.

Depois, de acordo com a tabela de categorias de pressão que André apresentou, categorizamos entre Normal, Pré-hipertensão, Hipertensão Estágio 1, Hipertensão Estágio 2 e Crise de Hipertensão.

Apagamos a variável textual inicial e ficamos com essa nova categoria: `BP_Category`.

In [None]:
def process_bp_category(x):
    """Classify a 'systolic/diastolic' blood-pressure reading into a category.

    Parameters
    ----------
    x : str or missing value
        Reading formatted as '<systolic>/<diastolic>' with integer values,
        e.g. '125/80'.

    Returns
    -------
    str
        One of 'Normal', 'Prehypertension', 'Hypertension Stage 1',
        'Hypertension Stage 2' or 'Hypertensive Crisis'. Missing or
        unparseable input falls back to 'Normal'.
    """
    if pd.isna(x):
        return 'Normal'
    try:
        s, d = map(int, x.split('/'))
    except (AttributeError, TypeError, ValueError):
        # Best-effort fallback: non-string input or a malformed reading.
        return 'Normal'

    # Test the most severe band first. Because the bands are joined with
    # "or", checking mild bands first would let a mildly elevated systolic
    # value mask a severely elevated diastolic value (and vice versa); the
    # higher of the two components must decide the category.
    if s >= 180 or d >= 110:
        return 'Hypertensive Crisis'
    if s >= 160 or d >= 100:
        return 'Hypertension Stage 2'
    if s >= 140 or d >= 90:
        return 'Hypertension Stage 1'
    if s >= 120 or d >= 80:
        return 'Prehypertension'
    return 'Normal'

# Append the derived BP_Category column and discard the raw text reading
# in one chained expression.
df = (
    df.assign(BP_Category=df['Blood Pressure'].apply(process_bp_category))
      .drop(columns='Blood Pressure')
)

# Split

Definimos Quality of Sleep como nosso target.

Fazemos a divisão do dataset entre treino (70%), validação (15%) e teste (15%)

In [None]:
# Target column and feature matrix.
target_col = 'Quality of Sleep'
y = df[target_col]
X = df.drop(columns=target_col)

# First split: 70% for training, 30% held out (validation + test).
# Stratifying on y keeps the sleep-quality scores proportionally
# represented on both sides of the split.
X_train_raw, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Second split: divide the held-out 30% into two halves (about 15% each).
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Balanceamento (no treino)

In [None]:
# Occupation frequencies in the training split before oversampling.
print("Contagem ANTES do balanceamento:")
print(X_train_raw['Occupation'].value_counts())

# Re-attach the target so that whole rows (features + target) are resampled
# together.
train_full = pd.concat((X_train_raw, y_train), axis='columns')

# The sampler's "label" is Occupation: every occupation is oversampled up to
# the size of the most frequent one.
sampler = RandomOverSampler(random_state=42)
X_res, y_res = sampler.fit_resample(train_full, train_full['Occupation'])

# Split the resampled rows back into features and the real target.
X_train_balanced = X_res.drop(columns=target_col)
y_train_balanced = X_res[target_col]

print("\nContagem DEPOIS do balanceamento:")
print(X_train_balanced['Occupation'].value_counts())

Contagem ANTES do balanceamento:
Occupation
Doctor                  54
Nurse                   48
Engineer                43
Lawyer                  30
Accountant              29
Teacher                 27
Salesperson             21
Software Engineer        4
Scientist                3
Sales Representative     1
Manager                  1
Name: count, dtype: int64

Contagem DEPOIS do balanceamento:
Occupation
Doctor                  54
Accountant              54
Salesperson             54
Engineer                54
Lawyer                  54
Software Engineer       54
Teacher                 54
Nurse                   54
Scientist               54
Sales Representative    54
Manager                 54
Name: count, dtype: int64


# Encoding e Scaling
BMI Category

0.0 $\rightarrow$ Normal

1.0 $\rightarrow$ Overweight

2.0 $\rightarrow$ Obese

BP_Category (Pressão Arterial)

0.0 $\rightarrow$ Normal

1.0 $\rightarrow$ Prehypertension

2.0 $\rightarrow$ Hypertension Stage 1

3.0 $\rightarrow$ Hypertension Stage 2

4.0 $\rightarrow$ Hypertensive Crisis

In [None]:
# Column groups, one per preprocessing treatment.
num_cols = ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']
cat_ordinal_cols = ['BMI Category', 'BP_Category']
cat_nominal_cols = ['Gender', 'Occupation', 'Sleep Disorder']

# Explicit category order for the ordinal encoder (index == encoded value).
bmi_order = ['Normal', 'Overweight', 'Obese']
bp_order = ['Normal', 'Prehypertension', 'Hypertension Stage 1', 'Hypertension Stage 2', 'Hypertensive Crisis']

# Each transformer is fitted on the balanced training data only; categories
# outside the declared ordinal lists are encoded as -1, and unseen nominal
# categories are ignored by the one-hot encoder.
ord_enc = OrdinalEncoder(
    categories=[bmi_order, bp_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
).fit(X_train_balanced[cat_ordinal_cols])

ohe_enc = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
).fit(X_train_balanced[cat_nominal_cols])

scaler = StandardScaler().fit(X_train_balanced[num_cols])

def transform_data(X_data, y_data):
    """Apply the fitted scaler and encoders and append the target column.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Features containing the columns listed in ``num_cols``,
        ``cat_ordinal_cols`` and ``cat_nominal_cols``.
    y_data : pandas.Series
        Target values, positionally aligned with the rows of ``X_data``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns, ordinal-encoded columns and one-hot columns
        (in that order) followed by the ``target_col`` column. The result has
        a default index; the index of ``X_data`` is not carried over.
    """
    # Order of the blocks must match the order of the column names below.
    blocks = [
        scaler.transform(X_data[num_cols]),
        ord_enc.transform(X_data[cat_ordinal_cols]),
        ohe_enc.transform(X_data[cat_nominal_cols]),
    ]
    column_names = [
        *num_cols,
        *cat_ordinal_cols,
        *ohe_enc.get_feature_names_out(cat_nominal_cols),
    ]

    X_proc = pd.DataFrame(np.concatenate(blocks, axis=1), columns=column_names)
    # Positional assignment: .values drops the original index of y_data.
    X_proc[target_col] = y_data.values
    return X_proc

# Run every split through the same fitted preprocessing.
train_reg, val_reg, test_reg = (
    transform_data(features, labels)
    for features, labels in (
        (X_train_balanced, y_train_balanced),
        (X_val_raw, y_val),
        (X_test_raw, y_test),
    )
)

In [None]:
# Processed splits paired with their output file names.
split_exports = [
    (train_reg, 'train_Sleep_health_and_lifestyle_dataset.csv'),
    (val_reg, 'val_Sleep_health_and_lifestyle_dataset.csv'),
    (test_reg, 'test_Sleep_health_and_lifestyle_dataset.csv'),
]

# Write all CSVs first, then trigger the Colab browser downloads.
for frame, csv_name in split_exports:
    frame.to_csv(csv_name, index=False)

for _, csv_name in split_exports:
    files.download(csv_name)

print("Exemplo das colunas finais:")
print(train_reg.columns.tolist())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Exemplo das colunas finais:
['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'BMI Category', 'BP_Category', 'Gender_Female', 'Gender_Male', 'Occupation_Accountant', 'Occupation_Doctor', 'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Nurse', 'Occupation_Sales Representative', 'Occupation_Salesperson', 'Occupation_Scientist', 'Occupation_Software Engineer', 'Occupation_Teacher', 'Sleep Disorder_Insomnia', 'Sleep Disorder_Normal', 'Sleep Disorder_Sleep Apnea', 'Quality of Sleep']


# Treino

In [None]:
# Reload the exported training split as a sanity check.
# NOTE: this rebinds `df`, which previously held the raw dataset.
df = pd.read_csv('/content/train_Sleep_health_and_lifestyle_dataset.csv')

# Shape, column names and the sample header, printed in one call.
print(
    f"Dimensões: {df.shape}",
    f"\nColunas: {df.columns.tolist()}",
    "\nAmostra dos Dados",
    sep="\n",
)
display(df.head(5))

df.info()

Dimensões: (594, 25)

Colunas: ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'BMI Category', 'BP_Category', 'Gender_Female', 'Gender_Male', 'Occupation_Accountant', 'Occupation_Doctor', 'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Nurse', 'Occupation_Sales Representative', 'Occupation_Salesperson', 'Occupation_Scientist', 'Occupation_Software Engineer', 'Occupation_Teacher', 'Sleep Disorder_Insomnia', 'Sleep Disorder_Normal', 'Sleep Disorder_Sleep Apnea', 'Quality of Sleep']

Amostra dos Dados


Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,BMI Category,BP_Category,Gender_Female,Gender_Male,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,Sleep Disorder_Insomnia,Sleep Disorder_Normal,Sleep Disorder_Sleep Apnea,Quality of Sleep
0,2.030626,1.793977,2.074123,-1.669211,-1.314638,2.266822,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9
1,-0.168002,0.333675,0.407642,-1.05905,-0.812563,0.529936,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
2,0.320582,-0.462854,-0.425598,0.771432,-0.143128,-0.049027,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6
3,0.442728,1.262958,2.074123,-0.448889,-0.477845,1.108898,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8
4,-1.267316,1.262958,1.240883,0.161271,-0.477845,1.108898,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              594 non-null    float64
 1   Sleep Duration                   594 non-null    float64
 2   Physical Activity Level          594 non-null    float64
 3   Stress Level                     594 non-null    float64
 4   Heart Rate                       594 non-null    float64
 5   Daily Steps                      594 non-null    float64
 6   BMI Category                     594 non-null    float64
 7   BP_Category                      594 non-null    float64
 8   Gender_Female                    594 non-null    float64
 9   Gender_Male                      594 non-null    float64
 10  Occupation_Accountant            594 non-null    float64
 11  Occupation_Doctor                594 non-null    float64
 12  Occupation_Engineer   

# Teste

In [None]:
# Reload the exported test split as a sanity check.
# NOTE: this rebinds `df` to the test frame.
df = pd.read_csv('/content/test_Sleep_health_and_lifestyle_dataset.csv')

# Shape, column names and the sample header, printed in one call.
print(
    f"Dimensões: {df.shape}",
    f"\nColunas: {df.columns.tolist()}",
    "\nAmostra dos Dados",
    sep="\n",
)
display(df.head(5))

df.info()

Dimensões: (57, 25)

Colunas: ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'BMI Category', 'BP_Category', 'Gender_Female', 'Gender_Male', 'Occupation_Accountant', 'Occupation_Doctor', 'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Nurse', 'Occupation_Sales Representative', 'Occupation_Salesperson', 'Occupation_Scientist', 'Occupation_Software Engineer', 'Occupation_Teacher', 'Sleep Disorder_Insomnia', 'Sleep Disorder_Normal', 'Sleep Disorder_Sleep Apnea', 'Quality of Sleep']

Amostra dos Dados


Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,BMI Category,BP_Category,Gender_Female,Gender_Male,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,Sleep Disorder_Insomnia,Sleep Disorder_Normal,Sleep Disorder_Sleep Apnea,Quality of Sleep
0,0.442728,-0.462854,-0.425598,0.771432,-0.143128,-0.049027,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6
1,-0.778732,-1.126627,-1.258838,1.381593,-0.143128,-0.627989,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6
2,0.442728,-0.462854,-0.425598,0.771432,-0.143128,-0.049027,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6
3,1.29775,-1.126627,2.074123,1.381593,0.358948,2.266822,1.0,2.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
4,-0.045856,0.068165,-0.147851,0.161271,0.358948,-0.338508,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              57 non-null     float64
 1   Sleep Duration                   57 non-null     float64
 2   Physical Activity Level          57 non-null     float64
 3   Stress Level                     57 non-null     float64
 4   Heart Rate                       57 non-null     float64
 5   Daily Steps                      57 non-null     float64
 6   BMI Category                     57 non-null     float64
 7   BP_Category                      57 non-null     float64
 8   Gender_Female                    57 non-null     float64
 9   Gender_Male                      57 non-null     float64
 10  Occupation_Accountant            57 non-null     float64
 11  Occupation_Doctor                57 non-null     float64
 12  Occupation_Engineer     

# Validação

In [None]:
# Reload the exported validation split as a sanity check.
# NOTE: this rebinds `df` to the validation frame.
df = pd.read_csv('/content/val_Sleep_health_and_lifestyle_dataset.csv')

# Shape, column names and the sample header, printed in one call.
print(
    f"Dimensões: {df.shape}",
    f"\nColunas: {df.columns.tolist()}",
    "\nAmostra dos Dados",
    sep="\n",
)
display(df.head(5))

df.info()

Dimensões: (56, 25)

Colunas: ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'BMI Category', 'BP_Category', 'Gender_Female', 'Gender_Male', 'Occupation_Accountant', 'Occupation_Doctor', 'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Nurse', 'Occupation_Sales Representative', 'Occupation_Salesperson', 'Occupation_Scientist', 'Occupation_Software Engineer', 'Occupation_Teacher', 'Sleep Disorder_Insomnia', 'Sleep Disorder_Normal', 'Sleep Disorder_Sleep Apnea', 'Quality of Sleep']

Amostra dos Dados


Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,BMI Category,BP_Category,Gender_Female,Gender_Male,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,Sleep Disorder_Insomnia,Sleep Disorder_Normal,Sleep Disorder_Sleep Apnea,Quality of Sleep
0,1.175604,-0.993872,2.074123,1.381593,0.358948,2.266822,1.0,2.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
1,-0.168002,0.333675,0.407642,-0.448889,-0.812563,1.108898,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
2,-0.168002,0.599184,0.407642,-0.448889,-0.812563,1.108898,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
3,-0.412294,-0.330099,-0.981091,0.771432,0.191589,-0.743781,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,5
4,0.564874,-0.728363,-0.425598,0.771432,-0.143128,-0.049027,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              56 non-null     float64
 1   Sleep Duration                   56 non-null     float64
 2   Physical Activity Level          56 non-null     float64
 3   Stress Level                     56 non-null     float64
 4   Heart Rate                       56 non-null     float64
 5   Daily Steps                      56 non-null     float64
 6   BMI Category                     56 non-null     float64
 7   BP_Category                      56 non-null     float64
 8   Gender_Female                    56 non-null     float64
 9   Gender_Male                      56 non-null     float64
 10  Occupation_Accountant            56 non-null     float64
 11  Occupation_Doctor                56 non-null     float64
 12  Occupation_Engineer     

In [None]:
import joblib

# Persist the fitted preprocessing objects so the Streamlit app can reuse them.
for fitted_obj, out_name in (
    (scaler, 'scaler.joblib'),
    (ohe_enc, 'ohe_encoder.joblib'),
    (ord_enc, 'ord_encoder.joblib'),
):
    joblib.dump(fitted_obj, out_name)

print("Salvos")

Salvos
