# Imports

In [1]:
import numpy as np
import pandas as pd

# Limpeza dos dados

In [2]:
# Dataset disponível em: https://www.cdc.gov/brfss/annual_data/annual_2022.html
# 445.132 linhas
df_raw_2022 = pd.read_sas("../raw_data/brfss_2022.XPT")

# Target: ADDEPEV3: (Ever told) (you had) a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

In [3]:
# Selecionando colunas e fazendo o parse delas

df_selected = df_raw_2022[["ADDEPEV3", "_SEX", "_AGE80", "_RFHLTH", "_HLTHPLN", "MEDCOST1", "CHECKUP1", "_TOTINDA", 
             "SLEPTIM1", "_MICHD", "_LTASTH1", "MARITAL", "EDUCA", "RENTHOM1", "EMPLOY1", "CHILDREN", "_BMI5CAT",
             "DECIDE", "DIFFALON", "_SMOKER3", "ALCDAY4", "LSATISFY", "EMTSUPRT", "SDHISOLT", "SDHEMPLY",
             "SDHFOOD1", "SDHBILLS", "SDHUTILS", "SDHTRNSP", "SDHSTRE1", "_RACEGR4"]]

def parse_target(value):
    if value == 2.0:
        res = 0
    elif value == 1.0:
        res = 1
    else:
        res = np.nan
    return res

def parse_alcday(value):
    if value == 888.0:
        res = 0
    elif (value == 999.0 or value == 777.0):
        res = np.nan
    elif (value >= 201.0 and value <= 299.0):
        res = value - 200
    elif (value >= 101.0 and value <= 199.0):
        res = 4*(value-100)
    else:
        res = np.nan
        
    if res > 30:
        res = np.nan
    return res

df_selected.loc[:,"ADDEPEV3"] = df_selected.loc[:,"ADDEPEV3"].apply(parse_target)
df_selected.loc[:,"_RFHLTH"] = df_selected.loc[:,"_RFHLTH"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"_HLTHPLN"] = df_selected.loc[:,"_HLTHPLN"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"MEDCOST1"] = df_selected.loc[:,"MEDCOST1"].apply(lambda x: np.nan if x == 9.0 else x)
df_selected.loc[:,"CHECKUP1"] = df_selected.loc[:,"CHECKUP1"].apply(lambda x: np.nan if x == 9.0 else x)
df_selected.loc[:,"_TOTINDA"] = df_selected.loc[:,"_TOTINDA"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"MARITAL"] = df_selected.loc[:,"MARITAL"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"EDUCA"] = df_selected.loc[:,"EDUCA"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"RENTHOM1"] = df_selected.loc[:,"RENTHOM1"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"EMPLOY1"] = df_selected.loc[:,"EMPLOY1"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"CHILDREN"] = df_selected.loc[:,"CHILDREN"].apply(lambda x: np.nan if x == 99 else (0 if x == 88 else x))
df_selected.loc[:,"DECIDE"] = df_selected.loc[:,"DECIDE"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"DIFFALON"] = df_selected.loc[:,"DIFFALON"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"_SMOKER3"] = df_selected.loc[:,"_SMOKER3"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"_LTASTH1"] = df_selected.loc[:,"_LTASTH1"].apply(lambda x: np.nan if x == 9 else x)

# Colunas exclusivas ao dataset de 2022
df_selected.loc[:,"SLEPTIM1"] = df_selected.loc[:,"SLEPTIM1"].apply(lambda x: np.nan if (x == 99 or x == 77) else x)
df_selected.loc[:,"ALCDAY4"] = df_selected.loc[:,"ALCDAY4"].apply(parse_alcday)
df_selected.loc[:,"LSATISFY"] = df_selected.loc[:,"LSATISFY"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"EMTSUPRT"] = df_selected.loc[:,"EMTSUPRT"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHISOLT"] = df_selected.loc[:,"SDHISOLT"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHEMPLY"] = df_selected.loc[:,"SDHEMPLY"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHFOOD1"] = df_selected.loc[:,"SDHFOOD1"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHBILLS"] = df_selected.loc[:,"SDHBILLS"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHUTILS"] = df_selected.loc[:,"SDHUTILS"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHTRNSP"] = df_selected.loc[:,"SDHTRNSP"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"SDHSTRE1"] = df_selected.loc[:,"SDHSTRE1"].apply(lambda x: np.nan if x == 9 else x)
df_selected.loc[:,"_RACEGR4"] = df_selected.loc[:,"_RACEGR4"].apply(lambda x: np.nan if x == 9 else x)


# Dropando valores nulos
df_filtered = df_selected.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [4]:
df_filtered.to_csv("../cleaned_data/brfss_filtered_2022_full.csv")