## Módulo 11

### Data leakage
https://machinelearningmastery.com/data-leakage-machine-learning/

https://estatsite.com.br/2020/12/12/data-leakage-o-erro-que-ate-os-grandes-cometem/

https://www.forcepoint.com/pt-br/cyber-edu/data-leakage

In [None]:
# All preprocessing of the numerical and categorical variables is done here:
# feature engineering and outlier removal.
# Exercise note: this is the cell to inspect for data leakage.
def _days_to_years(days):
    """Convert a Series of day offsets to whole years (floor of days / -365.25).

    Each element is passed through ``np.int8``, exactly as the original cell
    did. NOTE(review): int8 only covers -128..127, so inputs whose magnitude
    maps outside that range are not represented faithfully.
    """
    return days.apply(lambda x: np.int8(x // -365.25))


def _impute_with_ols(df, target, predictors):
    """Fill the NaNs of ``df[target]`` in place with OLS predictions.

    The model ``target ~ predictors`` is fitted on the rows where ``target``
    is present and used to predict the rows where it is missing. Nothing is
    fitted when there is nothing to impute.

    Assumes ``smf`` is ``statsmodels.formula.api`` imported elsewhere in the
    notebook -- TODO confirm.
    """
    missing = df[target].isna()
    if not missing.any():
        return
    formula = f"{target} ~ {' + '.join(predictors)}"
    model = smf.ols(formula, data=df.loc[~missing]).fit()
    df.loc[missing, target] = model.predict(df.loc[missing, predictors]).values


def preprocessing(df):
    """Clean and engineer features, returning a new DataFrame.

    The input frame is not modified. Rows are removed by the outlier filters,
    several raw columns are replaced by derived ones, and missing values of
    ``AMT_ANNUITY`` / ``EXT_SOURCE_2`` are imputed by OLS.

    Data-leakage notes:
      * ``TARGET`` is NOT used as a predictor when imputing ``EXT_SOURCE_2``;
        using the label to build a feature leaks it into the model.
      * The outlier thresholds (mean/std, IQR) and the OLS coefficients are
        estimated from the frame passed in. NOTE(review): to avoid leaking
        validation/test statistics, call this on the training split only, or
        refactor so the fitted values can be reused on other splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced below (AMT_*, DAYS_*, NAME_*,
        OWN_CAR_AGE, FLAG_OWN_CAR, CNT_CHILDREN, EXT_SOURCE_2, ...).

    Returns
    -------
    pandas.DataFrame
        A filtered and transformed copy of ``df``.
    """
    # Work on a copy so the caller's frame never receives the new columns.
    df = df.copy()

    # >> NUMERICAL
    # ==================================================================
    # New feature: credit amount relative to income.
    df["PCT_CREDIT"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]

    # AMT_INCOME_TOTAL: keep rows strictly within mean +/- 3 standard deviations.
    income = df["AMT_INCOME_TOTAL"]
    income_mean = income.mean()
    cut_off = income.std() * 3
    lower, upper = income_mean - cut_off, income_mean + cut_off
    df = df[(income > lower) & (income < upper)]

    # AMT_GOODS_PRICE: drop values above 2,500,000 (rows with NaN fail the
    # comparison and are dropped as well). Copy so later column assignments
    # operate on an independent frame rather than on a slice.
    df = df[df["AMT_GOODS_PRICE"] <= 2_500_000].copy()

    # REGION_POPULATION_RELATIVE: outliers are kept on purpose.

    # DAYS_* -> YEARS_*.
    # Pensioners get YEARS_EMPLOYED = 0. Their DAYS_EMPLOYED is zeroed *before*
    # the conversion so any out-of-range value on those rows never reaches the
    # int8 cast. (The original compared against "Pensioneer", which matched
    # nothing.)
    is_pensioner = df["NAME_INCOME_TYPE"] == "Pensioner"
    df["YEARS_BIRTH"] = _days_to_years(df["DAYS_BIRTH"])
    df["YEARS_EMPLOYED"] = _days_to_years(df["DAYS_EMPLOYED"].mask(is_pensioner, 0))
    df["YEARS_REGISTRATION"] = _days_to_years(df["DAYS_REGISTRATION"])
    df["YEARS_ID_PUBLISH"] = _days_to_years(df["DAYS_ID_PUBLISH"])
    df = df.drop(columns=["DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_ID_PUBLISH", "DAYS_REGISTRATION"])

    # YEARS_REGISTRATION: drop box-plot outliers (outside Q1/Q3 -/+ 1.5 * IQR).
    q1 = df["YEARS_REGISTRATION"].quantile(0.25)
    q3 = df["YEARS_REGISTRATION"].quantile(0.75)
    iqr = q3 - q1
    in_range = df["YEARS_REGISTRATION"].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df = df.loc[in_range].copy()

    # OWN_CAR_AGE -> classes: 0 = no car (age missing); 1 = 0-5 years;
    # 2 = 6-10; 3 = 11-20; 4 = more than 20.
    # include_lowest=True keeps a brand-new car (age 0) in class 1 instead of
    # letting it fall outside the first bin and be labelled "no car".
    # NOTE: category 0 is appended last, so in the ordered categorical it
    # sorts after 4 (unchanged from the original).
    car_class = pd.cut(
        df["OWN_CAR_AGE"],
        bins=[0, 5, 10, 20, 999],
        labels=[1, 2, 3, 4],
        include_lowest=True,
    )
    df["OWN_CAR_TRANS"] = car_class.cat.add_categories([0]).fillna(0)
    df = df.drop(columns=["OWN_CAR_AGE", "FLAG_OWN_CAR"])

    # LIVE_CITY_NOT_WORK_CITY: dropped; reported as ~94% identical to
    # REG_CITY_NOT_WORK_CITY.
    df = df.drop(columns=["LIVE_CITY_NOT_WORK_CITY"])

    # DAYS_LAST_PHONE_CHANGE -> 1 if the value is in [-30, 0), else 0
    # (missing values compare False and therefore map to 0).
    phone_days = df["DAYS_LAST_PHONE_CHANGE"]
    df["LAST_MONTH_PHONE_CHANGE"] = ((phone_days >= -30) & (phone_days < 0)).astype("int64")
    df = df.drop(columns=["DAYS_LAST_PHONE_CHANGE"])

    # AMT_REQ_CREDIT_BUREAU_*: sum hour..quarter, then 3 classes:
    # 0 -> no requests, 1 -> 1-2 requests, 2 -> more than 2.
    # The edges must place 0 in its own bin; the previous [0, 1, 2, 99] edges
    # shifted every class by one and sent totals above 99 to NaN.
    bureau_cols = [
        "AMT_REQ_CREDIT_BUREAU_HOUR",
        "AMT_REQ_CREDIT_BUREAU_DAY",
        "AMT_REQ_CREDIT_BUREAU_WEEK",
        "AMT_REQ_CREDIT_BUREAU_MON",
        "AMT_REQ_CREDIT_BUREAU_QRT",
    ]
    # Plain "+" on purpose: a NaN in any component makes the total NaN.
    total_requests = (
        df["AMT_REQ_CREDIT_BUREAU_HOUR"]
        + df["AMT_REQ_CREDIT_BUREAU_DAY"]
        + df["AMT_REQ_CREDIT_BUREAU_WEEK"]
        + df["AMT_REQ_CREDIT_BUREAU_MON"]
        + df["AMT_REQ_CREDIT_BUREAU_QRT"]
    )
    request_class = pd.cut(total_requests, bins=[-np.inf, 0, 2, np.inf], labels=[0, 1, 2])
    # Unknown totals are treated as "no requests", as before.
    df["TOTAL_AMT_REQ_CREDIT_BUREAU"] = request_class.fillna(0)
    df = df.drop(columns=bureau_cols)

    # CNT_CHILDREN -> 4 classes: 0, 1, 2, 3 (= more than 2).
    df["CNT_CHILDREN_CAT"] = df["CNT_CHILDREN"].mask(df["CNT_CHILDREN"] > 2, 3)

    # AMT_ANNUITY: impute NaNs by OLS on its most correlated columns.
    _impute_with_ols(df, "AMT_ANNUITY", ["AMT_CREDIT", "AMT_GOODS_PRICE", "AMT_INCOME_TOTAL"])

    # EXT_SOURCE_2: impute NaNs by OLS. TARGET is deliberately excluded from
    # the predictors (label leakage).
    _impute_with_ols(
        df,
        "EXT_SOURCE_2",
        [
            "AMT_CREDIT",
            "AMT_ANNUITY",
            "AMT_GOODS_PRICE",
            "REGION_POPULATION_RELATIVE",
            "REGION_RATING_CLIENT_W_CITY",
        ],
    )

    # >> CATEGORICAL
    # ==================================================================
    # NAME_TYPE_SUITE: two classes; NaNs are filled with "Unaccompanied"
    # (described as the mode in the original notes).
    suite = df["NAME_TYPE_SUITE"].fillna("Unaccompanied")
    df["NAME_TYPE_SUITE"] = suite.where(suite == "Unaccompanied", "Accompanied")

    # NAME_INCOME_TYPE: everything outside the four main classes becomes 'Other'.
    main_income_types = ['Working', 'Commercial associate', 'Pensioner', 'State servant']
    income_type = df["NAME_INCOME_TYPE"]
    df["NAME_INCOME_TYPE"] = income_type.where(income_type.isin(main_income_types), 'Other')

    # NAME_EDUCATION_TYPE: with ("Academic") or without ("Other") higher education.
    is_academic = df["NAME_EDUCATION_TYPE"].isin(["Higher education", "Academic degree"])
    df["NAME_EDUCATION_TYPE"] = is_academic.map({True: "Academic", False: "Other"})

    # NAME_FAMILY_STATUS: "Married" vs "Not married".
    is_married = df["NAME_FAMILY_STATUS"].isin(['Married', 'Civil marriage'])
    df["NAME_FAMILY_STATUS"] = is_married.map({True: "Married", False: "Not married"})

    # NAME_HOUSING_TYPE: "Owner" vs "Not owner".
    is_owner = df["NAME_HOUSING_TYPE"].isin(["House / apartment"])
    df["NAME_HOUSING_TYPE"] = is_owner.map({True: "Owner", False: "Not owner"})

    return df


---