# Data Masters: Case

## Bibliotecas

In [133]:
# --- Data Exploration --- #
import numpy as np
import pandas as pd

# --- Classification model --- #
from sklearn.ensemble import GradientBoostingClassifier

# --- Model Evaluation --- #
from sklearn.metrics import auc, confusion_matrix, roc_auc_score

# --- Pipeline Building --- #
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# --- Cluster Analysis --- #
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- Preprocessing --- #
from sklearn.preprocessing import StandardScaler

# --- Custom Libs --- #
from resources.utils import ColumnDropper

## Pipeline de Classificação

In [2]:
# Names of the columns to discard, stored one per line in a text file.
with open("resources/drop_cols.txt") as cols_file:
    # Only the trailing newline is removed; any other whitespace is kept.
    drop_cols = list(map(lambda row: row.rstrip("\n"), cols_file))

In [3]:
# Two-step pipeline: a column-dropping transformer followed by the classifier.
# NOTE(review): ColumnDropper lives in resources.utils and is not visible here;
# presumably it removes the columns listed in `drop_cols` -- confirm.
main_pipeline = Pipeline(
    steps=[
        ("ColDrop", ColumnDropper(drop_cols)),
        # Seed the booster so that re-running the notebook reproduces the same
        # trees (the train/test split below is already seeded with 42).
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

In [4]:
# Hyper-parameter grid; the "clf__" prefix routes each entry to the "clf"
# step of the pipeline.
param_grid = dict(
    clf__n_estimators=[80, 100, 200],
    clf__learning_rate=[0.01, 0.1, 1],
    clf__max_depth=[4, 6, 8],
)

In [5]:
# Grid search over `param_grid`, ranking candidates by ROC AUC.
# error_score="raise" makes a failing fit raise instead of being scored.
gscv = GridSearchCV(
    estimator=main_pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    n_jobs=14,  # number of parallel workers
    error_score="raise",
)

## Fit

In [140]:
# Load the training file and index both features and target by "ID".
df = pd.read_csv("data/train.csv")
X = df.set_index("ID")
y = X["TARGET"].copy()

# Stratified 80/20 split with a fixed seed. TARGET is still a column of X at
# this point and is removed from both feature frames right after the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test = (part.drop(columns="TARGET") for part in (X_train, X_test))

In [7]:
# Run the grid search on the training split; the result is bound back to `gscv`.
gscv = gscv.fit(X=X_train, y=y_train)

In [114]:
# ROC AUC on the training split, i.e. the same data the search was fitted on,
# so this is not an out-of-sample score.
y_pred = gscv.predict_proba(X_train)[:, 1]
roc_auc_score(y_true=y_train, y_score=y_pred)

0.8682277997866147

## Selecionando o melhor corte

In [125]:
def predict_probability(X, model):
    """Return column 1 of ``model.predict_proba(X)``.

    Parameters
    ----------
    X : feature matrix accepted by ``model.predict_proba``.
    model : any object exposing ``predict_proba``.

    Returns
    -------
    One value per row of ``X``. For a binary classifier this is presumably
    the probability of TARGET == 1 -- confirm against ``model.classes_``.
    """
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, 1]

def make_result_df(y, y_pred):
    """Build a results frame from the true labels and predicted scores.

    ``y`` is turned into a DataFrame with ``reset_index`` (so its index
    becomes a regular column) and two columns are appended:

    * ``"prob"`` -- the scores in ``y_pred``;
    * ``"pred"`` -- initialised with the same scores; ``predict_class``
      later overwrites it with 0/1 labels.

    ``y`` itself is not modified.
    """
    return y.reset_index().assign(prob=y_pred, pred=y_pred)

def predict_class(y, c):
    """Binarise the ``"prob"`` column of ``y`` at threshold ``c``.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with a ``"prob"`` column. It is modified in place: the
        ``"pred"`` column is created or overwritten.
    c : float
        Cut-off; rows with ``prob >= c`` get 1, all others get 0.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for call chaining.
    """
    # Vectorised comparison instead of a per-row Python lambda. The explicit
    # cast keeps "pred" an integer column even when the frame is empty.
    y["pred"] = (y["prob"] >= c).astype("int64")
    return y

def evaluate(df, c, gain=100, cost=10):
    """Average profit per row when acting on predictions at threshold ``c``.

    Every row predicted as 1 incurs ``cost``; a row predicted as 1 whose
    ``TARGET`` is also 1 additionally earns ``gain``. Rows predicted as 0
    contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``"TARGET"`` and ``"prob"`` columns. Side effect: ``"pred"`` is
        overwritten in place with the 0/1 labels at threshold ``c``
        (via ``predict_class``).
    c : float
        Probability cut-off.
    gain, cost : numbers, optional
        Payoff parameters; the defaults reproduce the original hard-coded
        values (100 and 10).

    Returns
    -------
    Total profit divided by ``len(df)``.
    """
    labelled = predict_class(df, c)
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
    # both the truth and the predictions; without it the matrix can be 1x1
    # and the 4-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(
        df["TARGET"],
        labelled["pred"],
        labels=[0, 1],
    ).ravel()
    # True and false negatives have zero payoff, so only tp and fp appear.
    return (tp * (gain - cost) - fp * cost) / len(df)

In [126]:
def select_threshold(X, y, model=None):
    """Pick the probability cut-off that maximises ``evaluate`` on ``(X, y)``.

    Parameters
    ----------
    X : feature matrix passed to ``model.predict_proba``.
    y : pandas.Series
        True labels aligned with ``X``.
    model : optional
        Fitted estimator exposing ``predict_proba``. Defaults to the
        notebook-level ``gscv`` so existing two-argument calls keep working.

    Returns
    -------
    The threshold from 0.00, 0.01, ..., 1.00 with the highest average
    profit; on ties the smallest such threshold is returned.
    """
    if model is None:
        model = gscv

    scored = make_result_df(y, predict_probability(X, model))

    thresholds = [i / 100 for i in range(101)]
    thresh_scores = pd.DataFrame(
        {
            "Threshold": thresholds,
            "Profit": [evaluate(scored, cut) for cut in thresholds],
        }
    )

    # idxmax returns an index *label*, so look the row up with .loc rather
    # than the positional .iloc.
    best_label = thresh_scores["Profit"].idxmax()
    return thresh_scores.loc[best_label, "Threshold"]

In [127]:
# The cut-off is tuned on the training split.
c = select_threshold(X=X_train, y=y_train)

## Avaliação no conjunto de testes

In [131]:
# Score the held-out split with the threshold chosen above.
y_pred = predict_probability(X=X_test, model=gscv)
pred_df = make_result_df(y=y_test, y_pred=y_pred)
# evaluate() also overwrites pred_df["pred"] with the 0/1 labels at
# threshold c; the cluster section below reads that column.
profit = evaluate(df=pred_df, c=c)
print(f"Lucro esperado: R${profit:.2f} por cliente da base")

Lucro esperado: R$1.03 por cliente da base


## Clusters

In [158]:
# Attach the true label and the predicted class to the test features, both
# joined on "ID". pred_df["pred"] holds 0/1 labels here because evaluate()
# was already run on pred_df.
X_clust = (
    X_test
    .merge(y_test.reset_index(), how="left", on="ID")
    .merge(pred_df[["ID", "pred"]], how="left", on="ID")
)

In [159]:
def profit_individual(true, pred):
    """Profit for a single row given its true label and predicted class.

    Returns 0 when ``pred`` is not 1. When ``pred`` is 1 the result is
    -10, plus 100 (i.e. 90 in total) if ``true`` is also 1.
    """
    if pred != 1:
        return 0
    return 90 if true == 1 else -10

# np.vectorize is a convenience wrapper that calls the scalar helper once per
# row; the result is stored as a new "profit" column on X_clust.
X_clust["profit"] = np.vectorize(profit_individual)(
    X_clust["TARGET"],
    X_clust["pred"],
)

In [160]:
# Guarded so the cell can be re-run: once "ID" is the index it is no longer a
# column, and an unconditional set_index("ID") would raise KeyError.
if "ID" in X_clust.columns:
    X_clust = X_clust.set_index("ID")

In [161]:
# Display the test-set frame with the TARGET, pred and profit columns appended.
X_clust

Unnamed: 0_level_0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET,pred,profit
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125987,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0,0,0
54150,2,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,72380.010000,0,0,0
11883,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,121334.280000,0,0,0
9733,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,114830.010000,0,0,0
12745,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,82863.900000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145076,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,55667.400000,0,0,0
8976,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0,0,0
123578,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,72733.590000,0,0,0
119823,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,115950.060000,0,0,0
