# Data Masters: Case

## Bibliotecas

In [1]:
# --- Data Exploration --- #
import numpy as np
import pandas as pd

# --- sklearn pre processing tools --- #
from sklearn.preprocessing import StandardScaler, FunctionTransformer

# --- Classification model --- #
from sklearn.ensemble import GradientBoostingClassifier

# --- Clustering --- #
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- Model Evaluation --- #
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, make_scorer

# --- Pipeline Building --- #
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

# --- Custom Libs --- #
from resources.utils import ColumnDropper

## Pipeline de Classificação

In [2]:
# Read the column names handed to ColumnDropper in the pipeline: one name per
# line of the text file. The encoding is explicit so the read does not depend
# on the machine's locale, and blank / whitespace-only lines are skipped so a
# trailing empty line in the file never becomes an empty-string column name.
with open("resources/drop_cols.txt", encoding="utf-8") as f:
    drop_cols = [
        name
        for name in (line.rstrip("\n") for line in f)
        if name.strip()
    ]

In [3]:
# Classification pipeline: first the ColumnDropper step (configured with
# `drop_cols`), then a gradient-boosting classifier.
# The classifier is seeded so repeated runs of the notebook yield the same
# model; gradient boosting permutes features at every split, so an unseeded
# estimator can differ between runs even on identical data. The seed matches
# the one used for the train/test split.
main_pipeline = Pipeline(
    steps=[
        ("ColDrop", ColumnDropper(drop_cols)),
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)

In [4]:
# Hyper-parameter grid for the search. The "clf__" prefix routes each entry
# to the pipeline step named "clf"; 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    # number of boosting stages
    "clf__n_estimators": [80, 100, 200],
    # shrinkage applied to each stage's contribution
    "clf__learning_rate": [0.01, 0.1, 1],
    # maximum depth of the individual trees
    "clf__max_depth": [4, 6, 8],
}

In [5]:
# Exhaustive grid search over `param_grid`, wrapping the whole pipeline so the
# preprocessing step is re-fitted inside every cross-validation fold.
gscv = GridSearchCV(
    estimator=main_pipeline,
    param_grid=param_grid,
    scoring="roc_auc",      # candidates are ranked by ROC AUC
    error_score="raise",    # a failing fit aborts the search instead of being scored
    n_jobs=14,              # NOTE(review): looks machine-specific — confirm core count
)

## Fit

In [6]:
# Load the training data. `X` is the ID-indexed frame with every column
# (TARGET included) and `y` is its TARGET column.
df = pd.read_csv("data/train.csv")
X = df.set_index("ID")
y = X["TARGET"]

# Stratified 80/20 hold-out split with a fixed seed.
# TARGET is removed from the features *before* splitting so that both
# X_train and X_test are feature-only. Dropping it from X_train alone would
# leave the label inside X_test, which leaks the answer or breaks prediction
# with a column mismatch. The row assignment depends only on the number of
# rows, `y` and the seed, so the split itself is unaffected by the drop.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns="TARGET"),
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Guard against the label ever reaching the model as a feature.
assert "TARGET" not in X_train.columns
assert "TARGET" not in X_test.columns

In [7]:
# Run the grid search on the training split. `fit` returns the estimator
# itself, so rebinding `gscv` is unnecessary; the trailing semicolon keeps the
# notebook from echoing the estimator repr as cell output.
gscv.fit(X_train, y_train);

In [None]:
# Display the best pipeline selected by the grid search (only available once
# `gscv` has been fitted).
gscv.best_estimator_