In [5]:
%%writefile data_transform.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# 1. Drops the specified columns
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a configured set of columns.

    Parameters
    ----------
    columns_to_drop : label or list of labels, optional
        Columns to remove. Labels that are not present in the frame are
        skipped silently. ``None`` (the default) removes nothing.
    """

    def __init__(self, columns_to_drop=None):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns.

        The input ``X`` is never modified.
        """
        if self.columns_to_drop is None:
            # DataFrame.drop raises ValueError when it is given no labels at
            # all, so the default configuration is handled as a no-op.
            return X.copy()
        # drop() already returns a new object, so no defensive copy is needed.
        return X.drop(columns=self.columns_to_drop, errors='ignore')


# 2. Bins Age and Fare into categorical features
class CategorizerFeatures(BaseEstimator, TransformerMixin):
    """Replace the numeric ``Age`` and ``Fare`` columns with categorical bands.

    ``transform`` appends ``faixa_etaria`` (age band) and ``Fare_categoria``
    (fare band) and removes the two source columns.
    """

    def fit(self, X, y=None):
        """Record the largest ``Fare`` seen in ``X`` and return ``self``.

        The value is kept in ``fare_max`` for backward compatibility only;
        the fare bins no longer depend on it.
        """
        self.fare_max = X['Fare'].max()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Age``/``Fare`` replaced by their bands.

        Missing or negative ages, and ages of 120 or more, are labelled
        'Não_Informado'. Missing or negative fares are left as NaN in
        ``Fare_categoria``.
        """
        X = X.copy()

        # Age bands are left-closed intervals: [0, 2), [2, 12), ... [120, inf).
        age_bins = [0, 2, 12, 18, 30, 60, 120, np.inf]
        age_labels = ['recém-nascido', 'criança', 'adolescente', 'jovem', 'adulto', 'idoso', 'Não_Informado']
        age_band = pd.cut(X['Age'], bins=age_bins, labels=age_labels, right=False)
        # Values that fall outside every interval (NaN, negatives) come back
        # as NaN from pd.cut; fold them into the existing catch-all category.
        X['faixa_etaria'] = age_band.fillna('Não_Informado')

        # The top fare band is open-ended. Using the training maximum as the
        # last edge broke in two ways: pd.cut raises when that maximum is not
        # above 31.00 (bins must increase), and any fare above the training
        # maximum seen at transform time silently became NaN.
        fare_bins = [0, 7.91, 14.45, 31.00, np.inf]
        fare_labels = ['muito_baixa', 'baixa', 'média', 'alta']
        X['Fare_categoria'] = pd.cut(X['Fare'], bins=fare_bins, labels=fare_labels, include_lowest=True)

        # Dropping both source columns at the end keeps the same final column
        # order: remaining columns, then faixa_etaria, then Fare_categoria.
        return X.drop(columns=['Age', 'Fare'])

# 3. Ordinal-encodes the given columns
class OrdinalEncoderTransformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected DataFrame columns in place of their raw values.

    Parameters
    ----------
    columns : label or list of labels, optional
        Columns to encode. A single string is treated as one column.
        ``None`` (the default) encodes nothing.
    """

    def __init__(self, columns=None):
        self.columns = columns
        # Built with library defaults; fitted in ``fit``.
        self.encoder = OrdinalEncoder()

    def _selected_columns(self):
        """Return the configured columns as a list (empty when unset)."""
        if self.columns is None:
            return []
        if isinstance(self.columns, str):
            # A bare string would select a 1-D Series, which the encoder
            # cannot fit; wrap it so a single column stays 2-D.
            return [self.columns]
        return list(self.columns)

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the selected columns of ``X``."""
        selected = self._selected_columns()
        if selected:
            self.encoder.fit(X[selected])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by codes."""
        X = X.copy()
        selected = self._selected_columns()
        if selected:
            X[selected] = self.encoder.transform(X[selected])
        return X

# 4. One-hot encodes Embarked, Pclass, faixa_etaria and Fare_categoria with OneHotEncoder
class OneHotEmbarkedAndPclassEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a fixed set of categorical columns.

    The columns ``Embarked``, ``Pclass``, ``faixa_etaria`` and
    ``Fare_categoria`` are replaced by dense indicator columns appended to
    the right of the remaining columns.
    """

    def __init__(self):
        self.columns = ['Embarked', 'Pclass', 'faixa_etaria', 'Fare_categoria']
        # Filled in by fit() with the generated indicator column names.
        self.feature_names = None
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    def fit(self, X, y=None):
        """Fit the encoder on the fixed columns and remember the output names."""
        categorical = X[self.columns]
        self.encoder.fit(categorical)
        self.feature_names = self.encoder.get_feature_names_out(self.columns)
        return self

    def transform(self, X):
        """Return a new frame with the fixed columns swapped for indicators."""
        # Reuse the input index so the indicator block lines up row by row
        # with the untouched columns when the two are concatenated.
        indicators = pd.DataFrame(
            self.encoder.transform(X[self.columns]),
            columns=self.feature_names,
            index=X.index,
        )
        untouched = X.drop(columns=self.columns)
        return pd.concat([untouched, indicators], axis=1)

class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply ``log1p`` to the non-negative values of selected columns.

    Parameters
    ----------
    columns : label or list of labels, optional
        Columns to transform. A single string is treated as one column
        (rather than being iterated character by character). ``None``
        (the default) transforms nothing.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def _selected_columns(self):
        """Return the configured columns as a list (empty when unset)."""
        if self.columns is None:
            return []
        if isinstance(self.columns, str):
            return [self.columns]
        return list(self.columns)

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``log1p`` applied where values are >= 0.

        Missing and negative values are left exactly as they are.
        """
        X = X.copy()
        for col in self._selected_columns():
            values = X[col]
            # NaN compares False here, so missing values are never replaced.
            non_negative = values >= 0
            # clip() only keeps log1p away from negative inputs (which would
            # emit warnings); those positions are discarded by the mask anyway.
            X[col] = values.mask(non_negative, np.log1p(values.clip(lower=0)))
        return X


Overwriting data_transform.py
