# Librerías y Funciones

## Librerías

In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

import optuna
from optuna.samplers import TPESampler

optuna.logging.set_verbosity(optuna.logging.WARNING)

## Funciones

# Optimización de Modelo Random Forest

In [2]:
# Transformación de datos desde un timestamp en segundos a un formato más legible
class TimestampTransformer(BaseEstimator, TransformerMixin):
    """Convert epoch-second columns to datetimes and add two day-span features.

    Parameters
    ----------
    timestamp_columns : list of str
        Columns converted with ``pd.to_datetime(..., unit='s')``.
    target_format : str, default='days'
        Stored on the instance; ``transform`` does not read it.
    """

    def __init__(self, timestamp_columns, target_format='days'):
        self.timestamp_columns = timestamp_columns
        self.target_format = target_format

    def fit(self, X, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with converted timestamps and day-span columns.

        Adds ``wallet_age_days`` and ``risky_tx_duration_days``, computed from
        the ``first``/``last`` and ``risky_first``/``risky_last`` transaction
        timestamp columns.

        Raises
        ------
        KeyError
            If any name in ``timestamp_columns`` is not a column of ``X``.
        """
        frame = X.copy()

        # Fail early, listing every configured column that is absent
        absent = [name for name in self.timestamp_columns if name not in frame.columns]
        if absent:
            raise KeyError(f"Missing required columns for timestamp transformation: {absent}")

        for name in self.timestamp_columns:
            frame[name] = pd.to_datetime(frame[name], unit='s')

        # Whole days between the first and last transaction of each kind
        frame['wallet_age_days'] = (
            frame['last_tx_timestamp'].sub(frame['first_tx_timestamp']).dt.days
        )
        frame['risky_tx_duration_days'] = (
            frame['risky_last_tx_timestamp'].sub(frame['risky_first_tx_timestamp']).dt.days
        )

        return frame

# Creación de nuevas variables a partir de algunas existentes
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add ratio and duration features derived from existing wallet columns.

    Columns read by ``transform``: ``borrow_amount_sum_eth``,
    ``repay_amount_sum_eth``, ``borrow_timestamp``, ``first_tx_timestamp``,
    ``last_tx_timestamp``, ``incoming_tx_count``, ``outgoing_tx_count``,
    ``total_balance_eth``, ``risky_tx_count`` and
    ``risky_sum_outgoing_amount_eth``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    @staticmethod
    def _epoch_seconds(column):
        """Return ``column`` as float seconds since the Unix epoch.

        Datetime columns are converted independently of their storage
        resolution (the previous ``astype('int64') / 1e9`` was only correct
        for nanosecond resolution). Missing timestamps become NaN.
        Non-datetime columns are assumed to already hold epoch seconds and
        are only cast to float.
        """
        if pd.api.types.is_datetime64_any_dtype(column):
            epoch = pd.Timestamp(0, tz=column.dt.tz)
            return (column - epoch) / pd.Timedelta(seconds=1)
        return column.astype('float64')

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns added."""
        X = X.copy()

        # Small constant added to denominators to avoid division by zero
        eps = 1e-9

        X['borrow_repay_ratio'] = X['borrow_amount_sum_eth'] / (X['repay_amount_sum_eth'] + eps)

        # Convert once and reuse for both time-based features
        first_tx_seconds = self._epoch_seconds(X['first_tx_timestamp'])
        last_tx_seconds = self._epoch_seconds(X['last_tx_timestamp'])

        # borrow_timestamp is used as-is, i.e. treated as epoch seconds
        X['time_since_first_borrow'] = X['borrow_timestamp'] - first_tx_seconds
        X['activity_duration'] = last_tx_seconds - first_tx_seconds

        X['tx_in_out_ratio'] = X['incoming_tx_count'] / (X['outgoing_tx_count'] + eps)
        X['borrow_balance_ratio'] = X['borrow_amount_sum_eth'] / (X['total_balance_eth'] + eps)
        X['weighted_risky_tx'] = (
            X['risky_tx_count'] * X['risky_sum_outgoing_amount_eth'] / (X['total_balance_eth'] + eps)
        )

        return X

# Eliminación de variables no deseadas
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns.

    Parameters
    ----------
    columns_to_drop : list of str
        Column labels passed to ``DataFrame.drop``.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(labels=unwanted, axis=1)

# Eliminación de registros con valores atípicos
class RemoveOutlierWallet(BaseEstimator, TransformerMixin):
    """Remove every row that belongs to one wallet address.

    Parameters
    ----------
    wallet_address : str
        Value of the ``wallet_address`` column whose rows are removed.
    """

    def __init__(self, wallet_address):
        self.wallet_address = wallet_address

    def fit(self, X, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, X, y=None):
        """Filter ``X`` and, when given, the matching rows of ``y``.

        Returns
        -------
        The filtered frame when ``y`` is None, otherwise the tuple
        ``(X_filtered, y_filtered)`` where ``y`` is selected by the index
        labels that remain in ``X``.
        """
        # Boolean indexing already returns a new object, so no prior copy is needed
        keep = X['wallet_address'] != self.wallet_address
        X_kept = X[keep]

        if y is None:
            return X_kept

        # One label-based lookup keeps y aligned with the surviving rows of X
        return X_kept, y.loc[X_kept.index]

# Escalado de valores numéricos
class ScaleNumericFeatures(BaseEstimator, TransformerMixin):
    """Apply a ``MinMaxScaler`` to the numeric columns of a DataFrame."""

    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        """Fit the scaler on the numeric columns of ``X`` and remember which they are."""
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        self.scaler.fit(X[numeric_cols])
        # Stored only after a successful fit; transform reuses the same columns
        self.numeric_columns = numeric_cols
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose remembered numeric columns are scaled."""
        scaled = X.copy()
        numeric_cols = self.numeric_columns
        scaled[numeric_cols] = self.scaler.transform(scaled[numeric_cols])
        return scaled

In [3]:
# Timestamp columns handed to TimestampTransformer (and dropped afterwards by the pipeline)
timestamp_columns = [
    'first_tx_timestamp', 'last_tx_timestamp',
    'risky_first_tx_timestamp', 'risky_last_tx_timestamp',
]

# Columns removed before modelling
columns_to_drop = [
    'max_eth_ever', 'min_eth_ever',
    'total_collateral_avg_eth',
    'borrow_amount_avg_eth', 'repay_amount_avg_eth',
    'avg_gas_paid_per_tx_eth',
    'risky_unique_contract_count', 'risky_first_last_tx_timestamp_diff',
    'incoming_tx_avg_eth', 'outgoing_tx_avg_eth',
    'incoming_tx_sum_eth', 'outgoing_tx_sum_eth',
    'withdraw_deposit_diff_if_positive_eth',
    'wallet_address',
    'borrow_block_number', 'borrow_timestamp',
    'max_risk_factor',
    'repay_count',
    'risk_factor_above_threshold_daily_count',
    'market_macd_macdfix', 'market_macdsignal_macdfix',
]

# Wallet whose rows are removed as an outlier
outlier_wallet_address = '0x586e32930ac05127de429bd566eaa2758fcbd9bc'

In [4]:
# Preprocessing pipeline: timestamps -> engineered features -> column pruning -> scaling
preprocessing_pipeline = Pipeline(steps=[
    ('timestamp_transformer', TimestampTransformer(timestamp_columns=timestamp_columns)),
    ('feature_engineering', FeatureEngineer()),
    # The raw timestamp columns are dropped together with the unwanted ones
    ('drop_columns', DropColumns(columns_to_drop=[*columns_to_drop, *timestamp_columns])),
    ('scale_numeric', ScaleNumericFeatures()),
])

In [5]:
# Load the competition data. The directory can be overridden through the
# COMPETITION_FILES_DIR environment variable; the default is the original location.
path = os.environ.get(
    'COMPETITION_FILES_DIR',
    'C:/Users/simon/OneDrive/Escritorio/Proyecto-Lab-MDS/competition_files',
)
X = pd.read_parquet(Path(path) / "X_t1.parquet").reset_index(drop=True)
y = pd.read_parquet(Path(path) / "y_t1.parquet").reset_index(drop=True)

# After the index reset, features and target are paired by position
assert len(X) == len(y), "X and y must have the same number of rows"

# Remove the outlier wallet so it does not bias the model
X, y = RemoveOutlierWallet(wallet_address=outlier_wallet_address).transform(X, y)
y = y.squeeze()

# Features and target side by side in a single frame
data = pd.concat([X, y], axis=1)

# Initial stratified 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [6]:
# Baseline Random Forest with default hyperparameters
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('clf', rf_model),
])

# Fit preprocessing and classifier on the training split
rf_pipeline.fit(X_train, y_train)

In [7]:
# Score the baseline model on the test split
y_pred = rf_pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.90     24794
           1       0.92      0.73      0.81     14985

    accuracy                           0.87     39779
   macro avg       0.89      0.85      0.86     39779
weighted avg       0.88      0.87      0.87     39779



In [8]:
def objective(trial):
    '''
    Optuna objective for tuning the RandomForestClassifier pipeline.

    The study that calls this function maximizes the returned F1-Score.

    The validation split is carved out of the training data only
    (``X_train`` / ``y_train``), so the rows in ``X_test`` never influence
    the hyperparameter search and the final test evaluation stays unbiased.
    The outlier wallet was already removed before ``X_train`` was created,
    so it is not filtered again here.

    Args
        trial: instance of the Trial class

    Returns
        f1_score: F1-Score of the fitted pipeline on the validation split
    '''
    # Random Forest hyperparameters to search
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None])
    }

    # Model for this trial
    rf_model = RandomForestClassifier(random_state=30, n_jobs=-1, **rf_params)

    # Fixed seed: every trial is scored on the same train/validation partition
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.3, stratify=y_train, random_state=42
    )

    # Pipeline.fit fits its steps in place, so each trial gets its own copy of
    # the preprocessing steps instead of refitting the shared instance that
    # other pipelines in the notebook also hold.
    pipe = Pipeline([
        ('preprocessing', clone(preprocessing_pipeline)),
        ('model', rf_model)
    ])

    # Fit and predict on the validation split
    pipe.fit(X_fit, y_fit)
    y_val_pred = pipe.predict(X_val)

    # NOTE: this keeps the fitted pipeline of EVERY trial in memory, not only the best one
    trial.set_user_attr('best_pipeline', pipe)

    return f1_score(y_val, y_val_pred)

In [9]:
time = 30 * 60  # Optuna search budget in seconds (30 minutes)

In [10]:
# Seeded TPE sampler; the search maximizes the objective and stops after `time` seconds
sampler = TPESampler(seed=30)
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, timeout=time, show_progress_bar=True)

   0%|          | 00:00/30:00

In [None]:
# Summary of the search: number of trials, best objective value and its hyperparameters
print(f"Número de trials: {len(study.trials)}")
print(f"Mejor F1-Score: {study.best_value}")
print("Mejores hiperparámetros encontrados:", study.best_params)

In [None]:
# Rebuild the forest with the hyperparameters of the best trial
best_params = study.best_params
best_rf_model = RandomForestClassifier(n_jobs=-1, random_state=42, **best_params)

# Same preprocessing as the baseline, followed by the tuned classifier
best_rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('clf', best_rf_model),
])

# Fit the tuned pipeline on the training split
best_rf_pipeline.fit(X_train, y_train)

In [None]:
# Score the tuned model on the test split
y_pred = best_rf_pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

# Interpretabilidad

# Re-Entrenamiento del Modelo

# Tracking con MLFlow

# Aplicación Web