Modelo 1: Random Forest + optimización de hiperparámetros

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Summarise one light curve as a flat dict of flux statistics.

    Args:
        obj: Mapping with a ``'norm_flux'`` sequence and an ``'exoplanet'``
            label. ``'norm_flux_err'`` is not used by this feature set and is
            therefore not required.

    Returns:
        dict with the keys ``mean_flux``, ``std_flux``, ``skew_flux``,
        ``kurt_flux``, ``iqr_flux`` and ``exoplanet`` (in that order).
    """
    flux = np.array(obj['norm_flux'])

    # Both quartiles come from a single percentile call instead of two passes.
    q1, q3 = np.percentile(flux, [25, 75])

    return {
        'mean_flux': np.mean(flux),
        'std_flux': np.std(flux),
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'iqr_flux': q3 - q1,
        'exoplanet': obj['exoplanet']
    }

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a random-forest pipeline, report hold-out metrics, save it and plot diagnostics.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels, used for stratification and F1 scoring.
        model_name: Label used in the printed report and in the saved file name.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` under
        the TFM_b Drive folder, and shows the importance and precision-recall
        figures. Returns nothing.
    """
    # Stratified 80/20 hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    search_space = {
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None],
    }

    # 30 random candidates, each scored by F1 over 5 stratified folds.
    search = RandomizedSearchCV(
        ImbPipeline(steps),
        param_distributions=search_space,
        n_iter=30,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    predicted_labels = best_model.predict(X_test)
    positive_scores = best_model.predict_proba(X_test)[:, 1]

    # Hold-out report.
    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(search.best_params_)
    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, predicted_labels))
    print("\nInforme de clasificación:")
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"\nROC AUC Score: {auc:.4f}")

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    forest = best_model.named_steps['classifier']
    if hasattr(forest, 'feature_importances_'):
        ranking = pd.DataFrame(
            {'feature': X.columns, 'importance': forest.feature_importances_}
        ).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=ranking)
        plt.title('Importancia de las características')
        plt.tight_layout()
        plt.show()

    # Precision-recall curve on the hold-out set.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(pr_recall, pr_precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Curva Precisión-Recall')
    plt.show()

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
train_evaluate_model(X, y, "modelo1")

Resultados para modelo1:

Mejores hiperparámetros:
{'classifier__n_estimators': 400, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 15, 'classifier__class_weight': 'balanced'}

Matriz de confusión:<br>
[[1971   64]<br>
 [  57   30]]<br>

            Informe de clasificación:
              precision    recall  f1-score   support<br>

           0       0.97      0.97      0.97      2035
           1       0.32      0.34      0.33        87

    accuracy                            0.94      2122
    macro avg       0.65      0.66      0.65      2122 
    weighted avg    0.95      0.94      0.94      2122


ROC AUC Score: 0.8138<br>
Modelo guardado como: exoplanet_model_tess_flux_only.joblib

---------------------

Modelo 2: Random Forest + optimización de hiperparámetros + manejo del desequilibrio de clases

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Summarise one light curve as a flat dict of flux statistics.

    Args:
        obj: Mapping with a ``'norm_flux'`` sequence and an ``'exoplanet'``
            label. ``'norm_flux_err'`` is not used by this feature set and is
            therefore not required.

    Returns:
        dict with the keys ``mean_flux``, ``std_flux``, ``skew_flux``,
        ``kurt_flux``, ``iqr_flux`` and ``exoplanet`` (in that order).
    """
    flux = np.array(obj['norm_flux'])

    # Both quartiles come from a single percentile call instead of two passes.
    q1, q3 = np.percentile(flux, [25, 75])

    return {
        'mean_flux': np.mean(flux),
        'std_flux': np.std(flux),
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'iqr_flux': q3 - q1,
        'exoplanet': obj['exoplanet']
    }

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a SMOTE + random-forest pipeline, report hold-out metrics, save it and plot diagnostics.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels, used for stratification and F1 scoring.
        model_name: Label used in the printed report and in the saved file name.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` under
        the TFM_b Drive folder, and shows the importance and precision-recall
        figures. Returns nothing.
    """
    # Stratified 80/20 hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    search_space = {
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None],
    }

    # 30 random candidates, each scored by F1 over 5 stratified folds.
    search = RandomizedSearchCV(
        ImbPipeline(steps),
        param_distributions=search_space,
        n_iter=30,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    predicted_labels = best_model.predict(X_test)
    positive_scores = best_model.predict_proba(X_test)[:, 1]

    # Hold-out report.
    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(search.best_params_)
    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, predicted_labels))
    print("\nInforme de clasificación:")
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"\nROC AUC Score: {auc:.4f}")

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    forest = best_model.named_steps['classifier']
    if hasattr(forest, 'feature_importances_'):
        ranking = pd.DataFrame(
            {'feature': X.columns, 'importance': forest.feature_importances_}
        ).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=ranking)
        plt.title('Importancia de las características')
        plt.tight_layout()
        plt.show()

    # Precision-recall curve on the hold-out set.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(pr_recall, pr_precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Curva Precisión-Recall')
    plt.show()

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
train_evaluate_model(X, y, "model2")

Resultados para model2:

Mejores hiperparámetros:
{'classifier__n_estimators': 300, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__class_weight': 'balanced'}

Matriz de confusión:<br>
[[1857  178]<br>
 [  45   42]]

    Informe de clasificación:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      2035
           1       0.19      0.48      0.27        87

    accuracy                           0.89      2122
    macro avg      0.58      0.70      0.61      2122
    weighted avg   0.94      0.89      0.92      2122


ROC AUC Score: 0.8284

_________________________________

Modelo 3: Random Forest + optimización de hiperparámetros + manejo del desequilibrio de clases + feature engineering

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Summarise one light curve as a flat dict of flux and flux-error statistics.

    Args:
        obj: Mapping with ``'norm_flux'`` and ``'norm_flux_err'`` sequences and
            an ``'exoplanet'`` label.

    Returns:
        dict of scalar features followed by the ``exoplanet`` label. Key order
        matches the original implementation, so the resulting DataFrame columns
        are unchanged.
    """
    flux = np.array(obj['norm_flux'])
    flux_err = np.array(obj['norm_flux_err'])

    # Statistics reused by several features are computed once.
    flux_mean = np.mean(flux)
    flux_median = np.median(flux)
    flux_min = np.min(flux)
    flux_max = np.max(flux)
    q1, q3 = np.percentile(flux, [25, 75])

    return {
        'mean_flux': flux_mean,
        'std_flux': np.std(flux),
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'iqr_flux': q3 - q1,
        'mean_flux_err': np.mean(flux_err),
        'std_flux_err': np.std(flux_err),
        'flux_amplitude': flux_max - flux_min,
        'flux_median': flux_median,
        'flux_max': flux_max,
        'flux_min': flux_min,
        'flux_range': np.ptp(flux),
        'flux_mad': np.median(np.abs(flux - flux_median)),
        'flux_energy': np.sum(flux**2),
        'flux_1st_quartile': q1,
        'flux_3rd_quartile': q3,
        'flux_above_mean': np.mean(flux > flux_mean),
        # An undefined ratio is reported as NaN, not inf: the downstream mean
        # imputer fills NaN, whereas scikit-learn rejects infinite inputs.
        'flux_max_to_min_ratio': flux_max / flux_min if flux_min != 0 else np.nan,
        'exoplanet': obj['exoplanet']
    }

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a SMOTE + random-forest pipeline, report hold-out metrics, save it and plot diagnostics.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels, used for stratification and F1 scoring.
        model_name: Label used in the printed report and in the saved file name.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` under
        the TFM_b Drive folder, and shows the importance and precision-recall
        figures. Returns nothing.
    """
    # Stratified 80/20 hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    search_space = {
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None],
    }

    # 30 random candidates, each scored by F1 over 5 stratified folds.
    search = RandomizedSearchCV(
        ImbPipeline(steps),
        param_distributions=search_space,
        n_iter=30,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    predicted_labels = best_model.predict(X_test)
    positive_scores = best_model.predict_proba(X_test)[:, 1]

    # Hold-out report.
    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(search.best_params_)
    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, predicted_labels))
    print("\nInforme de clasificación:")
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"\nROC AUC Score: {auc:.4f}")

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    forest = best_model.named_steps['classifier']
    if hasattr(forest, 'feature_importances_'):
        ranking = pd.DataFrame(
            {'feature': X.columns, 'importance': forest.feature_importances_}
        ).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=ranking)
        plt.title('Importancia de las características')
        plt.tight_layout()
        plt.show()

    # Precision-recall curve on the hold-out set.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(pr_recall, pr_precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Curva Precisión-Recall')
    plt.show()

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
train_evaluate_model(X, y, "model3")

Resultados para model3:

Mejores hiperparámetros:
{'classifier__n_estimators': 500, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__class_weight': None}

Matriz de confusión:<br>
[[1988   47]<br>
 [  20   67]]

    Informe de clasificación:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2035
           1       0.59      0.77      0.67        87

    accuracy                           0.97      2122
    macro avg      0.79      0.87      0.83      2122
    weighted avg   0.97      0.97      0.97      2122


ROC AUC Score: 0.9761

____________________________

Modelo 4: Random Forest + optimización de hiperparámetros + manejo del desequilibrio de clases + feature engineering avanzado

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
from scipy.fft import fft
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Summarise one light curve with statistical, peak, autocorrelation and FFT features.

    Args:
        obj: Mapping with ``'norm_flux'`` and ``'norm_flux_err'`` sequences and
            an ``'exoplanet'`` label.

    Returns:
        dict of scalar features followed by the ``exoplanet`` label. Key order
        matches the original implementation, so the resulting DataFrame columns
        are unchanged.
    """
    flux = np.array(obj['norm_flux'])
    flux_err = np.array(obj['norm_flux_err'])

    # Statistics reused by several features are computed once.
    n_points = len(flux)
    flux_mean = np.mean(flux)
    flux_std = np.std(flux)
    flux_median = np.median(flux)
    flux_min = np.min(flux)
    flux_max = np.max(flux)
    q1, q3 = np.percentile(flux, [25, 75])
    centered = flux - flux_mean

    features = {
        'mean_flux': flux_mean,
        'std_flux': flux_std,
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'iqr_flux': q3 - q1,
        'mean_flux_err': np.mean(flux_err),
        'std_flux_err': np.std(flux_err),
        'flux_amplitude': flux_max - flux_min,
        'flux_median': flux_median,
        'flux_max': flux_max,
        'flux_min': flux_min,
        'flux_range': np.ptp(flux),
        'flux_mad': np.median(np.abs(flux - flux_median)),
        'flux_energy': np.sum(flux**2),
        'flux_1st_quartile': q1,
        'flux_3rd_quartile': q3,
        'flux_above_mean': np.mean(flux > flux_mean),
        # An undefined ratio is reported as NaN, not inf: the downstream mean
        # imputer fills NaN, whereas scikit-learn rejects infinite inputs.
        'flux_max_to_min_ratio': flux_max / flux_min if flux_min != 0 else np.nan,
    }

    # Peaks at least one std above the mean, valleys at least one std below it.
    peaks, _ = find_peaks(flux, height=flux_mean + flux_std)
    valleys, _ = find_peaks(-flux, height=-(flux_mean - flux_std))
    peak_ratio = len(peaks) / n_points
    valley_ratio = len(valleys) / n_points
    features['num_peaks'] = len(peaks)
    features['num_valleys'] = len(valleys)
    # The original assigned this key twice and only the second value survived.
    # It is written once here, in its original position, so the column order
    # is unchanged. The epsilon avoids division by zero.
    features['peak_valley_ratio'] = peak_ratio / (valley_ratio + 1e-10)
    features['peak_ratio'] = peak_ratio
    features['valley_ratio'] = valley_ratio

    # Largest autocorrelation at a non-zero lag.
    autocorr = np.correlate(centered, centered, mode='full')
    autocorr = autocorr[len(autocorr)//2:]
    features['max_autocorr'] = np.max(autocorr[1:])

    # Mean of the lowest tenth of the samples, relative to the overall mean.
    sorted_flux = np.sort(flux)
    features['flux_drop'] = np.mean(sorted_flux[:len(sorted_flux)//10]) - flux_mean

    # FFT magnitude statistics, skipping the DC term at index 0.
    fft_vals = np.abs(fft(flux))
    features['fft_max'] = np.max(fft_vals[1:])
    features['fft_mean'] = np.mean(fft_vals[1:])
    features['fft_std'] = np.std(fft_vals[1:])

    # Fraction of consecutive samples where the mean-centred signal changes sign.
    zero_crossings = np.where(np.diff(np.sign(centered)))[0]
    features['zero_crossing_rate'] = len(zero_crossings) / n_points

    features['log_flux_energy'] = np.log1p(features['flux_energy'])

    features['exoplanet'] = obj['exoplanet']
    return features

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a SMOTE + random-forest pipeline, report hold-out metrics, save it and plot diagnostics.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels aligned with ``X``, used for stratification, F1
            scoring and as the hue of the distribution plots.
        model_name: Label used in the printed report and in the saved file name.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns sorted by
        decreasing importance. It is empty when the fitted classifier exposes
        no ``feature_importances_``.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` under
        the TFM_b Drive folder, and shows the diagnostic figures.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    param_dist = {
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None]
    }

    # 30 random candidates, each scored by F1 over 5 stratified folds.
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                       n_iter=30, cv=StratifiedKFold(n_splits=5),
                                       scoring='f1', n_jobs=-1, random_state=42)

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(random_search.best_params_)
    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("\nInforme de clasificación:")
    print(classification_report(y_test, y_pred))
    print(f"\nROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    # Bound up front so the final return never hits an unassigned name when
    # the classifier exposes no importances.
    feature_importance_df = pd.DataFrame(columns=['feature', 'importance'])

    classifier = best_model.named_steps['classifier']
    if hasattr(classifier, 'feature_importances_'):
        feature_importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': classifier.feature_importances_,
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=feature_importance_df)
        plt.title('Importancia de las características')
        plt.tight_layout()
        plt.show()

        # Class-conditional distributions of the five most important features.
        # The table is rebuilt from the function's own arguments instead of
        # reading the notebook-level `df`, so the function no longer depends
        # on hidden global state.
        labelled = X.assign(exoplanet=y)
        top_features = feature_importance_df['feature'].head(5).tolist()
        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(top_features, 1):
            plt.subplot(2, 3, i)
            sns.histplot(data=labelled, x=feature, hue='exoplanet', kde=True)
            plt.title(f'Distribución de {feature}')
        plt.tight_layout()
        plt.show()

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Curva Precisión-Recall')
    plt.show()

    return feature_importance_df

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
feature_importance_df = train_evaluate_model(X, y, "model4")

# Correlation heatmap of the feature columns (label excluded).
plt.figure(figsize=(12, 10))
correlation_matrix = df.drop('exoplanet', axis=1).corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Matriz de Correlación de Características')
plt.tight_layout()
plt.show()

Resultados para model4:

Mejores hiperparámetros:
{'classifier__n_estimators': 300, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__class_weight': 'balanced'}

Matriz de confusión:<br>
[[2009   26]<br>
 [  15   72]]

    Informe de clasificación:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2035
           1       0.73      0.83      0.78        87

    accuracy                           0.98      2122
    macro avg      0.86      0.91      0.88      2122
    weighted avg   0.98      0.98      0.98      2122


ROC AUC Score: 0.9718

Modelo 5: Random Forest + optimización de hiperparámetros + manejo del desequilibrio de clases + feature engineering avanzado + feature selection

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
from scipy.fft import fft
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Compute the reduced (selected) feature set for one light curve.

    Args:
        obj: Mapping with a ``'norm_flux'`` sequence and an ``'exoplanet'``
            label. ``'norm_flux_err'`` is not used by this feature set and is
            therefore not required.

    Returns:
        dict with the keys ``skew_flux``, ``kurt_flux``, ``flux_median``,
        ``flux_energy``, ``valley_ratio``, ``fft_max``, ``zero_crossing_rate``,
        ``log_flux_energy`` and ``exoplanet`` (in that order).
    """
    flux = np.array(obj['norm_flux'])

    # Statistics reused by several features are computed once.
    n_points = len(flux)
    flux_mean = np.mean(flux)
    flux_std = np.std(flux)

    features = {
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'flux_median': np.median(flux),
        'flux_energy': np.sum(flux**2)
    }

    # Valleys at least one std below the mean. The peak search of the
    # original was dropped because no retained feature used its result.
    valleys, _ = find_peaks(-flux, height=-(flux_mean - flux_std))
    features['valley_ratio'] = len(valleys) / n_points

    # Largest FFT magnitude, skipping the DC term at index 0.
    fft_vals = np.abs(fft(flux))
    features['fft_max'] = np.max(fft_vals[1:])

    # Fraction of consecutive samples where the mean-centred signal changes sign.
    zero_crossings = np.where(np.diff(np.sign(flux - flux_mean)))[0]
    features['zero_crossing_rate'] = len(zero_crossings) / n_points

    features['log_flux_energy'] = np.log1p(features['flux_energy'])

    features['exoplanet'] = obj['exoplanet']
    return features

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a SMOTE + random-forest pipeline, report hold-out metrics, save it and plot diagnostics.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels aligned with ``X``, used for stratification, F1
            scoring and as the hue of the distribution plots.
        model_name: Label used in the printed report and in the saved file name.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns sorted by
        decreasing importance. It is empty when the fitted classifier exposes
        no ``feature_importances_``.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` and
        ``feature_importance.png`` under the TFM_b Drive folder, and shows the
        diagnostic figures.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    param_dist = {
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None]
    }

    # 30 random candidates, each scored by F1 over 5 stratified folds.
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                       n_iter=30, cv=StratifiedKFold(n_splits=5),
                                       scoring='f1', n_jobs=-1, random_state=42)

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(random_search.best_params_)
    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("\nInforme de clasificación:")
    print(classification_report(y_test, y_pred))
    print(f"\nROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    # Bound up front so the final return never hits an unassigned name when
    # the classifier exposes no importances.
    feature_importance_df = pd.DataFrame(columns=['feature', 'importance'])

    classifier = best_model.named_steps['classifier']
    if hasattr(classifier, 'feature_importances_'):
        feature_importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': classifier.feature_importances_,
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=feature_importance_df)
        plt.title('Importancia de las características')
        plt.tight_layout()
        # NOTE(review): the file name does not include model_name, so every
        # call overwrites the figure saved by the previous one.
        plt.savefig('/content/drive/My Drive/TFM_b/feature_importance.png')
        plt.show()

        # Class-conditional distributions of the five most important features.
        # The table is rebuilt from the function's own arguments instead of
        # reading the notebook-level `df`, so the function no longer depends
        # on hidden global state.
        labelled = X.assign(exoplanet=y)
        top_features = feature_importance_df['feature'].head(5).tolist()
        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(top_features, 1):
            plt.subplot(2, 3, i)
            sns.histplot(data=labelled, x=feature, hue='exoplanet', kde=True)
            plt.title(f'Distribución de {feature}')
        plt.tight_layout()
        plt.show()

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Curva Precisión-Recall')
    plt.show()

    return feature_importance_df

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
feature_importance_df = train_evaluate_model(X, y, "model5")

Resultados para model5:

Mejores hiperparámetros:
{'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__class_weight': None}

Matriz de confusión:<br>
[[2018   17]<br>
 [  19   68]]

    Informe de clasificación:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2035
           1       0.80      0.78      0.79        87

    accuracy                           0.98      2122
    macro avg      0.90      0.89      0.89      2122
    weighted avg   0.98      0.98      0.98      2122


ROC AUC Score: 0.9663

----------------------------

Modelo 6: Random Forest + optimización de hiperparámetros + manejo del desequilibrio de clases + feature engineering avanzado + feature selection + regularización

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
from scipy.fft import fft
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from google.colab import drive

# Mount Google Drive (forcing a remount) so the files under TFM_b are reachable.
drive.mount('/content/drive', force_remount=True)

# Load the pickled training records.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('/content/drive/My Drive/TFM_b/tess_combined_training_data.pkl', 'rb') as f:
    combined_data = pickle.load(f)

def extract_features(obj):
    """Compute the reduced (selected) feature set for one light curve.

    Args:
        obj: Mapping with a ``'norm_flux'`` sequence and an ``'exoplanet'``
            label. ``'norm_flux_err'`` is not used by this feature set and is
            therefore not required.

    Returns:
        dict with the keys ``skew_flux``, ``kurt_flux``, ``flux_median``,
        ``flux_energy``, ``valley_ratio``, ``fft_max``, ``zero_crossing_rate``,
        ``log_flux_energy`` and ``exoplanet`` (in that order).
    """
    flux = np.array(obj['norm_flux'])

    # Statistics reused by several features are computed once.
    n_points = len(flux)
    flux_mean = np.mean(flux)
    flux_std = np.std(flux)

    features = {
        'skew_flux': stats.skew(flux),
        'kurt_flux': stats.kurtosis(flux),
        'flux_median': np.median(flux),
        'flux_energy': np.sum(flux**2)
    }

    # Valleys at least one std below the mean. The peak search of the
    # original was dropped because no retained feature used its result.
    valleys, _ = find_peaks(-flux, height=-(flux_mean - flux_std))
    features['valley_ratio'] = len(valleys) / n_points

    # Largest FFT magnitude, skipping the DC term at index 0.
    fft_vals = np.abs(fft(flux))
    features['fft_max'] = np.max(fft_vals[1:])

    # Fraction of consecutive samples where the mean-centred signal changes sign.
    zero_crossings = np.where(np.diff(np.sign(flux - flux_mean)))[0]
    features['zero_crossing_rate'] = len(zero_crossings) / n_points

    features['log_flux_energy'] = np.log1p(features['flux_energy'])

    features['exoplanet'] = obj['exoplanet']
    return features

# One row per record in combined_data; columns are the keys returned by extract_features.
df = pd.DataFrame([extract_features(obj) for obj in combined_data])

def train_evaluate_model(X, y, model_name):
    """Tune a SMOTE + random-forest pipeline and report train and hold-out metrics.

    The search covers the SMOTE sampling settings as well as the forest
    hyperparameters. Metrics are printed for both the training and the test
    split so the gap between them is visible.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance plot.
        y: Target labels aligned with ``X``, used for stratification, F1
            scoring and as the hue of the distribution plots.
        model_name: Label used in the printed report and in the saved file name.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns sorted by
        decreasing importance. It is empty when the fitted classifier exposes
        no ``feature_importances_``.

    Side effects:
        Prints the report, writes ``exoplanet_model_{model_name}.joblib`` and
        ``feature_importance.png`` under the TFM_b Drive folder, and shows the
        diagnostic figures.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    param_dist = {
        'smote__sampling_strategy': [0.1, 0.25, 0.5, 0.75, 1.0],
        'smote__k_neighbors': [3, 5, 7, 9],
        'classifier__n_estimators': [100, 200, 300, 400],
        'classifier__max_depth': [10, 15, 20, 25],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [2, 4, 6],
        'classifier__max_features': [0.3, 0.5, 0.7, 'sqrt', 'log2'],
        'classifier__class_weight': ['balanced', 'balanced_subsample', None]
    }

    # 50 random candidates, each scored by F1 over 5 stratified folds.
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                       n_iter=50, cv=StratifiedKFold(n_splits=5),
                                       scoring='f1', n_jobs=-1, random_state=42)

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Predictions on the training split.
    y_train_pred = best_model.predict(X_train)
    y_train_pred_proba = best_model.predict_proba(X_train)[:, 1]

    # Predictions on the hold-out split.
    y_test_pred = best_model.predict(X_test)
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(f"\nResultados para {model_name}:")
    print("\nMejores hiperparámetros:")
    print(random_search.best_params_)

    print("\nRendimiento en conjunto de entrenamiento:")
    print(classification_report(y_train, y_train_pred))
    print(f"ROC AUC Score (Entrenamiento): {roc_auc_score(y_train, y_train_pred_proba):.4f}")

    print("\nRendimiento en conjunto de prueba:")
    print(classification_report(y_test, y_test_pred))
    print(f"ROC AUC Score (Prueba): {roc_auc_score(y_test, y_test_pred_proba):.4f}")

    print("\nMatriz de confusión (Conjunto de prueba):")
    print(confusion_matrix(y_test, y_test_pred))

    # Persist the whole fitted pipeline, not just the forest.
    joblib.dump(best_model, f'/content/drive/My Drive/TFM_b/exoplanet_model_{model_name}.joblib')
    print(f"Modelo guardado como: exoplanet_model_{model_name}.joblib")

    # Bound up front so the final return never hits an unassigned name when
    # the classifier exposes no importances.
    feature_importance_df = pd.DataFrame(columns=['feature', 'importance'])

    classifier = best_model.named_steps['classifier']
    if hasattr(classifier, 'feature_importances_'):
        feature_importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': classifier.feature_importances_,
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=feature_importance_df)
        plt.title('Importancia de las características')
        plt.tight_layout()
        # NOTE(review): the file name does not include model_name, so every
        # call overwrites the figure saved by the previous one.
        plt.savefig('/content/drive/My Drive/TFM_b/feature_importance.png')
        plt.show()

        # Class-conditional distributions of the five most important features.
        # The table is rebuilt from the function's own arguments instead of
        # reading the notebook-level `df`, so the function no longer depends
        # on hidden global state.
        labelled = X.assign(exoplanet=y)
        top_features = feature_importance_df['feature'].head(5).tolist()
        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(top_features, 1):
            plt.subplot(2, 3, i)
            sns.histplot(data=labelled, x=feature, hue='exoplanet', kde=True)
            plt.title(f'Distribución de {feature}')
        plt.tight_layout()
        plt.show()

    return feature_importance_df

# Split the table into predictors and the 'exoplanet' target, then run the experiment.
X = df.drop('exoplanet', axis=1)
y = df['exoplanet']
feature_importance_df = train_evaluate_model(X, y, "model6")

Resultados para model6:

Mejores hiperparámetros:
{'smote__sampling_strategy': 0.1, 'smote__k_neighbors': 3, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 0.7, 'classifier__max_depth': 10, 'classifier__class_weight': None}

    Rendimiento en conjunto de entrenamiento:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      8138
           1       0.98      0.81      0.89       349

    accuracy                           0.99      8487
    macro avg      0.99      0.90      0.94      8487
    weighted avg   0.99      0.99      0.99      8487

ROC AUC Score (Entrenamiento): 0.9981

    Rendimiento en conjunto de prueba:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2035
           1       0.94      0.74      0.83        87

    accuracy                           0.99      2122
    macro avg      0.96      0.87      0.91      2122
    weighted avg   0.99      0.99      0.99      2122

ROC AUC Score (Prueba): 0.9461

Matriz de confusión (Conjunto de prueba):<br>
[[2031    4]<br>
 [  23   64]]