# TFM : Pruebas

- [url_ayuda_secuencias](https://med.stanford.edu/content/dam/sm/genetics/documents/gene211/schedule/Lecture4_Sequence_Comparison-2014.pdf)
- [url_ayuda_para_npx](https://olink.com/faq/what-is-npx/)
- [url_ayuda_proteins_existence](https://www.uniprot.org/help/protein_existence)

In [1]:
# To interact with the operative system.
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
os.chdir('..')

# For time and dates.
import time
import datetime

# For the use of warnings.
import warnings
warnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))

# To use DataFrames.
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', None)

# To treat mathematical objects.
import numpy as np 

# To get online information.
import requests

# To get random samples or numbers.
import random as rnd

# For the typing hints.
from typing import Dict, List

# To get information about the errors.
import traceback

# To the graphics.
import matplotlib.pyplot as plt
import seaborn as sns

# To train, preprocess and evaluate models.
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import (train_test_split, KFold, learning_curve)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (GradientBoostingRegressor, RandomForestRegressor)
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.exceptions import DataConversionWarning
from sklearn.compose import ColumnTransformer

# Para optimizar hiperparámetros.
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_contour

# Para guardar y cargar modelos.
import joblib

# Para guardar y cargar parámetros.
import json

# Personal imports.
from utils import DataLoader, DataFrameOptimizer, reduce_mem_usage
from bioinfo import UnitProtInfo
from fe import FeatureEngineeringNew, full_FeatureEngineeringNew
from eda import EdaNew, full_EdaNew
from modelling import DataPreparationToModelNew, Metricas

%load_ext autoreload
%autoreload 2

from IPython.display import clear_output


clear_output()

## Pruebas

In [2]:
def sample_dataframes(dataframes, seed=42, sample_size=5):
    """
    Función para muestrear de forma coherente de múltiples DataFrames basados en una columna común.

    Parámetros:
    - dataframes (dict): Un diccionario de DataFrames.
    - seed (int): Semilla para la generación de números aleatorios.
    - sample_size (int): Número de muestras a seleccionar.

    Retorna:
    - dict: Un diccionario de DataFrames filtrados.
    """
    # Establecer la semilla aleatoria para reproducibilidad
    rnd.seed(seed)

    # Determinar los patient_id únicos
    unique_patient_ids = set()
    for df in dataframes.values():
        unique_patient_ids = unique_patient_ids.union(set(df['visit_id']))

    # Convertir a lista y muestrear
    unique_patient_ids = list(unique_patient_ids)
    sampled_patient_ids = rnd.sample(unique_patient_ids, sample_size)

    # Filtrar cada DataFrame y retornar un nuevo diccionario
    filtered_dfs = {}
    for key, df in dataframes.items():
        filtered_dfs[key] = df[df['visit_id'].isin(sampled_patient_ids)]

    return filtered_dfs

# Uso de la función
data_loader = DataLoader()
dict_of_dfs = {}
dict_of_dfs['proteins'], dict_of_dfs['peptides'], dict_of_dfs['clinical'], _ = data_loader.load_train_data()

sampled_dfs = sample_dataframes(dict_of_dfs)

# Mostrar los resultados
print(sorted(sampled_dfs['proteins'].patient_id.unique()))
print(sorted(sampled_dfs['clinical'].patient_id.unique()))

[]
[3636, 5742, 7117, 26210, 41883]


In [3]:
data_loader = DataLoader()
dict_of_dfs = {}
dict_of_dfs['proteins'], dict_of_dfs['peptides'], dict_of_dfs['clinical'], _ = data_loader.load_train_data()


In [9]:
# Ruta del archivo CSV (reemplázala con tu ruta de archivo real)
csv_file_path = 'tu_ruta_aqui.csv'

# Leer el archivo CSV desde la ruta especificada
df_from_file = pd.read_csv(csv_file_path)

# Convertir a formato LaTeX
latex_table_from_file = df_grouped_from_file.to_latex(header=True, index=True)

print(latex_table_from_file)

'\\begin{tabular}{llrrrl}\n\\toprule\n & Modelo & SMAPE & RMSE & MAE & Dataset \\\\\n\\midrule\n0 & updrs_1 & 50.050002 & 5.517838 & 30.446541 & 24 Meses \\\\\n1 & updrs_2 & 49.165772 & 5.274010 & 27.815184 & 24 Meses \\\\\n2 & updrs_3 & 41.722576 & 12.575038 & 158.131573 & 24 Meses \\\\\n3 & updrs_4 & 143.625983 & 3.488770 & 12.171516 & 24 Meses \\\\\n4 & Media & 71.141083 & 6.713914 & 57.141204 & 24 Meses \\\\\n5 & updrs_1 & 45.203362 & 6.550131 & 42.904215 & 84 Meses \\\\\n6 & updrs_2 & 47.470505 & 7.754802 & 60.136950 & 84 Meses \\\\\n7 & updrs_3 & 49.618239 & 18.549762 & 344.093659 & 84 Meses \\\\\n8 & updrs_4 & 88.098736 & 2.965481 & 8.794080 & 84 Meses \\\\\n9 & Media & 57.597711 & 8.955044 & 113.982226 & 84 Meses \\\\\n\\bottomrule\n\\end{tabular}\n'

In [5]:
dict_of_dfs['proteins'].head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [6]:
list(dict_of_dfs['clinical'].visit_month.unique())

[0, 3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60, 72, 84, 96, 108]

In [7]:
dict_of_dfs['peptides'].head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [5]:
start_time = time.monotonic()

data_loader = DataLoader()
dict_of_dfs : Dict[str, pd.DataFrame] = {}

dict_of_dfs['proteins'], dict_of_dfs['peptides'], dict_of_dfs['clinical'], _ = data_loader.load_train_data()

df_after_eda = full_EdaNew(dict_of_dfs)
df_after_fe = full_FeatureEngineeringNew(df_after_eda)

if_sample = False

if if_sample: # For debugging purposes.
    df_after_fe = df_after_fe.sample(n=100)

train_results = [df_after_fe for i in range(1, 5)] # .drop(columns=[f'updrs_{j}' for j in range(1, 5) if j != i])

processed_data = []

def split_data(df : pd.DataFrame, target_column : List[str])-> tuple:

    test_size = 0.2
    val_size = 0.2 / (1 - test_size)  # To maintain the proportion.

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test

def normalize_and_split_data(df, target_column):

        X_train, X_val, X_test, y_train, y_val, y_test = split_data(df, target_column)
        
        # Identificar las columnas que no deben ser escaladas
        strings_a_excluir = {'mean', 'median', 'min', 'max', 'std', 'var'}
        columnas_sin_cambios = [col for col in df.columns if len(col.split('_')) > 2 if col.split('_')[2] in strings_a_excluir]
        columnas_para_escalar = [col for col in df.columns if col not in columnas_sin_cambios and col not in target_column]

        preprocesador = ColumnTransformer(
            transformers=[
                ('escalar', StandardScaler(), columnas_para_escalar),
                ('sin_cambio', 'passthrough', columnas_sin_cambios)
            ]
        )

        # Aplicar el preprocesador a los conjuntos de entrenamiento, validación y prueba
        X_train_norm = pd.DataFrame(preprocesador.fit_transform(X_train), columns=columnas_para_escalar + columnas_sin_cambios)
        X_val_norm = pd.DataFrame(preprocesador.transform(X_val), columns=columnas_para_escalar + columnas_sin_cambios)
        X_test_norm = pd.DataFrame(preprocesador.transform(X_test), columns=columnas_para_escalar + columnas_sin_cambios)
        
        return X_train_norm, X_val_norm, X_test_norm, y_train, y_val, y_test


for df, target_column in zip(train_results, [f'updrs_{i+1}' for i in range(len(train_results))]):
    processed_data.append(normalize_and_split_data(df, target_column))

end_time = time.monotonic()

print(f"\nTime of Execution: {end_time - start_time}\n")


Time of Execution: 1.1089999999385327



In [26]:
class EdaNew:

    @staticmethod
    def filter_clinical_by_month(clinical_df : pd.DataFrame, month_to_filter : int = 24, updrs_cols : List[str] = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']) -> pd.DataFrame:
        """
        Filter the clinical data by month and select the columns of interest.
        """

        clinical_df_filtered_by_month = clinical_df[clinical_df.visit_month == month_to_filter]

        return clinical_df_filtered_by_month
    
    @staticmethod
    def filter_proteins_by_month(proteins_df: pd.DataFrame, month_to_filter : int = 24) -> pd.DataFrame:
        """
        Filter the proteins data by month and select the columns of interest.
        """

        proteins_df_filtered_by_month = proteins_df[proteins_df.visit_month == month_to_filter]

        return proteins_df_filtered_by_month
    
    def filter_peptides_by_month(peptides_df: pd.DataFrame, month_to_filter : int = 24) -> pd.DataFrame:
        """
        Filter the peptides data by month and select the columns of interest.
        """

        peptides_df_filtered_by_month = peptides_df[peptides_df.visit_month == month_to_filter]

        return peptides_df_filtered_by_month

    @staticmethod
    def analyze_and_visualize_duplicates(data, title :str = 'DataFrame', verbose : bool = False): # TODO analizar y visualizar duplicados.
        """
        Función para analizar y opcionalmente visualizar y eliminar datos duplicados en un DataFrame basándose en columnas específicas.

        Args:
        data (pd.DataFrame): DataFrame a analizar.
        title (str): Título para la visualización.
        index_cols (list): Lista de columnas para identificar duplicados.
        verbose (bool): Si es True, imprime información y muestra gráficos.

        Returns:
        pd.DataFrame: DataFrame con duplicados eliminados.
        """

        duplicate_rows = data.duplicated(keep=False)
        num_duplicate_rows = duplicate_rows.sum()
        proportion_duplicates = num_duplicate_rows / len(data) * 100

        # Eliminar duplicados
        data_filtered = data.drop_duplicates()

        if verbose:
            # Gráfico
            plt.figure(figsize=(10, 4))
            sns.countplot(x=duplicate_rows)
            plt.title(f'Duplicate Counts in {title}')
            plt.ylabel('Count')
            plt.xlabel('Is Duplicate')

            # Mostrar información de duplicados
            print(f'{title} - Proportion of Duplicates: {proportion_duplicates:.2f}%')
            print(f'{title} - Number of Duplicate Rows: {num_duplicate_rows}')
            num_rows_removed = len(data) - len(data_filtered)
            print(f'{title} - Number of Rows Removed: {num_rows_removed}')

        return data_filtered

    @staticmethod
    def calculate_and_remove_null_values(df, groupby_column='Proteins', verbose=False): #TODO calcular y eliminar valores nulos.
        """
        Función para calcular y opcionalmente visualizar y eliminar filas con valores nulos en un DataFrame.

        Args:
        df (pd.DataFrame): DataFrame a analizar.
        groupby_column (str): Nombre de la columna para agrupar los resultados.
        verbose (bool): Si es True, imprime información y muestra gráficos.

        Returns:
        pd.DataFrame: DataFrame con filas nulas eliminadas.
        """
        # Crear una copia del DataFrame para no modificar el original
        temp_df = df.copy()

        # Calcular la cantidad de valores nulos en cada fila
        temp_df["null_count"] = temp_df.isnull().sum(axis=1)

        # Filtrar las filas que tienen al menos un valor nulo
        df_with_nulls = temp_df[temp_df["null_count"] > 0]

        # Información sobre los valores nulos
        num_rows_with_nulls = len(df_with_nulls)
        total_rows = len(df)
        proportion_nulls = num_rows_with_nulls / total_rows * 100

        # Eliminar filas con valores nulos
        data_filtered = temp_df[temp_df["null_count"] == 0]

        if verbose:
            # Gráfico
            plt.figure(figsize=(10, 4))
            sns.histplot(temp_df['null_count'], bins=range(1, temp_df['null_count'].max() + 1), kde=False)
            plt.title(f'Null Value Counts in {groupby_column}')
            plt.ylabel('Count')
            plt.xlabel('Number of Null Values')

            # Mostrar información de valores nulos
            print(f'{groupby_column} - Proportion of Rows with Nulls: {proportion_nulls:.2f}%')
            print(f'{groupby_column} - Number of Rows with Nulls: {num_rows_with_nulls}')
            num_rows_removed = total_rows - len(data_filtered)
            print(f'{groupby_column} - Number of Rows Removed: {num_rows_removed}')

        return data_filtered
    
    @staticmethod #TODO
    def remove_outliers_iqr(df, columns = ['NPX', 'PeptideAbundance'], iqr_factor=1.5):

        """Elimina los outliers basados en el Rango Intercuartílico (IQR)."""

        for column in columns:
            Q1 = df[column].quantile(0.25)
            Q3 = df[column].quantile(0.75)
            IQR = Q3 - Q1

            rango_inferior = Q1 - iqr_factor * IQR
            rango_superior = Q3 + iqr_factor * IQR

            df = df[(df[column] >= rango_inferior) & (df[column] <= rango_superior)]
        return df
    
    @staticmethod #TODO
    def add_log_columns(df, columns=['NPX', 'PeptideAbundance']):

        """Añade columnas logarítmicas para las columnas especificadas."""

        for column in columns:
            df[f'{column}_log'] = np.log(df[column])
        return df

    @staticmethod #TODO
    def remove_outliers_std(df, columns=['NPX', 'PeptideAbundance'], std_factor=3):
        """
        Elimina los outliers basados en la desviación estándar.
        """
            
        for column in columns:
            mean = df[column].mean()
            std = df[column].std()

            df = df[(df[column] >= mean - std_factor * std) & (df[column] <= mean + std_factor * std)]
        return df
    
    @staticmethod #TODO
    def drop_upd23b_clinical_state_on_medication(df):
        df_transformed = df.drop(['upd23b_clinical_state_on_medication'], axis=1)
        return df_transformed

    @staticmethod #TODO
    def drop_group_key(df):
        df_transformed = df.drop(['group_key'], axis=1)
        return df_transformed
    
    @staticmethod #TODO
    def drop_null_count(df):
        df_transformed = df.drop(['null_count'], axis=1)
        return df_transformed
    
    @staticmethod #TODO
    def drop_visit_id_and_visit_month(df):
        df_transformed = df.drop(['visit_id', 'visit_month'], axis=1).reset_index(drop=True)
        return df_transformed

[autoreload of utils failed: Traceback (most recent call last):
  File "c:\Users\monfm\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\monfm\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\extensions\autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "c:\Users\monfm\AppData\Local\Programs\Python\Python311\Lib\importlib\__init__.py", line 168, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'utils'
]


In [167]:
assert train_results[0].shape[0] == 47208, "The execution dont respect the dimension of the data during the process"

---