In [None]:
pip install --upgrade dask

In [None]:
pip install dask_ml

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
#from sklearn.decomposition import PCA


#import dask.array as da
#from dask_ml.decomposition import PCA
#import dask.dataframe as dd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test tables from their parquet files.
train_df, test_df = (
    pd.read_parquet(path)
    for path in (
        "/kaggle/input/drw-crypto-market-prediction/train.parquet",
        "/kaggle/input/drw-crypto-market-prediction/test.parquet",
    )
)

In [None]:
# Print the structural summary of both frames, separated by a 50-dash rule.
separator = '-' * 50
train_df.info()
print(separator)
test_df.info()

In [None]:
# Work on a reproducible 5% random sample of the training rows.
train_df_sample = train_df.sample(
    frac=0.05,
    random_state=1232,
)

In [None]:
# Turn +/- infinity into NaN so those entries are counted as missing values.
train_df_cleaned = train_df_sample.replace([np.inf, -np.inf], np.nan)
train_df_cleaned.head()

In [None]:
# Count missing entries per column and show only the columns that have at least one.
missing_values_count = train_df_cleaned.isna().sum()
missing_columns = missing_values_count.loc[missing_values_count.gt(0)]
print(missing_columns)

Hasta el momento, todas las columnas tienen todos sus valores completos; ahora debo ver qué columnas son importantes para utilizar el método de análisis de componentes principales.

In [None]:
# Columns X697 through X717 are removed before any further feature screening.
columns = [f'X{number}' for number in range(697, 718)]
train_df_useful = train_df_cleaned.drop(columns=columns)

In [None]:
print(f"Dimensiones iniciales del dataset: {train_df_useful.shape}")
# Option 1: drop every column whose standard deviation is exactly zero.
constant_columns_std = []
for col in train_df_useful.columns:
    if train_df_useful[col].std() == 0:
        constant_columns_std.append(col)
df_cleaned = train_df_useful.drop(columns=constant_columns_std)
print(f"Columnas eliminadas por desviación estándar cero: {len(constant_columns_std)}")
print(f"Dimensiones después de eliminar constantes por STD: {df_cleaned.shape}")

In [None]:
# Option 2: drop columns that hold a single distinct (non-null) value.
# This catches exact constants more reliably than the standard-deviation test.
distinct_counts = df_cleaned.nunique()
constant_columns_unique = distinct_counts[distinct_counts == 1].index.tolist()
df_cleaned_unique = df_cleaned.drop(columns=constant_columns_unique)
print(f"Columnas eliminadas por un único valor: {len(constant_columns_unique)}")
print(f"Dimensiones después de eliminar constantes por valores únicos: {df_cleaned_unique.shape}")

In [None]:
# Quasi-constant columns: the most frequent value covers more than `threshold`
# (here 90%) of the non-null entries.
threshold = 0.90  # Minimum share of the dominant value to call a column quasi-constant
quasi_constant_columns = []
for col in df_cleaned.columns:
    frequencies = df_cleaned[col].value_counts(normalize=True)
    # Entirely empty columns yield no frequencies and are left alone.
    if not frequencies.empty and frequencies.iloc[0] > threshold:
        quasi_constant_columns.append(col)

df_cleaned_quasi = df_cleaned.drop(columns=quasi_constant_columns)
print(f"Columnas eliminadas por ser cuasi-constantes ({threshold*100}%): {len(quasi_constant_columns)}")
print(f"Dimensiones después de eliminar cuasi-constantes: {df_cleaned_quasi.shape}")


In [None]:
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Works on 'df_cleaned_quasi', the frame left after removing constant and
# quasi-constant columns.

# Step 1 (disabled): restrict to numeric columns if that was not done earlier.
#df_numeric = df_cleaned.select_dtypes(include=np.number)

# Step 2: absolute correlation matrix between FEATURES only. The target column
# 'label' is excluded so that it can never be flagged and dropped by this filter
# (later cells read train_df_useful["label"]).
feature_frame = df_cleaned_quasi.drop(columns=['label'], errors='ignore')
correlation_matrix = feature_frame.corr().abs()
print("Matriz de Correlación (primeras 5x5):")
print(correlation_matrix.head())

# Flag a column when it is correlated above the threshold with any column that
# appears BEFORE it; the earlier column of each pair is the one that is kept.
threshold_corr = 0.85  # Correlation cut-off (e.g. 0.85 or 0.9)
# Strictly-lower triangle: entry (i, j) with j < i; everything else is zeroed out.
lower_triangle = np.tril(correlation_matrix.to_numpy(), k=-1)
is_redundant = (lower_triangle > threshold_corr).any(axis=1)
highly_correlated_cols = set(correlation_matrix.columns[is_redundant])

print(f"Columnas identificadas como altamente correlacionadas (umbral > {threshold_corr}): {len(highly_correlated_cols)}")


In [None]:
# Remove every column flagged as highly correlated with an earlier one.
columns_to_drop = list(highly_correlated_cols)
df_after_corr_removal = df_cleaned_quasi.drop(columns=columns_to_drop)
print(f"Dimensiones después de eliminar columnas por alta correlación: {df_after_corr_removal.shape}")

In [None]:
# Rebind train_df_useful to the correlation-filtered frame; the following cells
# (histograms, feature/target split) read the training data from this name.
train_df_useful = df_after_corr_removal

In [None]:
# Draw one 70-bin histogram per remaining column on a single 30x30 figure.
train_df_useful.hist(bins=70, figsize=(30,30))

In [None]:
# Split the selected frame into the regression target and the feature matrix.
target_column = "label"
Y_train = train_df_useful[target_column]
X_train = train_df_useful.drop(columns=[target_column])

In [None]:
# Standardize the training features and wrap the result back into a DataFrame
# that carries the original feature names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_scaled_df = pd.DataFrame(
    data=X_scaled,
    columns=X_train.columns,
)

In [None]:
# Fit an IncrementalPCA with 20 components on the standardized training features
# and keep the projected training rows in principal_components.
pca = IncrementalPCA(n_components=20)
principal_components = pca.fit_transform(X_scaled_df)

In [None]:
## 4. Explained variance

# Share of variance explained by each individual component.
explained_variance_ratio = pca.explained_variance_ratio_

# Running total of the explained variance across components.
cumulative_explained_variance = explained_variance_ratio.cumsum()

In [None]:
## 5. Plot the explained variance as bar charts

component_numbers = range(1, len(explained_variance_ratio) + 1)

# First figure: variance explained by each component.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_numbers, explained_variance_ratio, alpha=0.8, color='skyblue')
ax.set_xlabel('Número de Componente Principal')
ax.set_ylabel('Proporción de Varianza Explicada')
ax.set_title('Gráfico de Barras de Varianza Explicada por Componente Principal (Scree Plot)')
ax.set_xticks(component_numbers)
ax.grid(axis='y', linestyle='--', alpha=0.7)  # Horizontal grid lines only, for readability
plt.show()

# Second figure: the same bars with the cumulative variance drawn on top.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_numbers, explained_variance_ratio,
       alpha=0.8, color='skyblue', label='Varianza Explicada Individual')
ax.plot(component_numbers, cumulative_explained_variance,
        marker='x', linestyle='-', color='red', label='Varianza Explicada Acumulada')
ax.set_xlabel('Número de Componente Principal')
ax.set_ylabel('Proporción de Varianza Explicada')
ax.set_title('Gráfico de Barras y Línea de Varianza Explicada (Scree Plot)')
ax.set_xticks(component_numbers)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend()
plt.show()

In [None]:
# 1. Build the principal-component column names: 'PC1', 'PC2', ..., 'PC20'.
n_components_to_keep = 20
pca_column_names = [f'PC{number}' for number in range(1, n_components_to_keep + 1)]

# 2. Wrap the projected rows in a DataFrame that reuses the sampled frame's index,
# so the components can be aligned with the sampled rows (and their labels).
df_pca = pd.DataFrame(
    principal_components,
    index=train_df_sample.index,
    columns=pca_column_names,
)

print("\nDataFrame de PCA (primeras 5 filas):")
df_pca.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import entropy

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#import xgboost as xgb
import lightgbm as lgb
#from catboost import CatBoostRegressor
import xgboost as xgb

## Training and testing different models

In [None]:
# Hold out 30% of the PCA-projected rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df_pca,
    Y_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
def objective(trial):
    """Optuna objective: train a LightGBM regressor with sampled hyperparameters.

    The model is fitted on the notebook-level ``x_train``/``y_train``. The
    notebook-level ``x_test``/``y_test`` serve both as the early-stopping
    evaluation set and as the data the returned score is computed on.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the model's predictions on ``x_test``.
    """
    # Sampled in this fixed order: num_leaves, max_depth, bagging_fraction,
    # min_child_samples, n_estimators, learning_rate.
    # (lambda_l1, lambda_l2, feature_fraction and bagging_freq are not tuned.)
    # NOTE(review): LightGBM documents that bagging only takes effect when
    # bagging_freq is non-zero; confirm bagging_fraction is actually used here.
    sampled_params = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        "n_estimators": trial.suggest_int("n_estimators", 100, 5000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
    }
    regressor = lgb.LGBMRegressor(
        objective="regression",
        metric='rmse',
        verbosity=-1,
        boosting_type="gbdt",
        **sampled_params,
    )
    # Stop adding trees once the evaluation RMSE has not improved for 100 rounds.
    stop_early = lgb.early_stopping(stopping_rounds=100, verbose=False)
    regressor.fit(
        x_train,
        y_train,
        eval_metric='rmse',
        eval_set=[(x_test, y_test)],
        callbacks=[stop_early],
    )
    return mean_squared_error(y_test, regressor.predict(x_test))

In [None]:
pip install optuna-integration[lightgbm]

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score

from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# TPE sampler with a fixed seed so the hyperparameter search is repeatable.
sampler = TPESampler(seed=1)

In [None]:
# The objective returns a mean squared error, so the study has to MINIMIZE it;
# maximizing would select the worst-performing hyperparameters as "best".
study = optuna.create_study(study_name="LightGBM", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=100)

In [None]:
# Report how many trials ran, then the best trial's objective value and parameters.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

In [None]:
# Build a LightGBM regressor from the best trial's sampled hyperparameters.
LightGBM_Regressor_Tuned = lgb.LGBMRegressor(
    **trial.params, 
    verbose=-1, 
    #device='cuda'
)

In [None]:
# Train the tuned regressor on the training split.
LightGBM_Regressor_Tuned.fit(x_train, y_train)

In [None]:
# Rebuild the regressor from a fixed, hard-coded set of hyperparameters.
# This rebinds LightGBM_Regressor_Tuned, replacing the model built from trial.params.
LightGBM_Regressor_Tuned = lgb.LGBMRegressor(
    n_estimators = 696,
    learning_rate = 0.07789968841005522,
    num_leaves = 190,
    max_depth = 14,
    min_child_samples = 28,
    subsample = 0.7852145041579535,
    colsample_bytree = 0.9527399899405662,
    reg_alpha = 2.5910995218247588,
    reg_lambda = 6.658046086348257e-08
)
# A freshly constructed estimator is unfitted; train it here so the later
# predict() calls do not fail with a not-fitted error.
LightGBM_Regressor_Tuned.fit(x_train, y_train)

In [None]:
# Predict the label for the held-out evaluation split.
Predict_label = LightGBM_Regressor_Tuned.predict(x_test)

# Prediction and Submission

In [None]:
# Print the name of every column kept after feature screening, one per line.
# The loop variable is deliberately not called `columns`, so the notebook-level
# `columns` list is not overwritten with a single column name.
for column_name in train_df_useful.columns:
    print(column_name)

In [None]:
# Feature columns that the training scaler was fitted on, taken directly from the
# training feature matrix. Deriving them (instead of pasting a hand-copied list)
# keeps the test features in sync with the training-time column selection even
# when the sample or the filtering thresholds change.
desired_columns = list(X_train.columns)
# --- 3. Keep the desired columns that exist in the test set ---
existing_columns = [col for col in desired_columns if col in test_df.columns]
missing_columns = [col for col in desired_columns if col not in test_df.columns]

# Warn (without stopping) when the test set lacks some of the expected features.
if missing_columns:
    print(f"\nAdvertencia: Las siguientes columnas deseadas no se encontraron en el dataset original: {missing_columns}")
    print("Por favor, verifica los nombres de las columnas o si el dataset original es el correcto.")

# From here on df_cleaned refers to the filtered TEST frame.
df_cleaned = test_df[existing_columns].copy()

# --- 4. Show information about the filtered dataset ---
print(f"\nDataset filtrado creado. Dimensiones: {df_cleaned.shape}")
print("\nPrimeras 5 filas del dataset filtrado:")
print(df_cleaned.head())

print(f"\nColumnas en el dataset filtrado ({len(df_cleaned.columns)}):")
print(df_cleaned.columns.tolist())


In [None]:
# Turn +/- infinity into NaN in the filtered test frame.
test_df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan)
test_df_cleaned.head()

In [None]:
# Same X697..X717 exclusion list that is applied to the training data.
columns = ['X697','X698','X699','X700','X701','X702','X703','X704','X705','X706','X707','X708','X709','X710','X711','X712','X713','X714','X715','X716','X717']
# The filtered test frame only holds the selected feature columns, so these labels
# may not be present; errors='ignore' skips absent labels instead of raising KeyError.
test_df_useful = test_df_cleaned.drop(columns=columns, errors='ignore')

In [None]:
# Use the cleaned test frame as-is; this rebinds test_df_useful and overrides
# whatever the previous cell assigned to it.
test_df_useful = test_df_cleaned

In [None]:
# Standardize the test features with the scaler that was fitted on the TRAINING
# features. Fitting a new scaler on the test set would put the test data on a
# different scale from the one the model was trained on.
X_scaled_useful = scaler.transform(test_df_useful)
X_scaled_df_useful = pd.DataFrame(X_scaled_useful, columns=test_df_useful.columns)

In [None]:
# Project the test features with the PCA that was fitted on the TRAINING data.
# Refitting PCA on the test set would produce different component axes, so the
# resulting PC1..PC20 would not be the features the regressor was trained on.
principal_components = pca.transform(X_scaled_df_useful)

In [None]:
# 1. Build the principal-component column names: 'PC1', 'PC2', ..., 'PC20'.
n_components_to_keep = 20
pca_column_names = [f'PC{number}' for number in range(1, n_components_to_keep + 1)]

# 2. Wrap the projected test rows in a DataFrame that reuses the test frame's index,
# so predictions can be aligned with the original test rows.
df_pca_test = pd.DataFrame(
    principal_components,
    index=test_df.index,
    columns=pca_column_names,
)

print("\nDataFrame de PCA (primeras 5 filas):")
df_pca_test.head()

In [None]:
# Predict the label for every row of the PCA-projected test set.
Prediction_Y = LightGBM_Regressor_Tuned.predict(df_pca_test)

In [None]:
# Copy of the test frame with the predictions attached as a 'prediction' column.
prediction_df = test_df.assign(prediction=Prediction_Y)
prediction_df.head()

In [None]:
# Keep only the prediction column (as a one-column DataFrame) for the submission.
Submission = prediction_df[['prediction']]
Submission.head()

In [None]:
# Replace the index with consecutive integers 1..len(Submission).
Submission.index = pd.RangeIndex(start=1, stop=len(Submission) + 1)
Submission.tail()

In [None]:
# Name the index 'ID' so it becomes an 'ID' column once the index is reset.
Submission.index.name = 'ID'

In [None]:
# Move the 'ID' index into a regular column.
Final_submission = Submission.reset_index()

In [None]:
# Preview the first rows of the submission table.
Final_submission.head()

In [None]:
# Write the submission to CSV in the working directory, without the DataFrame index.
Final_submission.to_csv('Prediction_LightGBM_Regressor_1.csv', index=False)

## With PCA and ALL COLUMNS
* Number of finished trials:  100
* Best trial:
    * Value:  0.9779108503752775
    * Params: 
        * num_leaves: 2
        * max_depth: 11
        * bagging_fraction: 0.8466040335767704
        * min_child_samples: 70
        * n_estimators: 1205
        * learning_rate: 0.010147669453001346
* **score:0.01317**
## With PCA, but with cleaning
* Number of finished trials:  100
* Best trial:
  * Value:  0.9808493378105629
  * Params: 
    * num_leaves: 61
    * max_depth: 11
    * bagging_fraction: 0.8210028107967959
    * min_child_samples: 38
    * n_estimators: 3450
    * learning_rate: 0.29376295914423245
* **score:0.01576**