In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm
from sklearn.impute import KNNImputer
from sklearn.base import clone
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.optimize import minimize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, accuracy_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor

import optuna
from scipy import stats


In [11]:
SEED = 42

In [12]:
# Loading the datasets
train_file_path = "train_ft.csv"
train = pd.read_csv(train_file_path)
test_file_path = "test_ft.csv"
test = pd.read_csv(test_file_path)
df = train

In [13]:
train.dropna(subset=['sii'], how='all', inplace=True)

In [14]:
sii_thresholds = [30, 50, 80, 100] # Thresholds for rating
def get_rating(x): ## Thresholds for rating
    if 0 <= x <= 29:
        return 0
    elif 30 <= x <= 49:
        return 1
    elif 50 <= x <= 79:
        return 2
    else:
        return 3

def quadratic_weighted_kappa(y_true, y_pred): # The Quadric Kappa Evaluation Metric
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def evaluate_predictions(y_true, oof_non_rounded):
    rounded_p = get_rating(oof_non_rounded)
    return -quadratic_weighted_kappa(y_true, rounded_p)

#train_impute_lr['new_sii'] = train_impute_lr['PCIAT_Total_Imputed'].apply(get_rating)

In [15]:
for col in train.columns:
    if train[col].dtype == 'object' or str(train[col].dtype) == 'category':
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col].astype(str))

for col in test.columns:
    if test[col].dtype == 'object' or str(test[col].dtype) == 'category':
        le = LabelEncoder()
        test[col] = le.fit_transform(test[col].astype(str))

In [16]:
train_model = train
Y_target = train['PCIAT-PCIAT_Total']
X = train_model.drop(columns=['id', 'sii', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03','PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07',
           'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11',
           'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15',
           'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19',
           'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total'])
Y_class = train['sii']

In [17]:
## BALANCING THE DATASET (UPSAMPLING AND DOWNSAMPLING IS NOT APPROPRIATE FOR THIS DATASET)
def sample_weights_optimized(series):
    """
    Calculate sample weights for continuous PCIAT target variables using equal-width binning to solve class imbalance issue by finding frequency of each pciat total value: Converts frequency into weight:
    Rare bins get higher weights (inverse of frequency is used to calculate weight )
    Common bins get lower weights
  
    
    Parameters:
    series: pandas Series containing all target values (assumed continuous with no nulls)
    
    Returns:
    pandas Series: Sample weights normalized to mean 1.0
    """
    # Handle edge cases efficiently
    if len(series) <= 1 or series.nunique() <= 1:
        return pd.Series(1.0, index=series.index)
    
    # Create equal-width bins directly
    bins = pd.cut(series, bins=10, labels=False)
    
    # Get bin counts and calculate inverse frequency in one step
    bin_counts = bins.value_counts()
    inverse_freq = 1.0 / bin_counts
    
    # Map weights back to samples using the bin indices
    weights = bins.map(inverse_freq)
    
    # Normalize weights to mean 1.0
    return weights / weights.mean()

In [18]:
def convert_to_categories(predictions, boundary_values):
    """
    Transform continuous prediction values into discrete categories 
    based on specified boundary thresholds.
    """
    categories = np.zeros(len(predictions), dtype=int)
    
    # Apply each threshold sequentially
    for category, threshold in enumerate(boundary_values):
        categories[predictions >= threshold] = category + 1
        
    return categories


def find_optimal_boundaries(actual_values, model_predictions, initial_boundaries=None):
    """
    Determine the optimal category boundaries that maximize agreement
    between predictions and actual values using quadratic weighted kappa.
    """
    # Set default initial boundaries if none provided
    if initial_boundaries is None:
        initial_boundaries = [0.5, 1.5, 2.5]
    
    # Define optimization objective function
    def objective_function(boundaries, actuals, predictions):
        categorized_predictions = convert_to_categories(predictions, boundaries)
        kappa_score = cohen_kappa_score(actuals, categorized_predictions, weights='quadratic')
        # Return negative since we want to maximize kappa
        return -kappa_score
    
    # Perform optimization
    optimization_result = minimize(
        objective_function, 
        x0=initial_boundaries,
        args=(actual_values, model_predictions),
        method='Powell'
    )
    
    # Verify optimization completed successfully
    if not optimization_result.success:
        raise RuntimeError("Boundary optimization failed to converge")
        
    return optimization_result.x

In [19]:
def evaluate_k_fold_validate(estimator, dataset, target_col, class_col, splitter, apply_weighting=False, show_progress=False):
   """
   Evaluates a model using k-fold validation, optimizing decision boundaries to maximize agreement.
   
   Parameters:
   - estimator: Model object with fit and predict methods
   - dataset: DataFrame containing all data
   - predictors: List of feature column names
   - target_col: Column name for continuous target variable
   - class_col: Column name for class labels
   - splitter: Cross-validation iterator
   - apply_weighting: Whether to apply balancing weights during training
   - show_progress: Whether to display progress information
   
   Returns:
   - mean_agreement: Average kappa agreement score across folds
   - holdout_predictions: Predictions for all samples from their respective holdout folds
   - boundary_sets: Optimized decision boundaries from each fold
   """
   agreement_metrics = []
   holdout_predictions = np.zeros(dataset.shape[0])
   boundary_sets = []
   
   # Default decision boundaries if needed
   default_boundaries = sii_thresholds
   
   # Iterate through cross-validation folds
   for fold_num, (training_indices, validation_indices) in enumerate(splitter.split(dataset, class_col)):
       # Extract training and validation data
       X_training = dataset.iloc[training_indices]
       y_training_target = target_col.iloc[training_indices] ## PCIAT Total
       y_training_class = class_col.iloc[training_indices] ## SII
       
       X_validation = dataset.iloc[validation_indices]
       y_validation_target = target_col.iloc[validation_indices]  ## PCIAT Total
       y_validation_class = class_col.iloc[validation_indices] ## SIIs

       # # Remove the 'id' column from your training data
       # X_training = X_training.drop(columns=['id'])
       # X_validation = X_validation.drop(columns=['id'])
       
       # Apply sample weighting if needed for class imbalance issue 
       if apply_weighting:
           sample_weights = sample_weights_optimized(y_training_target)
           estimator.fit(X_training, y_training_target, sample_weight=sample_weights)
       else:
           estimator.fit(X_training, y_training_target)
       
       # Generate predictions
       training_predictions = estimator.predict(X_training)
       validation_predictions = estimator.predict(X_validation)
       
       # Store predictions in the appropriate positions
       holdout_predictions[validation_indices] = validation_predictions
       
       # Optimize decision boundaries based on training predictions
       optimized_boundaries = find_optimal_boundaries(
           y_training_class, 
           training_predictions, 
           initial_boundaries=default_boundaries
       )
       boundary_sets.append(optimized_boundaries)
       
       # Apply optimized boundaries to validation predictions
       discretized_predictions = convert_to_categories(validation_predictions, optimized_boundaries)
       
       # Calculate kappa metric
       kappa = cohen_kappa_score(y_validation_class, discretized_predictions, weights='quadratic')
       agreement_metrics.append(kappa)
       # 
       if show_progress:
           print(f"Fold {fold_num+1}: Quadratic Kappa = {kappa:.4f}")
   
   if show_progress:
       print(f"Mean Agreement: {np.mean(agreement_metrics):.4f}")
       print(f"Standard Deviation: {np.std(agreement_metrics):.4f}")
   
   return np.mean(agreement_metrics), holdout_predictions, boundary_sets

In [20]:
def optimize_xgb(trial, dataset, target_col, class_col,splitter, apply_weighting):
    """
    Optuna objective function for XGBoost parameter optimization.
    
    Parameters:
    trial: Optuna trial object
    X: Features DataFrame
    y: Target Series
    eval_metric: Metric to evaluate on
    
    Returns:
    float: Mean cross-validation score (RMSE)
    """
    # Parameter search space definition
    params = {
        'objective': trial.suggest_categorical('objective', ['reg:squarederror', 'reg:tweedie']),
        'tree_method': 'approx',
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 0.1),
        'random_state': 42,
        'verbosity': 0
    }
    
    # Add tweedie-specific parameters if that objective is selected
    if params['objective'] == 'reg:tweedie':
        params['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1.0, 2.0)
    
    model = XGBRegressor(**params)
    score, _, _ = evaluate_k_fold_validate(
        estimator=model,
        dataset=dataset,
        target_col=target_col,
        class_col=class_col,
        splitter=splitter,
        apply_weighting=apply_weighting,
        show_progress=False
    )
    return score


def optimize_lgbm(trial, dataset, target_col, class_col, splitter, apply_weighting):
    params = {
        'objective': trial.suggest_categorical('objective', ['regression', 'poisson', 'tweedie']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'random_state': 42,
        'verbosity': -1
    }

    if params['objective'] == 'tweedie':
        params['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1.0, 2.0)

    model = LGBMRegressor(**params)
    score, _, _ = evaluate_k_fold_validate(
        estimator=model,
        dataset=dataset,
        target_col=target_col,
        class_col=class_col,
        splitter=splitter,
        apply_weighting=apply_weighting,
        show_progress=False
    )
    return score


def optimize_extraTrees(trial, dataset, target_col, class_col, splitter, apply_weighting):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }

    model = ExtraTreesRegressor(**params)
    score, _, _ = evaluate_k_fold_validate(
        estimator=model,
        dataset=dataset,
        target_col=target_col,
        class_col=class_col,
        splitter=splitter,
        apply_weighting=apply_weighting,
        show_progress=False
    )
    return score

def optimize_catboost(trial, dataset, target_col, class_col, splitter, apply_weighting):
    params = {
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE', 'Poisson']),
        'iterations': trial.suggest_int('iterations', 100, 300),
        'depth': trial.suggest_int('depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 0.1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 60),
        'random_seed': 42,
        'verbose': 0
    }

    model = CatBoostRegressor(**params)
    score, _, _ = evaluate_k_fold_validate(
        estimator=model,
        dataset=dataset,
        target_col=target_col,
        class_col=class_col,
        splitter=splitter,
        apply_weighting=apply_weighting,
        show_progress=False
    )
    return score
    

def run_optuna_optimization(dataset, target_col, class_col, model_type, splitter, apply_weighting=True, n_trials=30):
    """
    Run Optuna optimization for the specified model type.
    
    Parameters:
    X: Features DataFrame
    y: Target Series
    model_type: Type of model to optimize ('xgb', 'lgbm', or 'catboost')
    n_trials: Number of optimization trials
    
    Returns:
    dict: Best parameters found by Optuna
    """
    print(f"Starting optimization for {model_type.upper()} with {n_trials} trials")
    
    # Create Optuna study (set direction to minimize for RMSE)
    study = optuna.create_study(direction='maximize')
    
    # Select the appropriate objective function
    if model_type.lower() == 'xgb':
        study.optimize(lambda trial: optimize_xgb(trial, dataset, target_col, class_col, splitter, apply_weighting),
                   n_trials=n_trials)
    elif model_type.lower() == 'lgbm':
        study.optimize(lambda trial: optimize_lgbm(trial, dataset, target_col, class_col, splitter, apply_weighting),
                   n_trials=n_trials)
    elif model_type.lower() == 'extratrees':
        study.optimize(lambda trial: optimize_extraTrees(trial, dataset, target_col, class_col, splitter, apply_weighting),
                   n_trials=n_trials)
    elif model_type.lower() == 'catboost':
        study.optimize(lambda trial: optimize_xgb(trial, dataset, target_col, class_col, splitter, apply_weighting),
                   n_trials=n_trials)
    else:
        raise ValueError("model_type must be 'xgb', 'lgbm', 'catboost' or 'extratrees'")
    
    print("\nBest Parameters:")
    print(study.best_params)
    print(f"Best Quadratic Kappa: {study.best_value:.4f}")
    return study.best_params

In [21]:
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [22]:
# For XGBoost
XGB_OPTIMAL_PARAMS = run_optuna_optimization(X, Y_target, Y_class, model_type='xgb', splitter=splitter, apply_weighting=True, n_trials=30)

# For LightGBM
LGBM_OPTIMAL_PARAMS = run_optuna_optimization(X, Y_target, Y_class, model_type='lgbm', splitter=splitter, apply_weighting=True, n_trials=30)

# For CatBoost
CATBOOST_OPTIMAL_PARAMS = run_optuna_optimization(X, Y_target, Y_class, model_type='catboost', splitter=splitter, apply_weighting=True, n_trials=30)

# For ExtraTree
XTRATREE_OPTIMAL_PARAMS = run_optuna_optimization(X, Y_target, Y_class, model_type='extratrees', splitter=splitter, apply_weighting=True, n_trials=30)

[I 2025-04-17 22:03:39,990] A new study created in memory with name: no-name-39398760-8958-40e2-9b31-9f078c77ed7a


Starting optimization for XGB with 30 trials


[I 2025-04-17 22:03:46,170] Trial 0 finished with value: 0.9944540166555506 and parameters: {'objective': 'reg:tweedie', 'n_estimators': 214, 'max_depth': 5, 'learning_rate': 0.058131242366271714, 'subsample': 0.8350136284893762, 'colsample_bytree': 0.8425491436287053, 'reg_alpha': 0.008320724359235128, 'reg_lambda': 0.0004251631484097355, 'tweedie_variance_power': 1.8880546620140883}. Best is trial 0 with value: 0.9944540166555506.
[I 2025-04-17 22:03:49,998] Trial 1 finished with value: 0.9932235702390001 and parameters: {'objective': 'reg:squarederror', 'n_estimators': 296, 'max_depth': 5, 'learning_rate': 0.051381152682945706, 'subsample': 0.7045204257835828, 'colsample_bytree': 0.7190558659684394, 'reg_alpha': 0.09069581793382966, 'reg_lambda': 0.09474529207531625}. Best is trial 0 with value: 0.9944540166555506.
[I 2025-04-17 22:03:54,005] Trial 2 finished with value: 0.9728698963413297 and parameters: {'objective': 'reg:tweedie', 'n_estimators': 255, 'max_depth': 2, 'learning_ra


Best Parameters:
{'objective': 'reg:squarederror', 'n_estimators': 153, 'max_depth': 6, 'learning_rate': 0.013769795623498312, 'subsample': 0.5036259961123959, 'colsample_bytree': 0.8990403873185487, 'reg_alpha': 0.03718980808445292, 'reg_lambda': 0.02905102865197595}
Best Quadratic Kappa: 1.0000
Starting optimization for LGBM with 30 trials


[I 2025-04-17 22:05:25,597] Trial 0 finished with value: 0.999086783692646 and parameters: {'objective': 'tweedie', 'n_estimators': 264, 'max_depth': 6, 'learning_rate': 0.042108703182399225, 'subsample': 0.8807326778859068, 'colsample_bytree': 0.8268314891460407, 'min_child_samples': 77, 'tweedie_variance_power': 1.1007983721478802}. Best is trial 0 with value: 0.999086783692646.
[I 2025-04-17 22:05:28,952] Trial 1 finished with value: 0.996944676066996 and parameters: {'objective': 'tweedie', 'n_estimators': 295, 'max_depth': 4, 'learning_rate': 0.03213793447305246, 'subsample': 0.6249670102548569, 'colsample_bytree': 0.6866324679858986, 'min_child_samples': 10, 'tweedie_variance_power': 1.8917917994050568}. Best is trial 0 with value: 0.999086783692646.
[I 2025-04-17 22:05:30,285] Trial 2 finished with value: 0.9956929789103881 and parameters: {'objective': 'tweedie', 'n_estimators': 111, 'max_depth': 3, 'learning_rate': 0.0421330553061845, 'subsample': 0.7787679978934577, 'colsampl


Best Parameters:
{'objective': 'poisson', 'n_estimators': 201, 'max_depth': 6, 'learning_rate': 0.08636427693081034, 'subsample': 0.7886973921073616, 'colsample_bytree': 0.891452897563926, 'min_child_samples': 43}
Best Quadratic Kappa: 0.9997
Starting optimization for CATBOOST with 30 trials


[I 2025-04-17 22:07:05,454] Trial 0 finished with value: 0.9721850009712185 and parameters: {'objective': 'reg:tweedie', 'n_estimators': 201, 'max_depth': 2, 'learning_rate': 0.05758940192305538, 'subsample': 0.7769602622748815, 'colsample_bytree': 0.585462333511136, 'reg_alpha': 0.062355738541679716, 'reg_lambda': 0.05734873000379019, 'tweedie_variance_power': 1.9387442553489547}. Best is trial 0 with value: 0.9721850009712185.
[I 2025-04-17 22:07:08,354] Trial 1 finished with value: 0.9883269915910129 and parameters: {'objective': 'reg:tweedie', 'n_estimators': 103, 'max_depth': 5, 'learning_rate': 0.06094165459596609, 'subsample': 0.5781175500659781, 'colsample_bytree': 0.7121289425757289, 'reg_alpha': 0.038879631183110455, 'reg_lambda': 0.07301627235587334, 'tweedie_variance_power': 1.3902673927154106}. Best is trial 1 with value: 0.9883269915910129.
[I 2025-04-17 22:07:11,990] Trial 2 finished with value: 0.9855409516396618 and parameters: {'objective': 'reg:tweedie', 'n_estimator


Best Parameters:
{'objective': 'reg:squarederror', 'n_estimators': 262, 'max_depth': 4, 'learning_rate': 0.03266711907186448, 'subsample': 0.6480198149422716, 'colsample_bytree': 0.898523689014005, 'reg_alpha': 0.09266144837373504, 'reg_lambda': 0.0316571095313497}
Best Quadratic Kappa: 1.0000
Starting optimization for EXTRATREES with 30 trials


[I 2025-04-17 22:08:39,964] Trial 0 finished with value: 0.9444645693546713 and parameters: {'n_estimators': 162, 'max_depth': 20, 'min_samples_leaf': 10, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.9444645693546713.
[I 2025-04-17 22:08:43,051] Trial 1 finished with value: 0.9444310673447591 and parameters: {'n_estimators': 388, 'max_depth': 27, 'min_samples_leaf': 20, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.9444645693546713.
[I 2025-04-17 22:08:48,522] Trial 2 finished with value: 0.9625581051277796 and parameters: {'n_estimators': 372, 'max_depth': 30, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 2 with value: 0.9625581051277796.
[I 2025-04-17 22:09:04,410] Trial 3 finished with value: 0.9996888323819114 and parameters: {'n_estimators': 266, 'max_depth': 27, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': False}. Best is trial 3 with value: 0.9996888323819114.
[I 2025-04-1


Best Parameters:
{'n_estimators': 362, 'max_depth': 23, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': True}
Best Quadratic Kappa: 1.0000


In [23]:
model_param_combination = [
    [LGBMRegressor, LGBM_OPTIMAL_PARAMS],
    [XGBRegressor, XGB_OPTIMAL_PARAMS],
    [ExtraTreesRegressor, XTRATREE_OPTIMAL_PARAMS],
    [CatBoostRegressor, CATBOOST_OPTIMAL_PARAMS],
]
train_list = []
predictions_list = []
score_list = []
weights = sample_weights_optimized(train['PCIAT-PCIAT_Total'])
test = test.drop(columns=['id', 'BIA-BIA_BMI', 'SDS-SDS_Total_T'])

for model,params in model_param_combination:
    print(f"Model: {model.__name__}, Parameters: {params}")
    model_instance = model(**params)
    kappa_score, oof, thresholds = evaluate_k_fold_validate(
        model_instance, X, Y_target, Y_class, splitter, apply_weighting=True)
    score_list.append(kappa_score)

    model_instance.fit(X, Y_target, sample_weight=weights)
    thresholds_ens = np.mean(thresholds, axis=0)
    predictions = model_instance.predict(test)
    predictions = convert_to_categories(predictions, thresholds_ens)
    predictions_list.append(predictions)
    train_pred = model_instance.predict(X)
    train_pred = convert_to_categories(train_pred, thresholds_ens)
    train_list.append(train_pred)

ENSEMBLE = 'voting'
if ENSEMBLE == 'voting':
    # Mode voting (majority rules)
    test_preds = np.array(predictions_list)
    voted_test = stats.mode(test_preds, axis=0).mode.flatten().astype(int)
    final_test = voted_test

Model: LGBMRegressor, Parameters: {'objective': 'poisson', 'n_estimators': 201, 'max_depth': 6, 'learning_rate': 0.08636427693081034, 'subsample': 0.7886973921073616, 'colsample_bytree': 0.891452897563926, 'min_child_samples': 43}


[LightGBM] [Fatal] The number of features in data (127) is not the same as it was in training data (165).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.


LightGBMError: The number of features in data (127) is not the same as it was in training data (165).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.

In [None]:
final_test

In [None]:
# Create submission file
submission = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv")
submission['sii'] = final_test
submission.to_csv("submission.csv", index=False)