# PyCaret for finding models

In [None]:
!pip install pycaret[full]

# Import necessary libraries
import pandas as pd
import numpy as np
from pycaret.regression import *
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the training and test CSV files.
file_path1, file_path2 = "/content/train.csv", "/content/test.csv"

# Load each CSV into its own DataFrame (train first, then test).
data1, data2 = (pd.read_csv(csv_path) for csv_path in (file_path1, file_path2))

# Finding models based on R2 score

In [None]:
import pandas as pd
from pycaret.regression import *
# Read the data
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Initialize PyCaret setup
reg = setup(
    data=train,
    target='target',
    session_id=123,
    verbose=False
)

# Compare models and keep the 15 best ones
models = compare_models(n_select=15)

# Get the comparison results as a DataFrame (must be pulled before any other
# PyCaret call replaces the "last displayed" table)
comparison_df = pull()

# Names under which predict_model has exposed regression predictions.
# NOTE(review): PyCaret 3.x documents 'prediction_label' and 2.x 'Label';
# 'prediction' is kept only because the original code asked for it.
PREDICTION_COLUMNS = ('prediction_label', 'Label', 'prediction')

# Create predictions for each model
predictions = {}
for i, model in enumerate(models, 1):
    # Train the model
    final_model = create_model(model)

    # Make predictions on test set
    pred = predict_model(final_model, data=test)
    pred_col = next((col for col in PREDICTION_COLUMNS if col in pred.columns), None)
    if pred_col is None:
        raise KeyError(
            f"No prediction column found in predict_model output: {list(pred.columns)}"
        )
    predictions[f'Model{i}_{type(model).__name__}'] = pred[pred_col].values

# Create submission DataFrame with predictions from all models
submissions = pd.DataFrame({'id': test['id']})
for model_name, preds in predictions.items():
    submissions[model_name] = preds

# Show the R² column of the compare_models table for each model
print("\nModel R² Scores on Training Data:")
print(comparison_df[['Model', 'R2']].to_string())

# Save predictions to CSV
submissions.to_csv('model_predictions.csv', index=False)

# The first row of the comparison table is treated as the best model
best_model_name = comparison_df.iloc[0]['Model']
best_model_r2 = comparison_df.iloc[0]['R2']
print(f"\nBest Model: {best_model_name}")
print(f"Best R² Score: {best_model_r2:.4f}")

# Show first few predictions from different models
print("\nSample Predictions from Different Models:")
print(submissions.head())

In [None]:
import pandas as pd
from pycaret.regression import *

# Read the data
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Create a copy of test data without the id column for feature consistency
test_features = test.drop('id', axis=1)

# Merge train and test data
all_data = pd.concat([train, test_features], axis=0, ignore_index=True)

# Remove rows with missing values in the target column.
# NOTE(review): if test.csv carries no 'target' values, this drops every test
# row again and the merge has no effect on training — confirm the intent.
all_data = all_data.dropna(subset=['target'])

# Initialize PyCaret setup with merged data
reg = setup(
    data=all_data,
    target='target',
    train_size=0.8,
    session_id=123,
    verbose=False
)

# Compare models and keep the 15 best ones
models = compare_models(n_select=15)

# Get the comparison results as a DataFrame
comparison_df = pull()

# Names under which predict_model has exposed regression predictions.
# NOTE(review): PyCaret 3.x documents 'prediction_label' and 2.x 'Label';
# 'prediction' is kept only because the original code asked for it.
PREDICTION_COLUMNS = ('prediction_label', 'Label', 'prediction')

# Create predictions for each model
predictions = {}
for i, model in enumerate(models, 1):
    # Train the model
    final_model = create_model(model)

    # Make predictions on test set
    pred = predict_model(final_model, data=test)
    pred_col = next((col for col in PREDICTION_COLUMNS if col in pred.columns), None)
    if pred_col is None:
        raise KeyError(
            f"No prediction column found in predict_model output: {list(pred.columns)}"
        )
    predictions[f'Model_{i}_{type(model).__name__}'] = pred[pred_col].values

# Create submission DataFrame with predictions from all models
submissions = pd.DataFrame({'id': test['id']})
for model_name, preds in predictions.items():
    submissions[model_name] = preds

# Show the R² column of the compare_models table for each model
print("\nModel R² Scores on Training Data:")
print(comparison_df[['Model', 'R2']].to_string())

# Save predictions to CSV
submissions.to_csv('model_predictions.csv', index=False)

# The first row of the comparison table is treated as the best model
best_model_name = comparison_df.iloc[0]['Model']
best_model_r2 = comparison_df.iloc[0]['R2']

print(f"\nBest Model: {best_model_name}")
print(f"Best R² Score: {best_model_r2:.4f}")

# Show first few predictions from different models
print("\nSample Predictions from Different Models:")
print(submissions.head())

# Create a final submission with the first-ranked model's predictions.
# rename() returns a new frame, so no column labels are assigned on a slice.
best_column = f'Model_1_{type(models[0]).__name__}'
best_predictions = submissions[['id', best_column]].rename(columns={best_column: 'target'})
best_predictions.to_csv('best_model_predictions.csv', index=False)

# Print diagnostic information
print("\nShape of merged data before cleaning:", len(train) + len(test))
print("Shape of merged data after cleaning:", len(all_data))
print("\nMissing values in cleaned merged data:")
print(all_data.isnull().sum())

# Model selection based on MAPE

In [None]:
import pandas as pd
import numpy as np
from pycaret.regression import *

def prepare_features(df):
    """
    Add calendar and store/product features to ``df`` in place and return it.

    The ``date`` column is converted to datetime; ``year``, ``month``,
    ``day_of_week`` and ``day_of_month`` are derived from it; and
    ``store_product`` joins ``store`` and ``product`` with an underscore.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates

    # Output column -> attribute of the ``.dt`` accessor that fills it.
    calendar_parts = {
        'year': 'year',
        'month': 'month',
        'day_of_week': 'dayofweek',
        'day_of_month': 'day',
    }
    for column, dt_attr in calendar_parts.items():
        df[column] = getattr(parsed_dates.dt, dt_attr)

    # Combined categorical key, e.g. "<store>_<product>".
    combined_key = df['store'] + "_" + df['product']
    df['store_product'] = combined_key

    return df

# Read the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Remove rows with missing target values from training data
print(f"Training rows before cleaning: {len(train)}")
train = train.dropna(subset=['num_sold'])
print(f"Training rows after cleaning: {len(train)}")

# Prepare features for both train and test
train = prepare_features(train)
test = prepare_features(test)

# Sort data by date for time series modeling
train = train.sort_values('date')
test = test.sort_values('date')

# Initialize PyCaret setup with time-series cross-validation and no shuffling
reg = setup(
    data=train,
    target='num_sold',
    numeric_features=['year', 'month', 'day_of_week', 'day_of_month'],
    categorical_features=['country', 'store', 'product', 'store_product'],
    fold_strategy='timeseries',
    fold=5,  # Number of folds for time series CV
    fold_shuffle=False,  # Required for time series
    data_split_shuffle=False,  # Required for time series
    session_id=123,
    verbose=False
)

# Compare models, ranked by MAPE
models = compare_models(
    n_select=5,  # Select top 5 models
    sort='MAPE',
    exclude=['dummy']  # Exclude baseline models
)

# Get the comparison results
comparison_df = pull()

# Names under which predict_model has exposed regression predictions.
# NOTE(review): PyCaret 3.x documents 'prediction_label' and 2.x 'Label';
# 'prediction' is kept only because the original code asked for it.
PREDICTION_COLUMNS = ('prediction_label', 'Label', 'prediction')


def _prediction_values(scored):
    """Return the prediction column of a predict_model result as a Series."""
    pred_col = next((col for col in PREDICTION_COLUMNS if col in scored.columns), None)
    if pred_col is None:
        raise KeyError(
            f"No prediction column found in predict_model output: {list(scored.columns)}"
        )
    return scored[pred_col]


# The training split does not change inside the loop, so fetch it once
train_features = get_config('X_train')
train_target = get_config('y_train')

# Create predictions for each model
predictions = {}
model_mape_scores = {}

for i, model in enumerate(models, 1):
    # Train the model
    final_model = create_model(model)

    # Score on the rows of the training split (in-sample, not a held-out set)
    val_predictions = predict_model(final_model, data=train_features)
    mape = np.mean(np.abs((train_target - _prediction_values(val_predictions)) / train_target)) * 100
    model_name = f'Model_{i}_{type(model).__name__}'
    model_mape_scores[model_name] = mape

    # Make predictions on test set
    test_pred = predict_model(final_model, data=test)
    predictions[model_name] = _prediction_values(test_pred).values

# Create submission DataFrame (rows follow the date-sorted order of `test`)
submissions = pd.DataFrame({'id': test['id']})
for model_name, preds in predictions.items():
    submissions[model_name] = preds

# Print MAPE scores, best (lowest) first
print("\nModel MAPE Scores on Training Data:")
for model_name, mape in sorted(model_mape_scores.items(), key=lambda x: x[1]):
    print(f"{model_name}: {mape:.2f}%")

# Save predictions
submissions.to_csv('sales_predictions.csv', index=False)

# Get the best model
best_model_name = min(model_mape_scores.items(), key=lambda x: x[1])[0]
best_model_mape = model_mape_scores[best_model_name]
print(f"\nBest Model: {best_model_name}")
print(f"Best MAPE Score: {best_model_mape:.2f}%")

# Show sample predictions
print("\nSample Predictions:")
print(submissions.head())

# Create ensemble prediction (average of all models).
# NOTE(review): this column is added after sales_predictions.csv was written,
# so it exists only in memory — confirm whether it should be saved as well.
submissions['Ensemble_Prediction'] = submissions[[col for col in submissions.columns if col != 'id']].mean(axis=1)

# Finding the best R2 using two or three models with no preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

# Load data
train_df = pd.read_csv('/kaggle/input/new-new/train.csv')
test_df = pd.read_csv('/kaggle/input/new-new/test.csv')

# Prepare features and target (the feature list is defined once and reused)
feature_names = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
X = train_df[feature_names]
y = train_df['target']
X_test = test_df[feature_names]

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
lr_score = r2_score(y_val, lr_pred)
print(f"Linear Regression R² Score: {lr_score:.4f}")

# 2. Elastic Net with Grid Search
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

en = ElasticNet(max_iter=10000, random_state=42)
grid_search = GridSearchCV(en, param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

print("\nElastic Net Best Parameters:", grid_search.best_params_)
print(f"Elastic Net Best R² Score: {grid_search.best_score_:.4f}")

# best_score_ is a cross-validated mean on X_train, while lr_score comes from
# the hold-out split; score Elastic Net on that same hold-out split so the two
# models are compared on identical data.
en_pred = grid_search.best_estimator_.predict(X_val)
en_score = r2_score(y_val, en_pred)
print(f"Elastic Net Validation R² Score: {en_score:.4f}")

# Use the better model for final predictions
if en_score > lr_score:
    print("\nUsing Elastic Net for final predictions")
    best_model = grid_search.best_estimator_
else:
    print("\nUsing Linear Regression for final predictions")
    best_model = lr

# Generate predictions
final_predictions = best_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': final_predictions
})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")

# Display feature coefficients of the best model, largest magnitude first
coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': best_model.coef_
})
print("\nFeature Coefficients:")
print(coefficients.sort_values(by='Coefficient', key=abs, ascending=False))

# A program to find results based on MAPE, with stacking, a meta-model, preprocessing, and hyperparameter tuning with Optuna

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit

# Read the data
train = pd.read_csv('/kaggle/input/new-new-new/train.csv')
test = pd.read_csv('/kaggle/input/new-new-new/test.csv')

# Remove rows with missing target values
print(f"Training rows before cleaning: {len(train)}")
train = train.dropna(subset=['num_sold'])
print(f"Training rows after cleaning: {len(train)}")

# Convert date to datetime and extract features
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# TimeSeriesSplit cuts folds by row position, so the training rows must be in
# chronological order. A stable sort keeps the file order within each date.
# `test` is left unsorted so its rows stay aligned with test['id'].
train = train.sort_values('date', kind='stable').reset_index(drop=True)

# Create date features
for df in [train, test]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['store_product'] = df['store'] + "_" + df['product']

# Encode categorical variables (encoders are fitted on train only, so a
# category that appears only in test makes transform() raise ValueError)
categorical_features = ['country', 'store', 'product', 'store_product']
encoders = {}

for feature in categorical_features:
    encoders[feature] = LabelEncoder()
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# Prepare feature columns
feature_columns = ['year', 'month', 'day_of_week', 'day_of_month'] + categorical_features

# Prepare training data
X = train[feature_columns]
y = train['num_sold']
y_values = y.to_numpy()


def _make_rf():
    """Return a fresh, unfitted random-forest base model."""
    return RandomForestRegressor(
        n_estimators=100,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )


def _make_catboost():
    """Return a fresh, unfitted CatBoost base model."""
    return CatBoostRegressor(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        random_state=42,
        verbose=False
    )


# Initialize base models (fitted on the full training set for test predictions)
rf = _make_rf()
catboost = _make_catboost()

# Initialize meta-model
meta_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.03,
    depth=4,
    random_state=42,
    verbose=False
)

# Create time series split for stacking
tscv = TimeSeriesSplit(n_splits=5)

# Train base models and generate meta-features
print("Training base models and generating meta-features...")
meta_features_train = np.zeros((len(X), 2))  # 2 base models
meta_features_test = np.zeros((len(test), 2))

# Train and predict with base models
print("Training RF...")
rf.fit(X, y)
meta_features_test[:, 0] = rf.predict(test[feature_columns])

print("Training CatBoost...")
catboost.fit(X, y)
meta_features_test[:, 1] = catboost.predict(test[feature_columns])

# Generate out-of-fold meta-features for training.
# TimeSeriesSplit never uses the earliest chunk as a validation fold, so those
# rows get no out-of-fold prediction; oof_mask records which rows did.
oof_mask = np.zeros(len(X), dtype=bool)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    print(f"Processing fold {fold}/{tscv.n_splits}...")
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]

    # Train and predict with RF
    rf_fold = _make_rf()
    rf_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 0] = rf_fold.predict(X_val_fold)

    # Train and predict with CatBoost
    catboost_fold = _make_catboost()
    catboost_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 1] = catboost_fold.predict(X_val_fold)

    oof_mask[val_idx] = True

# Train meta model only on rows that have real out-of-fold predictions;
# the remaining rows still hold the zero placeholders.
print("Training meta model...")
meta_model.fit(meta_features_train[oof_mask], y_values[oof_mask])

# Make final predictions
print("Making final predictions...")
final_predictions = meta_model.predict(meta_features_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'num_sold': final_predictions
})

# Ensure predictions are non-negative
submission['num_sold'] = submission['num_sold'].clip(lower=0)

# Save submission file
submission.to_csv('submission.csv', index=False)

# Display sample of predictions
print("\nSample of submission file:")
print(submission.head())

# Calculate and display the MAPE on the out-of-fold rows.
# These are the rows the meta-model was fitted on, so the figure is optimistic.
# Rows with a zero target are skipped because the percentage error is undefined.
print("\nCalculating validation MAPE...")
meta_predictions = meta_model.predict(meta_features_train[oof_mask])
y_oof = y_values[oof_mask]
nonzero = y_oof != 0
mape = np.mean(np.abs((y_oof[nonzero] - meta_predictions[nonzero]) / y_oof[nonzero])) * 100
print(f"Overall Validation MAPE: {mape:.2f}%")

# Analysis of model predictions
print("\nAnalyzing predictions...")
print("RF mean prediction:", np.mean(meta_features_test[:, 0]))
print("CatBoost mean prediction:", np.mean(meta_features_test[:, 1]))
print("Final stacking mean prediction:", np.mean(final_predictions))

# Code with feature engineering and more

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit

# Read the data
train = pd.read_csv('/kaggle/input/new-new-new/train.csv')
test = pd.read_csv('/kaggle/input/new-new-new/test.csv')

# Remove rows with missing target values
print(f"Training rows before cleaning: {len(train)}")
train = train.dropna(subset=['num_sold'])
print(f"Training rows after cleaning: {len(train)}")

# Convert date to datetime and extract features
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# TimeSeriesSplit cuts folds by row position, so the training rows must be in
# chronological order. A stable sort keeps the file order within each date.
# `test` is left unsorted so its rows stay aligned with test['id'].
train = train.sort_values('date', kind='stable').reset_index(drop=True)

# Create date features
for df in [train, test]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['store_product'] = df['store'] + "_" + df['product']

# Encode categorical variables (encoders are fitted on train only, so a
# category that appears only in test makes transform() raise ValueError)
categorical_features = ['country', 'store', 'product', 'store_product']
encoders = {}

for feature in categorical_features:
    encoders[feature] = LabelEncoder()
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# Prepare feature columns
feature_columns = ['year', 'month', 'day_of_week', 'day_of_month'] + categorical_features

# Prepare training data
X = train[feature_columns]
y = train['num_sold']
y_values = y.to_numpy()


def _make_rf():
    """Return a fresh, unfitted random-forest base model."""
    return RandomForestRegressor(
        n_estimators=100,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )


def _make_catboost():
    """Return a fresh, unfitted CatBoost base model."""
    return CatBoostRegressor(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        random_state=42,
        verbose=False
    )


# Initialize base models (fitted on the full training set for test predictions)
rf = _make_rf()
catboost = _make_catboost()

# Initialize meta-model
meta_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.03,
    depth=4,
    random_state=42,
    verbose=False
)

# Create time series split for stacking
tscv = TimeSeriesSplit(n_splits=5)

# Train base models and generate meta-features
print("Training base models and generating meta-features...")
meta_features_train = np.zeros((len(X), 2))  # 2 base models
meta_features_test = np.zeros((len(test), 2))

# Train and predict with base models
print("Training RF...")
rf.fit(X, y)
meta_features_test[:, 0] = rf.predict(test[feature_columns])

print("Training CatBoost...")
catboost.fit(X, y)
meta_features_test[:, 1] = catboost.predict(test[feature_columns])

# Generate out-of-fold meta-features for training.
# TimeSeriesSplit never uses the earliest chunk as a validation fold, so those
# rows get no out-of-fold prediction; oof_mask records which rows did.
oof_mask = np.zeros(len(X), dtype=bool)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    print(f"Processing fold {fold}/{tscv.n_splits}...")
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]

    # Train and predict with RF
    rf_fold = _make_rf()
    rf_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 0] = rf_fold.predict(X_val_fold)

    # Train and predict with CatBoost
    catboost_fold = _make_catboost()
    catboost_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 1] = catboost_fold.predict(X_val_fold)

    oof_mask[val_idx] = True

# Train meta model only on rows that have real out-of-fold predictions;
# the remaining rows still hold the zero placeholders.
print("Training meta model...")
meta_model.fit(meta_features_train[oof_mask], y_values[oof_mask])

# Make final predictions
print("Making final predictions...")
final_predictions = meta_model.predict(meta_features_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'num_sold': final_predictions
})

# Ensure predictions are non-negative
submission['num_sold'] = submission['num_sold'].clip(lower=0)

# Save submission file
submission.to_csv('submission.csv', index=False)

# Display sample of predictions
print("\nSample of submission file:")
print(submission.head())

# Calculate and display the MAPE on the out-of-fold rows.
# These are the rows the meta-model was fitted on, so the figure is optimistic.
# Rows with a zero target are skipped because the percentage error is undefined.
print("\nCalculating validation MAPE...")
meta_predictions = meta_model.predict(meta_features_train[oof_mask])
y_oof = y_values[oof_mask]
nonzero = y_oof != 0
mape = np.mean(np.abs((y_oof[nonzero] - meta_predictions[nonzero]) / y_oof[nonzero])) * 100
print(f"Overall Validation MAPE: {mape:.2f}%")

# Analysis of model predictions
print("\nAnalyzing predictions...")
print("RF mean prediction:", np.mean(meta_features_test[:, 0]))
print("CatBoost mean prediction:", np.mean(meta_features_test[:, 1]))
print("Final stacking mean prediction:", np.mean(final_predictions))

# Hyperparameter tuning with Ray Tune

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit

# Read the data
train = pd.read_csv('/kaggle/input/new-new-new/train.csv')
test = pd.read_csv('/kaggle/input/new-new-new/test.csv')

# Remove rows with missing target values
print(f"Training rows before cleaning: {len(train)}")
train = train.dropna(subset=['num_sold'])
print(f"Training rows after cleaning: {len(train)}")

# Convert date to datetime and extract features
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# TimeSeriesSplit cuts folds by row position, so the training rows must be in
# chronological order. A stable sort keeps the file order within each date.
# `test` is left unsorted so its rows stay aligned with test['id'].
train = train.sort_values('date', kind='stable').reset_index(drop=True)

# Create date features
for df in [train, test]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['store_product'] = df['store'] + "_" + df['product']

# Encode categorical variables (encoders are fitted on train only, so a
# category that appears only in test makes transform() raise ValueError)
categorical_features = ['country', 'store', 'product', 'store_product']
encoders = {}

for feature in categorical_features:
    encoders[feature] = LabelEncoder()
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# Prepare feature columns
feature_columns = ['year', 'month', 'day_of_week', 'day_of_month'] + categorical_features

# Prepare training data
X = train[feature_columns]
y = train['num_sold']
y_values = y.to_numpy()


def _make_rf():
    """Return a fresh, unfitted random-forest base model."""
    return RandomForestRegressor(
        n_estimators=100,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )


def _make_catboost():
    """Return a fresh, unfitted CatBoost base model."""
    return CatBoostRegressor(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        random_state=42,
        verbose=False
    )


# Initialize base models (fitted on the full training set for test predictions)
rf = _make_rf()
catboost = _make_catboost()

# Initialize meta-model
meta_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.03,
    depth=4,
    random_state=42,
    verbose=False
)

# Create time series split for stacking
tscv = TimeSeriesSplit(n_splits=5)

# Train base models and generate meta-features
print("Training base models and generating meta-features...")
meta_features_train = np.zeros((len(X), 2))  # 2 base models
meta_features_test = np.zeros((len(test), 2))

# Train and predict with base models
print("Training RF...")
rf.fit(X, y)
meta_features_test[:, 0] = rf.predict(test[feature_columns])

print("Training CatBoost...")
catboost.fit(X, y)
meta_features_test[:, 1] = catboost.predict(test[feature_columns])

# Generate out-of-fold meta-features for training.
# TimeSeriesSplit never uses the earliest chunk as a validation fold, so those
# rows get no out-of-fold prediction; oof_mask records which rows did.
oof_mask = np.zeros(len(X), dtype=bool)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    print(f"Processing fold {fold}/{tscv.n_splits}...")
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]

    # Train and predict with RF
    rf_fold = _make_rf()
    rf_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 0] = rf_fold.predict(X_val_fold)

    # Train and predict with CatBoost
    catboost_fold = _make_catboost()
    catboost_fold.fit(X_train_fold, y_train_fold)
    meta_features_train[val_idx, 1] = catboost_fold.predict(X_val_fold)

    oof_mask[val_idx] = True

# Train meta model only on rows that have real out-of-fold predictions;
# the remaining rows still hold the zero placeholders.
print("Training meta model...")
meta_model.fit(meta_features_train[oof_mask], y_values[oof_mask])

# Make final predictions
print("Making final predictions...")
final_predictions = meta_model.predict(meta_features_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'num_sold': final_predictions
})

# Ensure predictions are non-negative
submission['num_sold'] = submission['num_sold'].clip(lower=0)

# Save submission file
submission.to_csv('submission.csv', index=False)

# Display sample of predictions
print("\nSample of submission file:")
print(submission.head())

# Calculate and display the MAPE on the out-of-fold rows.
# These are the rows the meta-model was fitted on, so the figure is optimistic.
# Rows with a zero target are skipped because the percentage error is undefined.
print("\nCalculating validation MAPE...")
meta_predictions = meta_model.predict(meta_features_train[oof_mask])
y_oof = y_values[oof_mask]
nonzero = y_oof != 0
mape = np.mean(np.abs((y_oof[nonzero] - meta_predictions[nonzero]) / y_oof[nonzero])) * 100
print(f"Overall Validation MAPE: {mape:.2f}%")

# Analysis of model predictions
print("\nAnalyzing predictions...")
print("RF mean prediction:", np.mean(meta_features_test[:, 0]))
print("CatBoost mean prediction:", np.mean(meta_features_test[:, 1]))
print("Final stacking mean prediction:", np.mean(final_predictions))

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Dict, Any
import warnings
import joblib
from pathlib import Path

# Ray Tune imports
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

class MLPipelineRayTune:
    """Scale the ``f*`` feature columns and tune an ExtraTreesRegressor with Ray Tune.

    ``random_state`` seeds the estimators and the CV splitter, ``scaler`` is the
    StandardScaler fitted by ``load_and_preprocess_data``, ``model`` is the final
    fitted regressor (``None`` until ``train_ray_tune`` has run) and ``features``
    is the list of feature column names (``None`` until data has been loaded).
    """

    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.model = None
        self.features = None

    def load_and_preprocess_data(self, train_path: str, test_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Load both CSV files and standardize the feature columns.

        Every column of the training file whose name starts with ``'f'`` is
        treated as a feature; the target is read from the ``'target'`` column.
        The scaler is fitted on the training features only and then applied to
        the test features.

        Returns:
            ``(X_train_scaled, y_train, X_test_scaled)``.

        Raises:
            ValueError: if the training file has no column starting with ``'f'``.
        """
        # Columns f0..f299 are read as float32 when present
        dtype_dict = {f'f{i}': np.float32 for i in range(300)}

        train_df = pd.read_csv(train_path, dtype=dtype_dict)
        test_df = pd.read_csv(test_path, dtype=dtype_dict)

        self.features = [col for col in train_df.columns if col.startswith('f')]
        if not self.features:
            raise ValueError("No feature columns found")

        X_train = train_df[self.features].values
        y_train = train_df['target'].values
        X_test = test_df[self.features].values

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, y_train, X_test_scaled

    def train_ray_tune(self, X_train: np.ndarray, y_train: np.ndarray, num_samples: int = 50,
                       max_epochs: int = 100, cpus_per_trial: int = 4) -> Dict:
        """Train model using Ray Tune with ASHA scheduler and HyperOpt algorithm.

        Each trial scores one ExtraTreesRegressor configuration by its mean
        5-fold cross-validated R². The best configuration is then refitted on
        all of ``X_train`` and stored in ``self.model``.

        Args:
            X_train: training feature matrix.
            y_train: training targets.
            num_samples: number of configurations to try.
            max_epochs: ``max_t`` handed to the ASHA scheduler.
            cpus_per_trial: CPUs reserved for each trial; the trial's estimator
                uses the same number of workers so the reservation is not idle.

        Returns:
            The best configuration found, as returned by Ray Tune.
        """
        # Copy the seed into a local so the trial function does not capture
        # `self` (and with it the scaler and any fitted model).
        random_state = self.random_state

        def objective(config, X=None, y=None):
            from ray import train

            et = ExtraTreesRegressor(
                n_estimators=int(config["n_estimators"]),
                max_depth=int(config["max_depth"]),
                min_samples_split=int(config["min_samples_split"]),
                min_samples_leaf=int(config["min_samples_leaf"]),
                max_features=config["max_features"],
                random_state=random_state,
                n_jobs=cpus_per_trial
            )

            scores = cross_val_score(
                et, X, y,
                cv=KFold(n_splits=5, shuffle=True, random_state=random_state),
                scoring='r2',
                n_jobs=1
            )

            train.report({"mean_r2": scores.mean()})

        # Define search space
        search_space = {
            "n_estimators": tune.randint(50, 500),
            "max_depth": tune.randint(5, 50),
            "min_samples_split": tune.randint(2, 10),
            "min_samples_leaf": tune.randint(1, 5),
            "max_features": tune.choice(['sqrt', 'log2', None]),
        }

        # Initialize ASHA scheduler.
        # NOTE(review): the objective reports a single result per trial, so the
        # scheduler probably never reaches its first rung — confirm.
        asha_scheduler = ASHAScheduler(
            time_attr='training_iteration',
            metric="mean_r2",
            mode="max",
            max_t=max_epochs,
            grace_period=5,
            reduction_factor=3
        )

        # Initialize HyperOpt search algorithm
        hyperopt_search = HyperOptSearch(
            metric="mean_r2",
            mode="max"
        )

        # Run optimization. The arrays are passed through tune.with_parameters
        # rather than being captured in the objective's closure.
        analysis = tune.run(
            tune.with_parameters(objective, X=X_train, y=y_train),
            config=search_space,
            search_alg=hyperopt_search,
            scheduler=asha_scheduler,
            num_samples=num_samples,
            resources_per_trial={"cpu": cpus_per_trial}
        )

        best_config = analysis.get_best_config(metric="mean_r2", mode="max")

        # Train final model with best parameters; float values are cast to int,
        # except max_features which is passed through unchanged
        self.model = ExtraTreesRegressor(
            **{k: int(v) if isinstance(v, float) and k != 'max_features' else v
               for k, v in best_config.items()},
            random_state=self.random_state,
            n_jobs=-1
        )
        self.model.fit(X_train, y_train)

        return best_config

class MLPipelineHyperopt:
    """Scale the ``f*`` feature columns and tune an ExtraTreesRegressor with Hyperopt."""

    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.model = None
        self.features = None

    def load_and_preprocess_data(self, train_path: str, test_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Read both CSV files and return (scaled train features, target, scaled test features)."""
        # Columns f0..f299 are read as float32 when present
        float32_columns = {f'f{idx}': np.float32 for idx in range(300)}

        train_df = pd.read_csv(train_path, dtype=float32_columns)
        test_df = pd.read_csv(test_path, dtype=float32_columns)

        # Any training column whose name starts with 'f' counts as a feature
        self.features = [name for name in train_df.columns if name.startswith('f')]
        if len(self.features) == 0:
            raise ValueError("No feature columns found")

        raw_train = train_df[self.features].values
        y_train = train_df['target'].values
        raw_test = test_df[self.features].values

        # Fit the scaler on train only, then reuse it for test
        return self.scaler.fit_transform(raw_train), y_train, self.scaler.transform(raw_test)

    def train_hyperopt(self, X_train: np.ndarray, y_train: np.ndarray, max_evals: int = 100) -> Dict:
        """Search ExtraTreesRegressor settings with TPE, refit the best one, and return its parameters."""
        integer_params = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
        max_features_options = ['sqrt', 'log2', None]

        def objective(params):
            integer_kwargs = {name: int(params[name]) for name in integer_params}
            estimator = ExtraTreesRegressor(
                **integer_kwargs,
                max_features=params["max_features"],
                random_state=self.random_state,
                n_jobs=-1
            )
            splitter = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
            cv_scores = cross_val_score(estimator, X_train, y_train, cv=splitter, scoring='r2')

            # Hyperopt minimizes, so the mean R² is negated
            return {'loss': -cv_scores.mean(), 'status': STATUS_OK}

        def quantized_int(label, low, high, step):
            return scope.int(hp.quniform(label, low, high, step))

        space = {
            'n_estimators': quantized_int('n_estimators', 100, 1000, 10),
            'max_depth': quantized_int('max_depth', 10, 100, 1),
            'min_samples_split': quantized_int('min_samples_split', 2, 20, 1),
            'min_samples_leaf': quantized_int('min_samples_leaf', 1, 10, 1),
            'max_features': hp.choice('max_features', max_features_options)
        }

        trials = Trials()
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            show_progressbar=True
        )

        # fmin reports the hp.choice result as an index into the option list
        best_params = {name: int(best[name]) for name in integer_params}
        best_params['max_features'] = max_features_options[best['max_features']]

        final_model = ExtraTreesRegressor(
            **best_params,
            random_state=self.random_state,
            n_jobs=-1
        )
        final_model.fit(X_train, y_train)
        self.model = final_model

        return best_params

def main():
    """Run the Ray Tune pipeline, then the Hyperopt pipeline, and return both.

    Any exception is reported on stdout and then re-raised.
    """
    warnings.filterwarnings('ignore', category=UserWarning)

    train_csv = '/kaggle/input/kaggle/train.csv'
    test_csv = '/kaggle/input/kaggle/test.csv'

    try:
        # Ray Tune version
        ray_pipeline = MLPipelineRayTune()
        X_train, y_train, _ = ray_pipeline.load_and_preprocess_data(train_csv, test_csv)
        best_params_ray = ray_pipeline.train_ray_tune(X_train, y_train)
        print("Ray Tune best parameters:", best_params_ray)

        # Hyperopt version (loads and scales the data again with its own scaler)
        hyperopt_pipeline = MLPipelineHyperopt()
        X_train, y_train, _ = hyperopt_pipeline.load_and_preprocess_data(train_csv, test_csv)
        best_params_hyperopt = hyperopt_pipeline.train_hyperopt(X_train, y_train)
        print("Hyperopt best parameters:", best_params_hyperopt)
    except Exception as e:
        print(f"An error occurred: {e!s}")
        raise
    else:
        return ray_pipeline, hyperopt_pipeline

if __name__ == "__main__":
    ray_pipeline, hyperopt_pipeline = main()

# Analysing the data

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler ,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
print("Packages Imported ")

In [None]:
# Read the training rows, the test rows and the sample submission, in that order.
train_data, test_data, sample_sub = map(
    pd.read_csv,
    (
        '/kaggle/input/noicef/train.csv',
        '/kaggle/input/noicef/test.csv',
        '/kaggle/input/noicef/sample_submission.csv',
    ),
)
print("Data imported")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# One-line overview of the training frame: its shape and the total NaN count.
n_rows, n_cols = train_data.shape
n_missing = sum(train_data.isna().sum())
print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
# Summary statistics of the training data, shaded with the 'coolwarm' colour map.
train_data.describe().style.background_gradient(cmap='coolwarm')

In [None]:
# Per-feature variance, listed from the least to the most variable feature.
# Only numeric columns are used: since pandas 2.0 `DataFrame.var()` no longer
# silently skips non-numeric columns and raises instead.
df_var = train_data.select_dtypes(include='number').var().reset_index()
df_var.columns = ['feature', 'variation']
df_var.sort_values("variation", ascending=True)

In [None]:
# Pearson correlation matrix between all numeric columns.
# Restricting to numeric columns keeps this working on pandas >= 2.0, where
# `DataFrame.corr()` raises on non-numeric columns instead of dropping them.
corrMatrix = train_data.select_dtypes(include='number').corr(method='pearson', min_periods=1)
corrMatrix

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# Non-numeric columns are excluded up front because pandas >= 2.0 raises on them.
cor_targ = (
    train_data.select_dtypes(include='number')
    .corrwith(train_data["target"])
    .reset_index()
)
cor_targ.columns = ['feature', 'CorrelatioWithTarget']
cor_targ.sort_values('CorrelatioWithTarget', ascending=False)

# Using simple stacking of models

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv('/kaggle/input/noicef/train.csv')
test = pd.read_csv('/kaggle/input/noicef/test.csv')

# Separate features and target
X = train.drop(['target'], axis=1)
y = train['target']
test_features = test.drop('id', axis=1)

# Define base models
extra_trees = ExtraTreesRegressor(n_estimators=300, random_state=42)
catboost = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=8, verbose=0, random_state=42)

# Define meta-model
meta_model = Ridge(alpha=1.0)

# Create a stacking regressor
stacking_regressor = StackingRegressor(
    estimators=[('extra_trees', extra_trees), ('catboost', catboost)],
    final_estimator=meta_model,
    cv=20
)

# Hold out 20% of the rows so the validation score is measured on unseen data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Validation pass: fit on the training split ONLY, then score on the held-out
# split. Scoring a model that was fitted on all of X against X_val would report
# a training score, because X_val is a subset of X.
stacking_regressor.fit(X_train, y_train)
val_predictions = stacking_regressor.predict(X_val)
print(f"Validation R2 Score: {r2_score(y_val, val_predictions)}")

# Final pass: refit on the full dataset before predicting the test set
# (calling fit again discards the validation-pass fit).
stacking_regressor.fit(X, y)

# Make predictions on test set
test_predictions = stacking_regressor.predict(test_features)

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    'target': test_predictions
})

# Save submission file
submission.to_csv('subbbbbmissssion1.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# Read the competition files
train = pd.read_csv('/kaggle/input/noicef/train.csv')
test = pd.read_csv('/kaggle/input/noicef/test.csv')

# Features are every training column except the target; the test frame
# additionally carries an 'id' column that is not a feature.
X = train.drop(columns=['target'])
y = train['target']
test_features = test.drop(columns=['id'])

# Base learners
extra_trees = ExtraTreesRegressor(n_estimators=10000, random_state=42)
catboost = CatBoostRegressor(iterations=20000, learning_rate=0.1, depth=8, verbose=0, random_state=42)

# RBF-kernel SVR combines the base learners' predictions
svr_meta = SVR(kernel='rbf', C=10.0, epsilon=0.1)

try:
    # Stack the two base learners under the SVR meta-model (10-fold CV)
    stacking_regressor = StackingRegressor(
        estimators=[
            ('extra_trees', extra_trees),
            ('catboost', catboost),
        ],
        final_estimator=svr_meta,
        cv=10,
    )
    stacking_regressor.fit(X, y)

    # In-sample score: these predictions are for the same rows used for fitting
    predictions = stacking_regressor.predict(X)
    full_score = r2_score(y, predictions)
    print(f"SVR Meta-model - Full Dataset R2 Score: {full_score:.4f}")

    # Predict the test set and write the submission file
    test_predictions = stacking_regressor.predict(test_features)
    submission = pd.DataFrame({'id': test['id'], 'target': test_predictions})
    submission.to_csv('submission_svr_meta7.csv', index=False)
    print("\nSubmission file created successfully")

except Exception as e:
    print(f"Error: {str(e)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv('/kaggle/input/noicef/train.csv')
test = pd.read_csv('/kaggle/input/noicef/test.csv')

# Separate features and target
X = train.drop(['target'], axis=1)
y = train['target']
test_features = test.drop('id', axis=1)

# Define base models
et_model = ExtraTreesRegressor(n_estimators=3000, random_state=42)
xgb_model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
catboost_model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.1,
    depth=8,
    verbose=0,
    random_state=42
)
gbr_model = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
# KNN is distance-based, so it is the only base model that needs standardised
# features. Putting the scaler inside a pipeline means it is re-fitted on the
# training part of every CV fold (no statistics leak from held-out rows), and
# the tree-based models receive the original, unscaled features.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(
        n_neighbors=5,
        weights='distance',
        metric='minkowski',
        p=2  # Euclidean distance
    )
)

# Define meta-model (final estimator)
meta_model = SVR(kernel='rbf', C=10.0, epsilon=0.1)

try:
    # Create stacking regressor
    stacking_model = StackingRegressor(
        estimators=[
            ('et', et_model),
            ('xgb', xgb_model),
            ('catboost', catboost_model),
            ('gbr', gbr_model),
            ('knn', knn_model)
        ],
        final_estimator=meta_model,
        cv=6
    )

    # Train the stacking model on the raw features; scaling happens inside
    # the KNN pipeline only
    stacking_model.fit(X, y)

    # Make predictions on the entire dataset (in-sample, i.e. a training score)
    predictions = stacking_model.predict(X)

    # Calculate R2 score on the entire dataset
    full_score = r2_score(y, predictions)
    print(f"Stacking Model - Full Dataset R2 Score: {full_score:.4f}")

    # Generate predictions for test set
    test_predictions = stacking_model.predict(test_features)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': test['id'],
        'target': test_predictions
    })

    # Save submission file
    submission.to_csv('submission_stacking_with_knn.csv', index=False)
    print("\nSubmission file created successfully")

except Exception as e:
    print(f"Error: {str(e)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# Read the competition files
train = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/train.csv')
test = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/test.csv')

# Features are every training column except the target; 'id' is dropped from test
X = train.drop(columns=['target'])
y = train['target']
test_features = test.drop(columns=['id'])

# XGBoost and sklearn gradient boosting are configured identically
boosting_params = dict(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# Base learners
et_model = ExtraTreesRegressor(n_estimators=2000, random_state=42)
xgb_model = XGBRegressor(**boosting_params)
catboost_model = CatBoostRegressor(
    iterations=5000, learning_rate=0.1, depth=9, verbose=0, random_state=42
)
gbr_model = GradientBoostingRegressor(**boosting_params)

# Meta-model (final estimator) that combines the base learners' predictions
meta_model = SVR(kernel='rbf', C=10.0, epsilon=0.1)

try:
    # Assemble and fit the stack
    stacking_model = StackingRegressor(
        estimators=[
            ('et', et_model),
            ('xgb', xgb_model),
            ('catboost', catboost_model),
            ('gbr', gbr_model),
        ],
        final_estimator=meta_model,
        cv=7,  # As shown in the image
    )
    stacking_model.fit(X, y)

    # In-sample score: predictions are for the same rows used for fitting
    predictions = stacking_model.predict(X)
    full_score = r2_score(y, predictions)
    print(f"Stacking Model - Full Dataset R2 Score: {full_score:.4f}")

    # Predict the test set and write the submission file
    test_predictions = stacking_model.predict(test_features)
    submission = pd.DataFrame({'id': test['id'], 'target': test_predictions})
    submission.to_csv('submission_stacking1.csv', index=False)
    print("\nSubmission file created successfully")

except Exception as e:
    print(f"Error: {str(e)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/train.csv')
test = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/test.csv')

# Separate features and target
X = train.drop(['target'], axis=1)
y = train['target']
test_features = test.drop('id', axis=1)

# Define base models
et_model = ExtraTreesRegressor(n_estimators=3000, random_state=42)
xgb_model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
catboost_model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.1,
    depth=8,
    verbose=0,
    random_state=42
)
gbr_model = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
# Only KNN needs standardised features. Keeping the scaler inside a pipeline
# re-fits it on the training part of every CV fold, so no statistics from the
# held-out rows leak into the out-of-fold predictions; the tree-based models
# are trained on the original features.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(
        n_neighbors=5,
        weights='distance',
        metric='minkowski',
        p=2
    )
)
lgbm_model = LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    num_leaves=31,
    boosting_type='gbdt',
    random_state=42,
    verbose=-1
)

# Define meta-model (final estimator)
meta_model = SVR(kernel='rbf', C=10.0, epsilon=0.1)

try:
    # Create stacking regressor
    stacking_model = StackingRegressor(
        estimators=[
            ('et', et_model),
            ('xgb', xgb_model),
            ('catboost', catboost_model),
            ('gbr', gbr_model),
            ('knn', knn_model),
            ('lgbm', lgbm_model)
        ],
        final_estimator=meta_model,
        cv=6
    )

    # Train the stacking model on the raw features; scaling happens inside
    # the KNN pipeline only
    stacking_model.fit(X, y)

    # Make predictions on the entire dataset (in-sample, i.e. a training score)
    predictions = stacking_model.predict(X)

    # Calculate R2 score on the entire dataset
    full_score = r2_score(y, predictions)
    print(f"Stacking Model - Full Dataset R2 Score: {full_score:.4f}")

    # Generate predictions for test set
    test_predictions = stacking_model.predict(test_features)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': test['id'],
        'target': test_predictions
    })

    # Save submission file
    submission.to_csv('submission_stacking_with_knn_lgbm.csv', index=False)
    print("\nSubmission file created successfully")

except Exception as e:
    print(f"Error: {str(e)}")

# Simple stacking of models, comparing several meta-models to pick the best

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
import xgboost as xgb

# Read the competition files
train = pd.read_csv('/kaggle/input/noicef/train.csv')
test = pd.read_csv('/kaggle/input/noicef/test.csv')

# Features are every training column except the target; 'id' is dropped from test
X = train.drop(columns=['target'])
y = train['target']
test_features = test.drop(columns=['id'])

# Base learners shared by every candidate stack
extra_trees = ExtraTreesRegressor(n_estimators=300, random_state=42)
catboost = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=8, verbose=0, random_state=42)

# Candidate meta-models, keyed by the label used in the printed report
meta_models = {
    'ridge': Ridge(alpha=1.0),
    'lasso': Lasso(alpha=0.01),
    'elastic_net': ElasticNet(alpha=0.01, l1_ratio=0.5),
    'svr': SVR(kernel='rbf', C=1.0, epsilon=0.1),
    'xgboost': xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
    ),
}

# 80/20 split: candidates are fitted on the first part and scored on the second
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# name -> {'r2_score': validation R2, 'model': fitted stack}
results = {}

for name, meta_model in meta_models.items():
    # Same base learners every time; only the final estimator changes
    stacking_regressor = StackingRegressor(
        estimators=[('extra_trees', extra_trees), ('catboost', catboost)],
        final_estimator=meta_model,
        cv=10,
    )
    stacking_regressor.fit(X_train, y_train)

    # Score on the held-out split
    val_predictions = stacking_regressor.predict(X_val)
    val_score = r2_score(y_val, val_predictions)

    results[name] = {'r2_score': val_score, 'model': stacking_regressor}
    print(f"{name.upper()} - Validation R2 Score: {val_score:.4f}")

# The winner is the candidate with the highest validation R2
best_model_name = max(results, key=lambda candidate: results[candidate]['r2_score'])
best_model = results[best_model_name]['model']
print(f"\nBest performing meta-model: {best_model_name}")

# Rebuild the winning stack and fit it on all labelled rows
best_stacking_regressor = StackingRegressor(
    estimators=[('extra_trees', extra_trees), ('catboost', catboost)],
    final_estimator=meta_models[best_model_name],
    cv=10,
)
best_stacking_regressor.fit(X, y)

# Predict the test set and write the submission file
test_predictions = best_stacking_regressor.predict(test_features)
submission = pd.DataFrame({'id': test['id'], 'target': test_predictions})
submission.to_csv('submission_best_meta.csv', index=False)

# Simple stacking of models with multiple meta-models

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    StackingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/train.csv')
test = pd.read_csv('/kaggle/input/new-data/ucs-654-kaggle-hack-lab-exam-1/test.csv')

# Separate features and target
X = train.drop(['target'], axis=1)
y = train['target']
test_features = test.drop('id', axis=1)

# Define base models
et_model = ExtraTreesRegressor(n_estimators=3000, random_state=42)
xgb_model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
catboost_model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.1,
    depth=8,
    verbose=0,
    random_state=42
)
gbr_model = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    random_state=42
)
# Only KNN needs standardised features. Keeping the scaler inside a pipeline
# re-fits it on the training part of every CV fold, so no statistics from the
# held-out rows leak into the out-of-fold predictions; the tree-based models
# are trained on the original features.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(
        n_neighbors=5,
        weights='distance',
        metric='minkowski',
        p=2
    )
)
lgbm_model = LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=8,
    num_leaves=31,
    boosting_type='gbdt',
    random_state=42,
    verbose=-1
)
rf_model = RandomForestRegressor(
    n_estimators=3000,
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

# The same base learners are used under every meta-model
base_estimators = [
    ('et', et_model),
    ('xgb', xgb_model),
    ('catboost', catboost_model),
    ('gbr', gbr_model),
    ('knn', knn_model),
    ('lgbm', lgbm_model),
    ('rf', rf_model)
]

# Define meta-models (trying both XGBoost and LightGBM as meta-learners)
meta_model_xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

meta_model_lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    num_leaves=15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# Try both meta-models; a failure with one is reported and does not stop the other
for meta_model, model_name in [(meta_model_xgb, 'xgb'), (meta_model_lgbm, 'lgbm')]:
    try:
        # Create stacking regressor (a fresh list so stacks do not share state)
        stacking_model = StackingRegressor(
            estimators=list(base_estimators),
            final_estimator=meta_model,
            cv=6
        )

        # Train the stacking model on the raw features; scaling happens
        # inside the KNN pipeline only
        stacking_model.fit(X, y)

        # Make predictions on the entire dataset (in-sample, i.e. a training score)
        predictions = stacking_model.predict(X)

        # Calculate R2 score on the entire dataset
        full_score = r2_score(y, predictions)
        print(f"Stacking Model with {model_name.upper()} meta-learner - Full Dataset R2 Score: {full_score:.4f}")

        # Generate predictions for test set
        test_predictions = stacking_model.predict(test_features)

        # Create submission DataFrame
        submission = pd.DataFrame({
            'id': test['id'],
            'target': test_predictions
        })

        # Save submission file
        submission.to_csv(f'submission_stacking_meta_{model_name}.csv', index=False)
        print(f"\nSubmission file for {model_name.upper()} meta-learner created successfully")

    except Exception as e:
        print(f"Error with {model_name} meta-learner: {str(e)}")

# Using H2O AutoML instead of PyCaret to find models

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O
h2o.init()

try:
    # Convert data to H2OFrame.
    # NOTE: `train` and `test` are not loaded in this cell; they must already
    # exist as pandas DataFrames from a previously executed cell.
    train_h2o = h2o.H2OFrame(train)
    test_h2o = h2o.H2OFrame(test)

    # Train AutoML on the six feature columns.
    # NOTE(review): confirm that "R2" is an accepted `sort_metric` value for the
    # installed H2O version; the documented regression options are
    # deviance / RMSE / MSE / MAE / RMSLE.
    aml = H2OAutoML(max_models=20, seed=1, sort_metric="R2")
    aml.train(x=["f1", "f2", "f3", "f4", "f5", "f6"], y="target", training_frame=train_h2o)

    # Leaderboard
    lb = aml.leaderboard
    print(lb)

    # Best model
    best_model = aml.leader
    predictions = best_model.predict(test_h2o)

    # Attach the predictions to the test frame. The first column of the H2O
    # prediction frame is taken as a plain array so the assignment is
    # positional (row i of the predictions -> row i of `test`).
    test["target"] = h2o.as_list(predictions, use_pandas=True).iloc[:, 0].to_numpy()

    # Save predictions in the same id/target layout as the other submissions
    test[["id", "target"]].to_csv('submission_h2o_automl.csv', index=False)

finally:
    # Shutdown H2O even if training or prediction failed, so the cluster does
    # not keep running. `h2o.shutdown()` is deprecated in favour of
    # `h2o.cluster().shutdown()`.
    h2o.cluster().shutdown(prompt=False)
