In [2]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
tree_model = DecisionTreeRegressor(random_state=42)
linear_model = LinearRegression()

# Define the pipeline including polynomial features
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', tree_model)
])

# Hyperparameters for RandomizedSearchCV
tree_params = {
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2', None]
}

# RandomizedSearchCV for decision tree model with polynomial features
search = RandomizedSearchCV(
    pipeline,
    param_distributions=tree_params,
    scoring='neg_mean_absolute_error',
    n_iter=20,  # Increased iterations for better search
    cv=3,
    random_state=42
)
search.fit(X_train, y_train)

# Evaluate on validation set
best_pipeline = search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_val_pred)
logger.info(f"Decision Tree MAE on validation set: {mae:.2f}")

# If MAE target is not met, switch to Linear Regression
if mae > 220:
    logger.info("Switching to Linear Regression as MAE target was not met.")
    linear_model_pipeline = Pipeline([
        ('poly_features', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
        ('scaler', StandardScaler()),
        ('model', linear_model)
    ])
    linear_model_pipeline.fit(X_train, y_train)
    y_val_pred = linear_model_pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, y_val_pred)
    logger.info(f"Linear Regression MAE on validation set: {mae:.2f}")
    best_pipeline = linear_model_pipeline if mae <= 220 else search.best_estimator_

# Retrain best model on full training data
best_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Predictions on the test set
y_test_pred = best_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})
submission.to_csv('submission_simplified_regression.csv', index=False)
logger.info("Submission file saved as 'submission_simplified_regression.csv'")


12 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step

In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import optuna

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])

# Feature Engineering: Adding selective polynomial features
selected_features = ['clonesize', 'honeybee', 'bumbles', 'fruitmass']
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X[selected_features])
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(selected_features))
X = pd.concat([X.reset_index(drop=True), X_poly_df], axis=1)
X_test_poly = poly.transform(X_test[selected_features])
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out(selected_features))
X_test = pd.concat([X_test.reset_index(drop=True), X_test_poly_df], axis=1)

# Split the data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
        'random_state': 42
    }
    
    model = GradientBoostingRegressor(**params)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    return mae

# Optimize model using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)  # Limited trials for efficiency

# Retrieve the best parameters and model
best_params = study.best_params
best_params['random_state'] = 42
logger.info(f"Best parameters: {best_params}")

# Train the final model with the best parameters
final_model = GradientBoostingRegressor(**best_params)
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', final_model)
])

# Fit the final model pipeline on the entire training data
final_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Make predictions on the test set
y_test_pred = final_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})

# Save submission
submission.to_csv('submission_advanced_dt_regression.csv', index=False)
logger.info("Submission file saved as 'submission_advanced_dt_regression.csv'")


In [4]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
import optuna

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])

# Feature Engineering: Adding more polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))
X = pd.concat([X.reset_index(drop=True), X_poly_df], axis=1)
X_test_poly = poly.transform(X_test)
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out(X_test.columns))
X_test = pd.concat([X_test.reset_index(drop=True), X_test_poly_df], axis=1)

# Define Optuna objective function
def objective(trial):
    # Define hyperparameters for each model
    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 200, 1000),
        'max_depth': trial.suggest_int('rf_max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('rf_bootstrap', [True, False]),
        'random_state': 42,
        'n_jobs': -1
    }
    et_params = {
        'n_estimators': trial.suggest_int('et_n_estimators', 200, 1000),
        'max_depth': trial.suggest_int('et_max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('et_min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('et_min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('et_max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
        'n_jobs': -1
    }
    svr_params = {
        'C': trial.suggest_float('svr_C', 0.1, 100.0, log=True),
        'epsilon': trial.suggest_float('svr_epsilon', 0.01, 1.0, log=True),
        'kernel': 'rbf'
    }
    enet_params = {
        'alpha': trial.suggest_float('enet_alpha', 1e-4, 1.0, log=True),
        'l1_ratio': trial.suggest_float('enet_l1_ratio', 0.1, 0.9),
        'max_iter': 10000
    }
    # Define base estimators
    estimators = [
        ('rf', RandomForestRegressor(**rf_params)),
        ('et', ExtraTreesRegressor(**et_params)),
        ('svr', SVR(**svr_params)),
        ('enet', ElasticNet(**enet_params))
    ]
    # Define stacking regressor
    stacking_model = StackingRegressor(
        estimators=estimators,
        final_estimator=ElasticNet(max_iter=10000),
        n_jobs=-1,
        passthrough=True
    )
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', stacking_model)
    ])
    # Cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=make_scorer(mean_absolute_error), n_jobs=-1)
    return np.mean(mae_scores)

# Optimize model using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Increased trials for better optimization

# Retrieve the best parameters and model
best_trial = study.best_trial
logger.info(f"Best trial params: {best_trial.params}")
logger.info(f"Best MAE: {best_trial.value:.2f}")

# Extract the best hyperparameters
rf_best_params = {k.replace('rf_', ''): v for k, v in best_trial.params.items() if k.startswith('rf_')}
et_best_params = {k.replace('et_', ''): v for k, v in best_trial.params.items() if k.startswith('et_')}
svr_best_params = {k.replace('svr_', ''): v for k, v in best_trial.params.items() if k.startswith('svr_')}
enet_best_params = {k.replace('enet_', ''): v for k, v in best_trial.params.items() if k.startswith('enet_')}

# Define the final estimators with best params
rf_best_params['random_state'] = 42
rf_best_params['n_jobs'] = -1
et_best_params['random_state'] = 42
et_best_params['n_jobs'] = -1
svr_best_params['kernel'] = 'rbf'
enet_best_params['max_iter'] = 10000

estimators = [
    ('rf', RandomForestRegressor(**rf_best_params)),
    ('et', ExtraTreesRegressor(**et_best_params)),
    ('svr', SVR(**svr_best_params)),
    ('enet', ElasticNet(**enet_best_params))
]

# Define the final stacking regressor
final_model = StackingRegressor(
    estimators=estimators,
    final_estimator=ElasticNet(max_iter=10000),
    n_jobs=-1,
    passthrough=True
)
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', final_model)
])

# Fit the final model pipeline on the entire training data
final_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Make predictions on the test set
y_test_pred = final_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})

# Save submission
submission.to_csv('submission_stacking_regression.csv', index=False)
logger.info("Submission file saved as 'submission_stacking_regression.csv'")


[I 2024-11-09 17:40:05,729] A new study created in memory with name: no-name-7b19c7e8-b47d-48bc-a53d-af69f77c80b1
[W 2024-11-09 18:00:57,919] Trial 0 failed with parameters: {'rf_n_estimators': 697, 'rf_max_depth': 43, 'rf_min_samples_split': 5, 'rf_min_samples_leaf': 2, 'rf_max_features': 'sqrt', 'rf_bootstrap': True, 'et_n_estimators': 796, 'et_max_depth': 29, 'et_min_samples_split': 13, 'et_min_samples_leaf': 1, 'et_max_features': 'log2', 'svr_C': 15.615815909493078, 'svr_epsilon': 0.24631264958471535, 'enet_alpha': 0.02496919331080073, 'enet_l1_ratio': 0.5339012783931452} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Temp\ipykernel_3564\1981592148.py", line 87, in objective
    mae_scores = cross_val_score(pi