In [2]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import optuna

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])

# Define Optuna objective function with selected models
def objective(trial):
    model_type = trial.suggest_categorical('model_type', ['lasso', 'elasticnet', 'svr'])
    
    if model_type == 'lasso':
        alpha = trial.suggest_loguniform('alpha', 1e-2, 1.0)
        model = Lasso(alpha=alpha, max_iter=1000)
    elif model_type == 'elasticnet':
        alpha = trial.suggest_loguniform('alpha', 1e-2, 1.0)
        l1_ratio = trial.suggest_uniform('l1_ratio', 0.1, 0.9)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=1000)
    elif model_type == 'svr':
        C = trial.suggest_loguniform('C', 0.1, 10.0)
        epsilon = trial.suggest_loguniform('epsilon', 0.01, 0.5)
        model = SVR(C=C, epsilon=epsilon)
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ('model', model)
    ])
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    
    return mae

# Optimize model using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Retrieve the best parameters and model
best_params = study.best_params
model_type = best_params.pop('model_type')

if model_type == 'lasso':
    final_model = Lasso(**best_params, max_iter=1000)
elif model_type == 'elasticnet':
    final_model = ElasticNet(**best_params, max_iter=1000)
elif model_type == 'svr':
    final_model = SVR(**best_params)

# Define final pipeline with best model
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('model', final_model)
])

# Fit the final model pipeline on the entire training data
final_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Prepare test data and make predictions
logger.info("Test dataset prepared.")
y_test_pred = final_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})

# Save submission
submission.to_csv('submission_advanced_regression.csv', index=False)
logger.info("Submission file saved as 'submission_advanced_regression.csv'")


[I 2024-11-09 17:15:37,164] A new study created in memory with name: no-name-158f206e-3a4d-4bad-a7ac-a537985b4947
  alpha = trial.suggest_loguniform('alpha', 1e-2, 1.0)
  l1_ratio = trial.suggest_uniform('l1_ratio', 0.1, 0.9)
  model = cd_fast.enet_coordinate_descent(
[I 2024-11-09 17:15:43,401] Trial 0 finished with value: 284.15348983289914 and parameters: {'model_type': 'elasticnet', 'alpha': 0.11102637572855162, 'l1_ratio': 0.1253897537140837}. Best is trial 0 with value: 284.15348983289914.
  alpha = trial.suggest_loguniform('alpha', 1e-2, 1.0)
  model = cd_fast.enet_coordinate_descent(
[I 2024-11-09 17:15:46,714] Trial 1 finished with value: 271.86082591283906 and parameters: {'model_type': 'lasso', 'alpha': 0.10295482320782544}. Best is trial 1 with value: 271.86082591283906.
  alpha = trial.suggest_loguniform('alpha', 1e-2, 1.0)
  l1_ratio = trial.suggest_uniform('l1_ratio', 0.1, 0.9)
[I 2024-11-09 17:15:47,236] Trial 2 finished with value: 340.58426750301214 and parameters: {'

In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import optuna

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])

# Feature Engineering: Selecting specific interaction terms for key features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X[['clonesize', 'honeybee', 'bumbles', 'fruitmass']])
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out())
X = pd.concat([X, X_poly_df], axis=1)

# Define Optuna objective function with advanced tuning
def objective(trial):
    model_type = trial.suggest_categorical('model_type', ['lasso', 'elasticnet', 'svr'])
    
    if model_type == 'lasso':
        alpha = trial.suggest_float('alpha', 0.01, 0.1)
        model = Lasso(alpha=alpha, max_iter=1000)
    elif model_type == 'elasticnet':
        alpha = trial.suggest_float('alpha', 0.01, 0.1)
        l1_ratio = trial.suggest_float('l1_ratio', 0.3, 0.8)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=1000)
    elif model_type == 'svr':
        C = trial.suggest_float('C', 0.5, 5.0)
        epsilon = trial.suggest_float('epsilon', 0.05, 0.2)
        model = SVR(C=C, epsilon=epsilon)
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ('model', model)
    ])
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    
    return mae

# Optimize model using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)  # Reduced trials for faster computation

# Retrieve the best parameters and model
best_params = study.best_params
model_type = best_params.pop('model_type')

if model_type == 'lasso':
    final_model = Lasso(**best_params, max_iter=1000)
elif model_type == 'elasticnet':
    final_model = ElasticNet(**best_params, max_iter=1000)
elif model_type == 'svr':
    final_model = SVR(**best_params)

# Define final pipeline with best model and selective polynomial features
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('model', final_model)
])

# Fit the final model pipeline on the entire training data
final_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Prepare test data and make predictions
X_test_poly = poly.transform(X_test[['clonesize', 'honeybee', 'bumbles', 'fruitmass']])
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out())
X_test = pd.concat([X_test, X_test_poly_df], axis=1)
y_test_pred = final_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})

# Save submission
submission.to_csv('submission_advanced_regression11.csv', index=False)
logger.info("Submission file saved as 'submission_advanced_regression11.csv'")


[I 2024-11-09 17:29:47,601] A new study created in memory with name: no-name-63b37b0f-3bc0-457c-a5db-ef9dc8f837bf
[I 2024-11-09 17:31:00,122] Trial 0 finished with value: 1105.8660131143704 and parameters: {'model_type': 'svr', 'C': 1.2566215325686747, 'epsilon': 0.18190381056131827}. Best is trial 0 with value: 1105.8660131143704.
  model = cd_fast.enet_coordinate_descent(
[I 2024-11-09 17:31:15,423] Trial 1 finished with value: 269.98723839335753 and parameters: {'model_type': 'lasso', 'alpha': 0.07579873707856588}. Best is trial 1 with value: 269.98723839335753.
  model = cd_fast.enet_coordinate_descent(
[I 2024-11-09 17:31:29,388] Trial 2 finished with value: 270.3727172600074 and parameters: {'model_type': 'elasticnet', 'alpha': 0.029771213962028434, 'l1_ratio': 0.47490683621639107}. Best is trial 1 with value: 269.98723839335753.
  model = cd_fast.enet_coordinate_descent(
[I 2024-11-09 17:31:50,041] Trial 3 finished with value: 269.5257397623815 and parameters: {'model_type': 'el