In [9]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X = train_data.drop(columns=['id', 'yield'])
y = train_data['yield']
X_test = test_data.drop(columns=['id'])


In [10]:

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeRegressor(random_state=42)
linear_model = LinearRegression()

pipeline = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', tree_model)
])

tree_params = {
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2', None]
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=tree_params,
    scoring='neg_mean_absolute_error',
    n_iter=20, 
    cv=3,
    random_state=42
)
search.fit(X_train, y_train)

best_pipeline = search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_val_pred)
logger.info(f"Decision Tree MAE on validation set: {mae:.2f}")

if mae > 220:
    logger.info("Switching to Linear Regression as MAE target was not met.")
    linear_model_pipeline = Pipeline([
        ('poly_features', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
        ('scaler', StandardScaler()),
        ('model', linear_model)
    ])
    linear_model_pipeline.fit(X_train, y_train)
    y_val_pred = linear_model_pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, y_val_pred)
    logger.info(f"Linear Regression MAE on validation set: {mae:.2f}")
    best_pipeline = linear_model_pipeline if mae <= 220 else search.best_estimator_

# Retrain best model on full training data
best_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# Predictions on the test set
y_test_pred = best_pipeline.predict(X_test)
logger.info("Predictions made on the test set.")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})
submission.to_csv('submission_simplified_regression.csv', index=False)
logger.info("Submission file saved as 'submission_simplified_regression.csv'")


12 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step

In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
import optuna

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

target_column = 'yield'
X = train_data.drop(columns=['id', target_column])
y = train_data[target_column]
X_test = test_data.drop(columns=['id'])

if 'seeds' in X.columns and 'fruitmass' in X.columns:
    X['fruit_density'] = X['seeds'] / (X['fruitmass'] + 1e-3)
    X_test['fruit_density'] = X_test['seeds'] / (X_test['fruitmass'] + 1e-3)

if 'clonesize' in X.columns and 'seeds' in X.columns:
    X['seeds_clonesize_interaction'] = X['seeds'] * X['clonesize']
    X_test['seeds_clonesize_interaction'] = X_test['seeds'] * X_test['clonesize']

for col in ['fruitmass', 'seeds', 'RainingDays']:
    if col in X.columns:
        X[f'log_{col}'] = np.log1p(X[col])
        X_test[f'log_{col}'] = np.log1p(X_test[col])

poly_features = PolynomialFeatures(degree=2, include_bias=False)
important_features = ['fruit_density', 'seeds_clonesize_interaction']
X_poly = poly_features.fit_transform(X[important_features])
X_poly_df = pd.DataFrame(X_poly, columns=poly_features.get_feature_names_out(important_features))
X = pd.concat([X.reset_index(drop=True), X_poly_df], axis=1)

X_test_poly = poly_features.transform(X_test[important_features])
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_features.get_feature_names_out(important_features))
X_test = pd.concat([X_test.reset_index(drop=True), X_test_poly_df], axis=1)

def objective(trial):
    enet_alpha = trial.suggest_float('enet_alpha', 1e-4, 1.0, log=True)
    enet_l1_ratio = trial.suggest_float('enet_l1_ratio', 0.1, 0.9)
    
    svr_C = trial.suggest_float('svr_C', 0.1, 100.0, log=True)
    svr_epsilon = trial.suggest_float('svr_epsilon', 0.01, 1.0, log=True)
    
    knn_neighbors = trial.suggest_int('knn_neighbors', 5, 20)

    dt_max_depth = trial.suggest_int('dt_max_depth', 5, 30)
    dt_min_samples_split = trial.suggest_int('dt_min_samples_split', 2, 10)
    
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 200)
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 50)

    estimators = [
        ('ridge', Ridge(alpha=1.0)),
        ('lasso', Lasso(alpha=0.1)),
        ('enet', ElasticNet(alpha=enet_alpha, l1_ratio=enet_l1_ratio, max_iter=10000)),
        ('svr', SVR(C=svr_C, epsilon=svr_epsilon, kernel='rbf')),
        ('knn', KNeighborsRegressor(n_neighbors=knn_neighbors)),
        ('dt', DecisionTreeRegressor(max_depth=dt_max_depth, min_samples_split=dt_min_samples_split)),
        ('rf', RandomForestRegressor(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=42))
    ]
    
    voting_model = VotingRegressor(estimators=estimators)
    
    pipeline = Pipeline([
        ('scaler', RobustScaler()),  
        ('model', voting_model)
    ])
    
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    mae_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=make_scorer(mean_absolute_error), n_jobs=-1)
    return np.mean(mae_scores)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
print(f"Best trial params: {best_trial.params}")
print(f"Best MAE: {best_trial.value:.2f}")


enet_alpha = best_trial.params['enet_alpha']
enet_l1_ratio = best_trial.params['enet_l1_ratio']
svr_C = best_trial.params['svr_C']
svr_epsilon = best_trial.params['svr_epsilon']
knn_neighbors = best_trial.params['knn_neighbors']
dt_max_depth = best_trial.params['dt_max_depth']
dt_min_samples_split = best_trial.params['dt_min_samples_split']
rf_n_estimators = best_trial.params['rf_n_estimators']
rf_max_depth = best_trial.params['rf_max_depth']


estimators = [
    ('ridge', Ridge(alpha=1.0)),
    ('lasso', Lasso(alpha=0.1)),
    ('enet', ElasticNet(alpha=enet_alpha, l1_ratio=enet_l1_ratio, max_iter=10000)),
    ('svr', SVR(C=svr_C, epsilon=svr_epsilon, kernel='rbf')),
    ('knn', KNeighborsRegressor(n_neighbors=knn_neighbors)),
    ('dt', DecisionTreeRegressor(max_depth=dt_max_depth, min_samples_split=dt_min_samples_split)),
    ('rf', RandomForestRegressor(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=42))
]

voting_reg = VotingRegressor(estimators=estimators)
stacking_reg = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1.0))

cv = KFold(n_splits=5, shuffle=True, random_state=42)

voting_pipeline = Pipeline([
    ('scaler', RobustScaler()),  
    ('model', voting_reg)
])
voting_mae = cross_val_score(voting_pipeline, X, y, scoring=make_scorer(mean_absolute_error), cv=cv).mean()
print(f"Voting Regressor MAE: {voting_mae}")

stacking_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('model', stacking_reg)
])
stacking_mae = cross_val_score(stacking_pipeline, X, y, scoring=make_scorer(mean_absolute_error), cv=cv).mean()
print(f"Stacking Regressor MAE: {stacking_mae}")

if voting_mae < stacking_mae:
    print("Using Voting Regressor for final predictions.")
    final_model = voting_pipeline
else:
    print("Using Stacking Regressor for final predictions.")
    final_model = stacking_pipeline

final_model.fit(X, y)

y_test_pred = final_model.predict(X_test)

submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})
submission.to_csv('submission_advanced_model.csv', index=False)
print("Submission file saved as 'submission_advanced_model.csv'")


[I 2024-11-10 07:21:10,783] A new study created in memory with name: no-name-515f365d-3ba3-42a4-9394-4c3eace4be34
[I 2024-11-10 07:22:45,665] Trial 0 finished with value: 272.12263076657393 and parameters: {'enet_alpha': 0.0005734309182864712, 'enet_l1_ratio': 0.6963348233789081, 'svr_C': 5.592970312792988, 'svr_epsilon': 0.8503780591251312, 'knn_neighbors': 12, 'dt_max_depth': 20, 'dt_min_samples_split': 5, 'rf_n_estimators': 114, 'rf_max_depth': 22}. Best is trial 0 with value: 272.12263076657393.
[I 2024-11-10 07:24:35,281] Trial 1 finished with value: 273.50315481420085 and parameters: {'enet_alpha': 0.011275410001108778, 'enet_l1_ratio': 0.12667316129444242, 'svr_C': 4.165323933152529, 'svr_epsilon': 0.8502305837408295, 'knn_neighbors': 6, 'dt_max_depth': 24, 'dt_min_samples_split': 10, 'rf_n_estimators': 196, 'rf_max_depth': 45}. Best is trial 0 with value: 272.12263076657393.
[I 2024-11-10 07:25:36,524] Trial 2 finished with value: 298.9129045736579 and parameters: {'enet_alpha'

KeyboardInterrupt: 

In [11]:
import pandas as pd
import numpy as np
import logging
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import optuna

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

target_column = 'yield'
X = train_data.drop(columns=['id', target_column])
y = train_data[target_column]
X_test = test_data.drop(columns=['id'])


In [12]:

X['fruit_density'] = X['seeds'] / (X['fruitmass'] + 1e-3)
X_test['fruit_density'] = X_test['seeds'] / (X_test['fruitmass'] + 1e-3)
X['seeds_clonesize_interaction'] = X['seeds'] * X['clonesize']
X_test['seeds_clonesize_interaction'] = X_test['seeds'] * X_test['clonesize']

for col in ['fruitmass', 'seeds', 'RainingDays']:
    if col in X.columns:
        X[f'log_{col}'] = np.log1p(X[col])
        X_test[f'log_{col}'] = np.log1p(X_test[col])

poly_features = PolynomialFeatures(degree=2, include_bias=False)
important_features = ['fruit_density', 'seeds_clonesize_interaction']
X_poly = poly_features.fit_transform(X[important_features])
X_poly_df = pd.DataFrame(X_poly, columns=poly_features.get_feature_names_out(important_features))
X = pd.concat([X.reset_index(drop=True), X_poly_df], axis=1)

X_test_poly = poly_features.transform(X_test[important_features])
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_features.get_feature_names_out(important_features))
X_test = pd.concat([X_test.reset_index(drop=True), X_test_poly_df], axis=1)


In [13]:
def objective(trial):
    enet_alpha = trial.suggest_float('enet_alpha', 1e-4, 1.0, log=True)
    enet_l1_ratio = trial.suggest_float('enet_l1_ratio', 0.1, 0.9)
    svr_C = trial.suggest_float('svr_C', 0.1, 10.0, log=True)
    svr_epsilon = trial.suggest_float('svr_epsilon', 0.01, 0.5)
    knn_neighbors = trial.suggest_int('knn_neighbors', 5, 15)

    estimators = [
        ('ridge', Ridge(alpha=1.0)),
        ('enet', ElasticNet(alpha=enet_alpha, l1_ratio=enet_l1_ratio, max_iter=10000)),
        ('svr', SVR(C=svr_C, epsilon=svr_epsilon, kernel='rbf')),
        ('knn', KNeighborsRegressor(n_neighbors=knn_neighbors)),
        ('dt', DecisionTreeRegressor(max_depth=10, min_samples_split=5)),
        ('rf', RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42))
    ]
    voting_model = VotingRegressor(estimators=estimators)
    
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', voting_model)
    ])
    
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=make_scorer(mean_absolute_error), n_jobs=-1)
    return np.mean(mae_scores)


In [14]:

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5) 

best_trial = study.best_trial
print(f"Best trial params: {best_trial.params}")
print(f"Best MAE: {best_trial.value:.2f}")


[I 2024-11-10 08:59:11,244] A new study created in memory with name: no-name-24133d53-3bbc-4dd0-9660-7ecf61453f5a
[I 2024-11-10 08:59:42,206] Trial 0 finished with value: 289.18963370653864 and parameters: {'enet_alpha': 0.02679465194841782, 'enet_l1_ratio': 0.6001225703163178, 'svr_C': 1.7841398677694225, 'svr_epsilon': 0.21482629225173266, 'knn_neighbors': 9}. Best is trial 0 with value: 289.18963370653864.
[I 2024-11-10 09:00:30,233] Trial 1 finished with value: 280.853918352183 and parameters: {'enet_alpha': 0.045582814271696304, 'enet_l1_ratio': 0.17067576201154244, 'svr_C': 4.105385650975532, 'svr_epsilon': 0.2853857286501716, 'knn_neighbors': 13}. Best is trial 1 with value: 280.853918352183.
[I 2024-11-10 09:01:21,318] Trial 2 finished with value: 328.59116091371754 and parameters: {'enet_alpha': 0.36210774997842793, 'enet_l1_ratio': 0.7645201680553573, 'svr_C': 0.2468024284467862, 'svr_epsilon': 0.22042778498334678, 'knn_neighbors': 10}. Best is trial 1 with value: 280.8539183

Best trial params: {'enet_alpha': 0.000913848445074099, 'enet_l1_ratio': 0.3557798764634188, 'svr_C': 3.8747244471423765, 'svr_epsilon': 0.021340801494422634, 'knn_neighbors': 13}
Best MAE: 278.05


In [15]:

enet_alpha = best_trial.params['enet_alpha']
enet_l1_ratio = best_trial.params['enet_l1_ratio']
svr_C = best_trial.params['svr_C']
svr_epsilon = best_trial.params['svr_epsilon']
knn_neighbors = best_trial.params['knn_neighbors']

estimators = [
    ('ridge', Ridge(alpha=1.0)),
    ('enet', ElasticNet(alpha=enet_alpha, l1_ratio=enet_l1_ratio, max_iter=100)),
    ('svr', SVR(C=svr_C, epsilon=svr_epsilon, kernel='rbf')),
    ('knn', KNeighborsRegressor(n_neighbors=knn_neighbors)),
    ('dt', DecisionTreeRegressor(max_depth=10, min_samples_split=5)),
    ('rf', RandomForestRegressor(n_estimators=33, max_depth=4, random_state=42))
]

final_model = VotingRegressor(estimators=estimators)


In [16]:

final_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('model', final_model)
])


In [17]:

final_pipeline.fit(X, y)

y_test_pred = final_pipeline.predict(X_test)

submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})
submission.to_csv('submission_simplified_model.csv', index=False)
print("Submission file saved as 'submission_simplified_model.csv'")


  model = cd_fast.enet_coordinate_descent(


Submission file saved as 'submission_simplified_model.csv'


In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import optuna

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
target_column = 'yield'
X = train_data.drop(columns=['id', target_column])
y = train_data[target_column]
X_test = test_data.drop(columns=['id'])

# Simplified Feature Engineering: Key interactions and log transformations
X['fruit_density'] = X['seeds'] / (X['fruitmass'] + 1e-3)
X_test['fruit_density'] = X_test['seeds'] / (X_test['fruitmass'] + 1e-3)
X['seeds_clonesize_interaction'] = X['seeds'] * X['clonesize']
X_test['seeds_clonesize_interaction'] = X_test['seeds'] * X_test['clonesize']

# Log transformation for skewed features
for col in ['fruitmass', 'seeds', 'RainingDays']:
    if col in X.columns:
        X[f'log_{col}'] = np.log1p(X[col])
        X_test[f'log_{col}'] = np.log1p(X_test[col])

# Polynomial Features for selected key interactions
poly_features = PolynomialFeatures(degree=2, include_bias=False)
important_features = ['fruit_density', 'seeds_clonesize_interaction']
X_poly = poly_features.fit_transform(X[important_features])
X_poly_df = pd.DataFrame(X_poly, columns=poly_features.get_feature_names_out(important_features))
X = pd.concat([X.reset_index(drop=True), X_poly_df], axis=1)

X_test_poly = poly_features.transform(X_test[important_features])
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_features.get_feature_names_out(important_features))
X_test = pd.concat([X_test.reset_index(drop=True), X_test_poly_df], axis=1)

# Define Optuna objective function for tuning
def objective(trial):
    # Narrowed parameter search space
    ridge_alpha = trial.suggest_float('ridge_alpha', 0.1, 2.0)
    svr_C = trial.suggest_float('svr_C', 0.1, 5.0)
    rf_max_depth = trial.suggest_int('rf_max_depth', 5, 20)
    
    # Define models with tuned parameters
    estimators = [
        ('ridge', Ridge(alpha=ridge_alpha)),
        ('lasso', Lasso(alpha=0.1)),  # Fixed alpha for simplicity
        ('svr', SVR(C=svr_C, epsilon=0.1, kernel='rbf')),  # Fixed epsilon
        ('rf', RandomForestRegressor(n_estimators=100, max_depth=rf_max_depth, random_state=42))
    ]
    
    # Voting Regressor with reduced model set
    voting_model = VotingRegressor(estimators=estimators)
    
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', voting_model)
    ])
    
    # Cross-validation for MAE scoring
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=make_scorer(mean_absolute_error), n_jobs=-1)
    return np.mean(mae_scores)

# Optimize model using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)  # Fewer trials for quicker tuning

# Retrieve best parameters and model
best_trial = study.best_trial
print(f"Best trial params: {best_trial.params}")
print(f"Best MAE: {best_trial.value:.2f}")

# Extract the best hyperparameters
ridge_alpha = best_trial.params['ridge_alpha']
svr_C = best_trial.params['svr_C']
rf_max_depth = best_trial.params['rf_max_depth']

# Final estimators with best params
estimators = [
    ('ridge', Ridge(alpha=ridge_alpha)),
    ('lasso', Lasso(alpha=0.1)),
    ('svr', SVR(C=svr_C, epsilon=0.1, kernel='rbf')),
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=rf_max_depth, random_state=42))
]

# Final voting regressor model
final_model = VotingRegressor(estimators=estimators)

# Final pipeline
final_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('model', final_model)
])

# Fit the final model pipeline on the entire training data
final_pipeline.fit(X, y)

# Predict on the test set
y_test_pred = final_pipeline.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'yield': y_test_pred
})
submission.to_csv('submission_optimized_voting21.csv', index=False)
print("Submission file saved as 'submission_optimized_voting21.csv'")


[I 2024-11-10 09:10:39,569] A new study created in memory with name: no-name-387332d7-9baf-4aea-96c7-473caa37eb8e
