In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,  GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [21]:
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")


In [22]:
feature_to_use = trainData.columns[2:18]
# feature_to_use = trainData.columns[2:10]
X = trainData[feature_to_use]
xX = testData[feature_to_use]
y = trainData['yield']

In [None]:
# Step 1: Select best features using Random Forest Feature Importance (this part remains the same)
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


def select_best_features_using_importance(X, y, n_estimators=200, max_depth=6, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(6)  # Selecting top 5 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='xgboost'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradient_boosting':
        model = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=testData, idTest=testData['id'])

In [24]:
# Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
#        'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
#        'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
#        'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds'],
#       dtype='object')

In [25]:
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df ['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df

In [26]:
# Step 1: Select best features using Random Forest Feature Importance (this part remains the same)
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=5, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 5 features
    return selected_features

In [27]:
# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model

In [28]:
# # Step 2: Train a model using selected features with Cross-Validation
# def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
#     X_selected = X[selected_features]
    
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X_selected)

#     # Select model type
#     if model_type == 'gradiant_boosting':
#         model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
#     elif model_type == 'xgboost':
#         model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
#     else:
#         raise ValueError("Unsupported model type")

#     # Perform 5-Fold Cross-Validation
#     cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

#     # Since the scoring returns negative MAE, we'll negate it to show positive values
#     print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
#     # Fit the model on the entire dataset
#     model.fit(X_scaled, y)
    
#     # Predict on testData (optional)
#     if testData is not None:
#         X_test_scaled = scaler.transform(testData[selected_features])
#         testPredictions = model.predict(X_test_scaled)
        
#         output = pd.DataFrame({
#             'id': idTest,
#             'yield': testPredictions
#         })
        
#         output_file = 'submission.csv'
#         output.to_csv(output_file, index=False)
#         print(f"Predictions saved to {output_file}")
#     return model

In [None]:
# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import numpy as np
import warnings
warnings.filterwarnings("ignore")
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]
feature_to_use
# Feature Engineering on Training Data
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df ['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df
X = feature_engineering(X)
xX = feature_engineering(xX)
# Step 1: Select best features using Random Forest Feature Importance (this part remains the same)
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=5, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 5 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model
# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Removal using IQR Method
def remove_outliers(df):
    for column in df.columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[~((df[column] < lower_bound) | (df[column] > upper_bound))]
    return df

# Remove outliers from the training data
X_no_outliers = remove_outliers(X)
y_no_outliers = y[X.index[X_no_outliers.index]]  # Align target variable with the cleaned features

# Feature Engineering on Training Data
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df

# Apply feature engineering to the datasets
X_no_outliers = feature_engineering(X_no_outliers)
xX = feature_engineering(xX)

# Step 1: Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=5, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 25 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_no_outliers, y_no_outliers)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_no_outliers, y_no_outliers, selected_features, testData=xX, idTest=testData['id'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Feature Engineering on Training Data
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Outlier Detection and Removal using IQR Method
def remove_outliers(df):
    for column in df.columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[~((df[column] < lower_bound) | (df[column] > upper_bound))]
    return df

# Remove outliers from the training data
X_no_outliers = remove_outliers(X)
y_no_outliers = y[X.index[X_no_outliers.index]]  # Align target variable with the cleaned features

# Step 1: Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=5, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 25 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_no_outliers, y_no_outliers)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_no_outliers, y_no_outliers, selected_features, testData=xX, idTest=testData['id'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Removal using IQR Method
def remove_outliers(df):
    for column in df.columns:
        Q1 = df[column].quantile(0.1)
        Q3 = df[column].quantile(0.9)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[~((df[column] < lower_bound) | (df[column] > upper_bound))]
    return df

# Remove outliers from the training data
X_no_outliers = remove_outliers(X)
y_no_outliers = y[X.index[X_no_outliers.index]]  # Align target variable with the cleaned features

# Feature Engineering on Training Data
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df

# Apply feature engineering to the datasets
X_no_outliers = feature_engineering(X_no_outliers)
xX = feature_engineering(xX)

# Step 1: Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=6, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 25 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='xgboost'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=21)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=21)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=50, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_no_outliers, y_no_outliers)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_no_outliers, y_no_outliers, selected_features, testData=xX, idTest=testData['id'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import numpy as np
import warnings
warnings.filterwarnings("ignore")
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]
feature_to_use
# Feature Engineering on Training Data
def feature_engineering(df):
    # Create new features
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)  # Avoid division by zero
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)  # Avoid division by zero
    df['clonesize_squared'] = df['clonesize'] ** 2
    df['honeybee_andrena_interaction'] = df['honeybee'] * df['andrena']
    
    # Adding more features
    df['total_pollinators'] = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']
    df['average_rainfall'] = df[['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange']].mean(axis=1)
    df['fruitset_squared'] = df['fruitset'] ** 2
    df['seeds_log'] = np.log1p(df['seeds'])  # Log transformation
    df['fruitmass_log'] = np.log1p(df['fruitmass'])  # Log transformation
    df['honeybee_squared'] = df['honeybee'] ** 2
    df['bumbles_squared'] = df['bumbles'] ** 2
    df['andrena_squared'] = df['andrena'] ** 2
    df['osmia_squared'] = df['osmia'] ** 2
    df['fruitset_to_clonesize'] = df['fruitset'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['fruitmass_to_clonesize'] = df['fruitmass'] / (df['clonesize'] + 1)  # Avoid division by zero
    df['rain_days_squared'] = df['RainingDays'] ** 2
    df['average_upper_range'] = (df['MaxOfUpperTRange'] + df['MinOfUpperTRange']) / 2
    df['upper_range_diff'] = df['MaxOfUpperTRange'] - df['MinOfUpperTRange']
    df['lower_range_diff'] = df['MaxOfLowerTRange'] - df['MinOfLowerTRange']
    df['average_lower_range'] = (df['MaxOfLowerTRange'] + df['MinOfLowerTRange']) / 2
    df['fruitset_to_rain_days'] = df['fruitset'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['fruitmass_to_rain_days'] = df['fruitmass'] / (df['RainingDays'] + 1)  # Avoid division by zero
    df['honeybee_and_bumbles'] = df['honeybee'] + df['bumbles']
    df['honeybee_and_andrena'] = df['honeybee'] + df['andrena']
    df['bumbles_and_andrena'] = df['bumbles'] + df['andrena']
    df['osmia_and_bumbles'] = df['osmia'] + df['bumbles']
    df['osmia_and_andrena'] = df['osmia'] + df['andrena']
    df['osmia_and_honeybee'] = df['osmia'] + df['honeybee']
    
    # Adding polynomial features
    df['honeybee_bumbles_product'] = df['honeybee'] * df['bumbles']
    df['honeybee_andrena_product'] = df['honeybee'] * df['andrena']
    df['bumbles_andrena_product'] = df['bumbles'] * df['andrena']
    df['osmia_honeybee_product'] = df ['osmia'] * df['honeybee']
    df['osmia_bumbles_product'] = df['osmia'] * df['bumbles']
    df['osmia_andrena_product'] = df['osmia'] * df['andrena']
    
    return df
X = feature_engineering(X)
xX = feature_engineering(xX)
# Step 1: Select best features using Random Forest Feature Importance (this part remains the same)
def select_best_features_using_importance(X, y, n_estimators=500, max_depth=5, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(25)  # Selecting top 5 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradiant_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradiant_boosting':
        model = GradientBoostingRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=120, learning_rate=0.06, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
    return model
# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBRegressor
# import warnings

# warnings.filterwarnings("ignore")

# # Load the datasets
# trainData = pd.read_csv("train.csv")
# testData = pd.read_csv("test.csv")

# # Define the features and target variable
# feature_to_use = trainData.columns[2:18]
# X = trainData[feature_to_use]
# y = trainData['yield']
# xX = testData[feature_to_use]

# # Outlier Detection and Removal using IQR Method
# def remove_outliers(df):
#     for column in df.columns:
#         Q1 = df[column].quantile(0.25)  # 25th percentile
#         Q3 = df[column].quantile(0.75)  # 75th percentile
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR
#         df = df[~((df[column] < lower_bound) | (df[column] > upper_bound))]
#     return df

# # Remove outliers from the training data
# X_no_outliers = remove_outliers(X)
# y_no_outliers = y[X.index[X_no_outliers.index]]  # Align target variable with the cleaned features

# # Feature Engineering
# def feature_engineering(df):
#     # Create new features (as per your previous definition)
#     # Ensure to add only useful features
#     return df  # Ensure to return the modified DataFrame

# # Apply feature engineering to the datasets
# X_no_outliers = feature_engineering(X_no_outliers)
# xX = feature_engineering(xX)

# # Select best features using Random Forest Feature Importance
# def select_best_features_using_importance(X, y):
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X)
    
#     rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
#     rf_model.fit(X_scaled, y)
    
#     feature_importances = rf_model.feature_importances_
#     feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
#     selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
#     return selected_features

# # Train a model using selected features with Cross-Validation
# def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
#     X_selected = X[selected_features]
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X_selected)

#     # Hyperparameter tuning with Grid Search
#     param_grid = {
#         'n_estimators': [100, 120],
#         'learning_rate': [0.01, 0.05],
#         'max_depth': [3, 4],
#     }
#     grid_search = GridSearchCV(XGBRegressor(random_state=21), param_grid, cv=5, scoring='neg_mean_absolute_error')
#     grid_search.fit(X_scaled, y)

#     best_model = grid_search.best_estimator_
#     print("Best parameters found: ", grid_search.best_params_)

#     # Evaluate performance
#     cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error')
#     print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

#     # Fit the model on the entire dataset
#     best_model.fit(X_scaled, y)

#     # Predict on testData (optional)
#     if testData is not None:
#         X_test_scaled = scaler.transform(testData[selected_features])
#         testPredictions = best_model.predict(X_test_scaled)

#         output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
#         output_file = 'submission1.csv'
#         output.to_csv(output_file, index=False)
#         print(f"Predictions saved to {output_file}")

#     return best_model

# # Step 1: Select best features using feature importance
# selected_features = select_best_features_using_importance(X_no_outliers, y_no_outliers)

# # Step 2: Train a model with cross-validation using those selected features
# train_model_with_selected_features_cv(X_no_outliers, y_no_outliers, selected_features, testData=xX, idTest=testData['id'])




import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    # Calculate the IQR
    Q1 = df[target].quantile(0.3)
    Q3 = df[target].quantile(0.7)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identify outliers
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]

    # Fit a model on non-outlier data
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    X_non_outliers = non_outliers.drop(columns=[target])
    y_non_outliers = non_outliers[target]
    
    # Train a regression model on the non-outlier data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_non_outliers, y_non_outliers)
    
    # Predict values for outliers
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)

    # Adjust outlier values to the predicted values
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    # Create new features (as per your previous definition)
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    # Add other features as needed...
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Grid Search
    param_grid = {
        'n_estimators': [100, 120],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4],
    }
    grid_search = GridSearchCV(XGBRegressor(random_state=21), param_grid, cv=5, scoring='neg_mean_absolute_error')
    grid_search.fit(X_scaled, y)

    best_model = grid_search.best_estimator_
    print("Best parameters found: ", grid_search.best_params_)

    # Evaluate performance
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error')
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    # Calculate the IQR
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identify outliers
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]

    # Fit a model on non-outlier data
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    X_non_outliers = non_outliers.drop(columns=[target])
    y_non_outliers = non_outliers[target]
    
    # Train a regression model on the non-outlier data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_non_outliers, y_non_outliers)
    
    # Predict values for outliers
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)

    # Adjust outlier values to the predicted values
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Grid Search for XGBoost
    param_grid = {
        'n_estimators': [100, 120],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1]
    }
    
    grid_search = GridSearchCV(XGBRegressor(random_state=21), param_grid, cv=5, scoring='neg_mean_absolute_error')
    grid_search.fit(X_scaled, y)

    best_model = grid_search.best_estimator_
    print("Best parameters found: ", grid_search.best_params_)

    # Evaluate performance with Cross-Validation
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error')
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(non_outliers.drop(columns=[target]), non_outliers[target])
    
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)
    
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    return df

X = feature_engineering(X)
xX = feature_engineering(xX)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42, n_jobs=-1)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(15)  # Reduced number of features to select
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Grid Search for XGBoost
    param_grid = {
        'n_estimators': [100, 120],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1]
    }
    
    # Use RandomizedSearchCV for faster tuning
    grid_search = GridSearchCV(XGBRegressor(random_state=21, n_jobs=-1), param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_scaled, y)

    best_model = grid_search.best_estimator_
    print("Best parameters found: ", grid_search.best_params_)

    # Evaluate performance with Cross-Validation
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=xX, idTest=testData['id'])


In [1]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBRegressor
# import warnings

# warnings.filterwarnings("ignore")

# # Load the datasets
# trainData = pd.read_csv("train.csv")
# testData = pd.read_csv("test.csv")

# # Define the features and target variable
# feature_to_use = trainData.columns[2:18]
# X = trainData[feature_to_use]
# y = trainData['yield']
# xX = testData[feature_to_use]

# # Outlier Detection and Adjustment using IQR Method
# def adjust_outliers(df, target):
#     Q1 = df[target].quantile(0.25)
#     Q3 = df[target].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]
#     non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    
#     # Fit a model on non-outlier data
#     model = RandomForestRegressor(n_estimators=100, random_state=42)
#     model.fit(non_outliers.drop(columns=[target]), non_outliers[target])
    
#     # Predict values for outliers
#     outlier_features = outliers.drop(columns=[target])
#     predicted_values = model.predict(outlier_features)

#     # Adjust outlier values to the predicted values
#     df.loc[outliers.index, target] = predicted_values
#     return df

# # Adjust outliers in the training data
# trainData = adjust_outliers(trainData, 'yield')
# X = trainData[feature_to_use]
# y = trainData['yield']

# # Feature Engineering
# def feature_engineering(df):
#     df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
#     df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
#     df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
#     return df

# # Apply feature engineering to the datasets
# X = feature_engineering(X)
# xX = feature_engineering(xX)

# # Add polynomial features
# def add_polynomial_features(X, degree=2):
#     poly = PolynomialFeatures(degree=degree, include_bias=False)
#     return pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))

# # Step 1: Add polynomial features
# X_poly = add_polynomial_features(X)

# # Select best features using Random Forest Feature Importance
# def select_best_features_using_importance(X, y):
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X)
    
#     rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
#     rf_model.fit(X_scaled, y)
    
#     feature_importances = rf_model.feature_importances_
#     feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
#     selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
#     return selected_features

# # Train a model using selected features with Cross-Validation
# def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
#     X_selected = X[selected_features]
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X_selected)

#     # Hyperparameter tuning with Grid Search for XGBoost
#     param_grid = {
#         'n_estimators': [100, 120],
#         'learning_rate': [0.01, 0.05],
#         'max_depth': [3, 4, 5],
#         'min_child_weight': [1, 2, 3],
#         'gamma': [0, 0.1, 0.2],
#         'subsample': [0.7, 0.8, 0.9],
#         # 'colsample_bytree': [0.7, 0.8, 0.9]
#     }
    
#     grid_search = GridSearchCV(XGBRegressor(random_state=21, n_jobs=-1), param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
#     grid_search.fit(X_scaled, y)

#     best_model = grid_search.best_estimator_
#     print("Best parameters found: ", grid_search.best_params_)

#     # Evaluate performance with Cross-Validation
#     cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
#     print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

#     # Fit the model on the entire dataset
#     best_model.fit(X_scaled, y)

#     # Predict on testData (optional)
#     if testData is not None:
#         X_test_scaled = scaler.transform(testData[selected_features])
#         testPredictions = best_model.predict(X_test_scaled)

#         output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
#         output_file = 'submission1.csv'
#         output.to_csv(output_file, index=False)
#         print(f"Predictions saved to {output_file}")

#     return best_model

# # Step 1: Select best features using feature importance
# selected_features = select_best_features_using_importance(X_poly, y)

# # Step 2: Train a model with cross-validation using those selected features
# train_model_with_selected_features_cv(X_poly, y, selected_features, testData=xX, idTest=testData['id'])


Best parameters found:  {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 120, 'subsample': 0.9}

Cross-validated MAE (10-Fold): 244.4416


KeyError: "['fruitset^2', 'fruitset seeds', 'seeds^2', 'andrena fruitset_per_raining_days', 'fruitset fruitmass', 'fruitmass seeds', 'seeds fruitmass_per_seed', 'clonesize fruitset', 'AverageOfLowerTRange seeds', 'RainingDays fruitset_per_raining_days', 'MaxOfLowerTRange seeds', 'clonesize fruitset_per_raining_days', 'MaxOfUpperTRange seeds', 'andrena fruitset', 'osmia seeds', 'AverageOfUpperTRange seeds', 'fruitset fruitmass_per_seed', 'MinOfLowerTRange seeds', 'bumbles fruitset', 'fruitmass_per_seed^2', 'AverageOfUpperTRange fruitmass_per_seed', 'bumbles seeds', 'MaxOfUpperTRange fruitset'] not in index"

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    
    # Fit a model on non-outlier data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(non_outliers.drop(columns=[target]), non_outliers[target])
    
    # Predict values for outliers
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)

    # Adjust outlier values to the predicted values
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Add polynomial features
def add_polynomial_features(X, degree=2):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    return pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))

# Apply polynomial features to both train and test sets
X_poly = add_polynomial_features(X)
xX_poly = add_polynomial_features(xX)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Grid Search for XGBoost
    param_grid = {
        'n_estimators': [100, 120],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 3],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        # 'colsample_bytree': [0.7, 0.8, 0.9]
    }
    
    grid_search = GridSearchCV(XGBRegressor(random_state=42, n_jobs=-1), param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_scaled, y)

    best_model = grid_search.best_estimator_
    print("Best parameters found: ", grid_search.best_params_)

    # Evaluate performance with Cross-Validation
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_poly, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_poly, y, selected_features, testData=xX_poly, idTest=testData['id'])


KeyboardInterrupt: 

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    
    # Fit a model on non-outlier data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(non_outliers.drop(columns=[target]), non_outliers[target])
    
    # Predict values for outliers
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)

    # Adjust outlier values to the predicted values
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Add polynomial features
def add_polynomial_features(X, degree=2):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    return pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))

# Apply polynomial features to both train and test sets
X_poly = add_polynomial_features(X, degree=2)
xX_poly = add_polynomial_features(xX, degree=2)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(25)  # Adjust number of features to select
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Randomized Search for XGBoost
    param_grid = {
        'n_estimators': [100, 120, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 3],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9]
    }
    
    random_search = RandomizedSearchCV(
        XGBRegressor(random_state=21, n_jobs=-1),
        param_distributions=param_grid,
        cv=5,
        n_iter=10,  # Limit the number of iterations for faster tuning
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )
    random_search.fit(X_scaled, y)

    best_model = random_search.best_estimator_
    print("Best parameters found: ", random_search.best_params_)

    # Evaluate performance with Cross-Validation
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_poly, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_poly, y, selected_features, testData=xX_poly, idTest=testData['id'])


Best parameters found:  {'subsample': 0.9, 'n_estimators': 120, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1}

Cross-validated MAE (10-Fold): 244.7627
Predictions saved to submission1.csv


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings("ignore")

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Define the features and target variable
feature_to_use = trainData.columns[2:18]
X = trainData[feature_to_use]
y = trainData['yield']
xX = testData[feature_to_use]

# Outlier Detection and Adjustment using IQR Method
def adjust_outliers(df, target):
    Q1 = df[target].quantile(0.25)
    Q3 = df[target].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[target] < lower_bound) | (df[target] > upper_bound)]
    non_outliers = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]
    
    # Fit a model on non-outlier data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(non_outliers.drop(columns=[target]), non_outliers[target])
    
    # Predict values for outliers
    outlier_features = outliers.drop(columns=[target])
    predicted_values = model.predict(outlier_features)

    # Adjust outlier values to the predicted values
    df.loc[outliers.index, target] = predicted_values
    return df

# Adjust outliers in the training data
trainData = adjust_outliers(trainData, 'yield')
X = trainData[feature_to_use]
y = trainData['yield']

# Feature Engineering
def feature_engineering(df):
    df['honeybee_bumbles_ratio'] = df['honeybee'] / (df['bumbles'] + 1)
    df['fruitset_per_raining_days'] = df['fruitset'] / (df['RainingDays'] + 1)
    df['fruitmass_per_seed'] = df['fruitmass'] / (df['seeds'] + 1)
    return df

# Apply feature engineering to the datasets
X = feature_engineering(X)
xX = feature_engineering(xX)

# Add polynomial features
def add_polynomial_features(X, degree=2):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    return pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))

# Apply polynomial features to both train and test sets
X_poly = add_polynomial_features(X, degree=2)
xX_poly = add_polynomial_features(xX, degree=2)

# Select best features using Random Forest Feature Importance
def select_best_features_using_importance(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    rf_model = RandomForestRegressor(n_estimators=350, max_depth=6, random_state=42)
    rf_model.fit(X_scaled, y)
    
    feature_importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(15)  # Reduce number of selected features to avoid overfitting
    return selected_features

# Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None):
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Hyperparameter tuning with Randomized Search for XGBoost
    param_grid = {
        'n_estimators': [50, 100, 120],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 3],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'reg_alpha': [0.1, 0.5, 1],  # Add L1 regularization
        'reg_lambda': [1, 1.5, 2]   # Add L2 regularization
    }
    
    random_search = RandomizedSearchCV(
        XGBRegressor(random_state=21, n_jobs=-1),
        param_distributions=param_grid,
        cv=5,
        n_iter=10,  # Limit the number of iterations for faster tuning
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )
    random_search.fit(X_scaled, y)

    best_model = random_search.best_estimator_
    print("Best parameters found: ", random_search.best_params_)

    # Evaluate performance with Cross-Validation
    cv_scores = cross_val_score(best_model, X_scaled, y, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
    print(f"\nCross-validated MAE (10-Fold): {np.mean(-cv_scores):.4f}")

    # Fit the model on the entire dataset
    best_model.fit(X_scaled, y)

    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = best_model.predict(X_test_scaled)

        output = pd.DataFrame({'id': idTest, 'yield': testPredictions})
        output_file = 'submission1.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

    return best_model

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X_poly, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X_poly, y, selected_features, testData=xX_poly, idTest=testData['id'])


Best parameters found:  {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 120, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1}

Cross-validated MAE (10-Fold): 244.8752
Predictions saved to submission1.csv


In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
import warnings

# Load the datasets
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

features_to_use = trainData.columns.tolist()
features_to_use.remove('id')
features_to_use.remove('yield')
features_to_use.remove('Row#')

# Step 1: Select best features using Random Forest Feature Importance (this part remains the same)
def select_best_features_using_importance(X, y, n_estimators=200, max_depth=6, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit RandomForest to find feature importance
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)
    
    # Get feature importance
    feature_importances = rf_model.feature_importances_
    
    # Sort features by importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("Features sorted by importance:")
    print(feature_importance_df)
    
    # Select top features (you can decide how many to select)
    selected_features = feature_importance_df['Feature'].head(6)  # Selecting top 5 features
    return selected_features

# Step 2: Train a model using selected features with Cross-Validation
def train_model_with_selected_features_cv(X, y, selected_features, testData=None, idTest=None, model_type='gradient_boosting'):
    X_selected = X[selected_features]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Select model type
    if model_type == 'gradient_boosting':
        model = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42)
    else:
        raise ValueError("Unsupported model type")

    # Perform 5-Fold Cross-Validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=20, scoring='neg_mean_absolute_error')

    # Since the scoring returns negative MAE, we'll negate it to show positive values
    print(f"\nCross-validated MAE (5-Fold): {np.mean(-cv_scores):.4f}")
    
    # Fit the model on the entire dataset
    model.fit(X_scaled, y)
    
    # Predict on testData (optional)
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'submission.csv'
        output.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")

# Example usage
X = trainData[features_to_use]  # All features initially
y = trainData['yield']

# Step 1: Select best features using feature importance
selected_features = select_best_features_using_importance(X, y)

# Step 2: Train a model with cross-validation using those selected features
train_model_with_selected_features_cv(X, y, selected_features, testData=testData, idTest=testData['id'])

Features sorted by importance:
                 Feature  Importance
13              fruitset    0.891302
15                 seeds    0.101976
14             fruitmass    0.002991
5       MaxOfUpperTRange    0.000654
8       MaxOfLowerTRange    0.000590
9       MinOfLowerTRange    0.000560
7   AverageOfUpperTRange    0.000470
10  AverageOfLowerTRange    0.000435
6       MinOfUpperTRange    0.000324
3                andrena    0.000157
4                  osmia    0.000140
12    AverageRainingDays    0.000136
11           RainingDays    0.000094
1               honeybee    0.000080
2                bumbles    0.000060
0              clonesize    0.000031

Cross-validated MAE (5-Fold): 248.0716
Predictions saved to submission.csv


In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def select_best_features_using_importance(X, y, n_estimators=200, max_depth=5, random_state=42, n_splits=5):
    """
    Select best features using Random Forest importance with cross-validation
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize K-Fold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Store feature importances across folds
    feature_importances_all = np.zeros(X.shape[1])
    
    # Perform cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled), 1):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Fit RandomForest
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators, 
            max_depth=max_depth, 
            random_state=random_state
        )
        rf_model.fit(X_train, y_train)
        
        # Accumulate feature importances
        feature_importances_all += rf_model.feature_importances_
        
        # Calculate and print fold performance
        val_pred = rf_model.predict(X_val)
        fold_mse = mean_squared_error(y_val, val_pred)
        fold_r2 = r2_score(y_val, val_pred)
        print(f"\nFold {fold} Results:")
        print(f'MSE: {fold_mse:.4f}')
        print(f'R²: {fold_r2:.4f}')
    
    # Average feature importances across folds
    feature_importances = feature_importances_all / n_splits
    
    # Create and sort feature importance DataFrame
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nAverage Feature Importances Across Folds:")
    print(feature_importance_df)
    
    # Select top features
    selected_features = feature_importance_df['Feature'].head(5)
    return selected_features

def train_model_with_selected_features(X, y, selected_features, testData=None, idTest=None, 
                                     model_type='gradient_boosting', n_splits=5):
    """
    Train model using selected features with cross-validation
    """
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Initialize model
    if model_type == 'gradient_boosting':
        model = GradientBoostingRegressor(
            n_estimators=80,
            learning_rate=0.1,
            max_depth=4,
            random_state=42
        )
    else:
        raise ValueError("Unsupported model type")
    
    # Perform cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = {
        'mse': [],
        'r2': [],
        'mae': []
    }
    
    print("\nCross-validation Results:")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled), 1):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_val)
        
        # Calculate metrics
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        
        cv_scores['mse'].append(mse)
        cv_scores['r2'].append(r2)
        cv_scores['mae'].append(mae)
        
        print(f"\nFold {fold}:")
        print(f'MSE: {mse:.4f}')
        print(f'R²: {r2:.4f}')
        print(f'MAE: {mae:.4f}')
    
    # Print average scores
    print("\nAverage Cross-validation Scores:")
    print(f'MSE: {np.mean(cv_scores["mse"]):.4f} ± {np.std(cv_scores["mse"]):.4f}')
    print(f'R²: {np.mean(cv_scores["r2"]):.4f} ± {np.std(cv_scores["r2"]):.4f}')
    print(f'MAE: {np.mean(cv_scores["mae"]):.4f} ± {np.std(cv_scores["mae"]):.4f}')
    
    # Train final model on full dataset for predictions
    if testData is not None:
        model.fit(X_scaled, y)  # Train on full dataset
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = model.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'test_predictions_cv.csv'
        output.to_csv(output_file, index=False)
        print(f"\nPredictions saved to {output_file}")

# Example usage
X = trainData[features_to_use]
y = trainData['yield']

# Step 1: Select best features using cross-validated feature importance
selected_features = select_best_features_using_importance(X, y, n_splits=5)

# Step 2: Train and evaluate model with cross-validation
train_model_with_selected_features(X, y, selected_features, testData=testData, 
                                 idTest=testData['id'], n_splits=5)


Fold 1 Results:
MSE: 170400.3378
R²: 0.9060

Fold 2 Results:
MSE: 139390.1347
R²: 0.9227

Fold 3 Results:
MSE: 168407.0789
R²: 0.9080

Fold 4 Results:
MSE: 140016.0484
R²: 0.9228

Fold 5 Results:
MSE: 136511.3709
R²: 0.9247

Average Feature Importances Across Folds:
                 Feature  Importance
13              fruitset    0.890603
15                 seeds    0.106095
14             fruitmass    0.001313
10  AverageOfLowerTRange    0.000358
7   AverageOfUpperTRange    0.000334
9       MinOfLowerTRange    0.000329
5       MaxOfUpperTRange    0.000318
8       MaxOfLowerTRange    0.000316
6       MinOfUpperTRange    0.000220
4                  osmia    0.000028
11           RainingDays    0.000024
3                andrena    0.000023
12    AverageRainingDays    0.000022
2                bumbles    0.000007
0              clonesize    0.000006
1               honeybee    0.000004

Cross-validation Results:

Fold 1:
MSE: 166104.3341
R²: 0.9083
MAE: 251.7258

Fold 2:
MSE: 133873.7533

In [10]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def select_best_features_using_importance(X, y, cv=5):
    """
    Select best features using Random Forest with GridSearchCV for hyperparameter optimization
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Define parameter grid for RandomForest
    param_grid_rf = {
        'n_estimators': [100, 150, 200],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    
    # Initialize base model
    rf_base = RandomForestRegressor(random_state=42)
    
    # Perform GridSearchCV
    grid_search_rf = GridSearchCV(
        estimator=rf_base,
        param_grid=param_grid_rf,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    
    print("Performing GridSearchCV for feature selection...")
    grid_search_rf.fit(X_scaled, y)
    
    print("\nBest RandomForest Parameters:")
    print(grid_search_rf.best_params_)
    print(f"\nBest CV Score: {-grid_search_rf.best_score_:.4f} MSE")
    
    # Get feature importances from best model
    best_rf = grid_search_rf.best_estimator_
    feature_importances = best_rf.feature_importances_
    
    # Create and sort feature importance DataFrame
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importances from Best Model:")
    print(feature_importance_df)
    
    # Select top features
    selected_features = feature_importance_df['Feature'].head(5)
    return selected_features, grid_search_rf.best_params_

def train_model_with_selected_features(X, y, selected_features, testData=None, idTest=None, 
                                     model_type='gradient_boosting', cv=5):
    """
    Train model using selected features with GridSearchCV for hyperparameter optimization
    """
    X_selected = X[selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    if model_type == 'gradient_boosting':
        # Define parameter grid for GradientBoosting
        param_grid_gb = {
            'n_estimators': [120, 150, 100],
            'learning_rate': [0.05, 0.06, 0.09, 0.1, 0.15],
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 4],
            'min_samples_leaf': [1, 2],
            'subsample': [0.8, 0.9, 1.0]
        }
        
        # Initialize base model
        base_model = GradientBoostingRegressor(random_state=42)
        
    else:
        raise ValueError("Unsupported model type")
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid_gb,
        cv=cv,
        scoring=['neg_mean_squared_error', 'r2'],
        refit='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )
    
    print("\nPerforming GridSearchCV for model training...")
    grid_search.fit(X_scaled, y)
    
    # Print best parameters and scores
    print("\nBest Parameters:")
    print(grid_search.best_params_)
    
    # Get best scores
    best_mse = -grid_search.best_score_  # Convert negative MSE back to positive
    cv_results = grid_search.cv_results_
    
    # Calculate and print average scores for best parameters
    print("\nCross-validation Scores for Best Parameters:")
    print(f'MSE: {best_mse:.4f}')
    print(f'R²: {np.mean(cv_results["mean_test_r2"]):.4f}')
    
    # Print detailed CV results for best parameters
    test_mse_scores = -cv_results['split_test_neg_mean_squared_error'][grid_search.best_index_]
    test_r2_scores = cv_results['split_test_r2'][grid_search.best_index_]
    
    print("\nDetailed Cross-validation Scores:")
    print(f'MSE: {np.mean(test_mse_scores):.4f} ± {np.std(test_mse_scores):.4f}')
    print(f'R²: {np.mean(test_r2_scores):.4f} ± {np.std(test_r2_scores):.4f}')
    
    # Generate predictions if test data is provided
    if testData is not None:
        X_test_scaled = scaler.transform(testData[selected_features])
        testPredictions = grid_search.predict(X_test_scaled)
        
        output = pd.DataFrame({
            'id': idTest,
            'yield': testPredictions
        })
        
        output_file = 'test_predictions_gridsearch.csv'
        output.to_csv(output_file, index=False)
        print(f"\nPredictions saved to {output_file}")
        
        return grid_search.best_estimator_, output
    
    return grid_search.best_estimator_

# Example usage
X = trainData[features_to_use]
y = trainData['yield']

# Step 1: Select best features using GridSearchCV
print("Step 1: Feature Selection")
selected_features, best_rf_params = select_best_features_using_importance(X, y, cv=5)

# Step 2: Train model with GridSearchCV
print("\nStep 2: Model Training")
best_model, predictions = train_model_with_selected_features(
    X, y, selected_features, 
    testData=testData, 
    idTest=testData['id'], 
    cv=5
)

# Print final selected features
print("\nFinal Selected Features:")
print(selected_features.tolist())

Step 1: Feature Selection
Performing GridSearchCV for feature selection...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best RandomForest Parameters:
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Best CV Score: 147486.3065 MSE

Feature Importances from Best Model:
                 Feature  Importance
13              fruitset    0.883141
15                 seeds    0.107747
14             fruitmass    0.004260
9       MinOfLowerTRange    0.000748
8       MaxOfLowerTRange    0.000572
7   AverageOfUpperTRange    0.000560
4                  osmia    0.000534
5       MaxOfUpperTRange    0.000521
10  AverageOfLowerTRange    0.000465
6       MinOfUpperTRange    0.000395
3                andrena    0.000354
12    AverageRainingDays    0.000209
11           RainingDays    0.000198
2                bumbles    0.000105
1               honeybee    0.000100
0              clonesize    0.000092

Step 2: Model Training

Performing GridSearchCV

KeyError: 'split_test_neg_mean_squared_error'