In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score

In [2]:
# Load the dataset
data1 = pd.read_csv('CLIMATE DATA.csv')
data1.head()

Unnamed: 0,Year,Yield,Min Temp,Max Temp,Rainfall,Sunshine
0,1992,1.47,21.92,31.04,183.08,6.82
1,1993,1.8,21.87,31.26,145.7,7.58
2,1994,1.91,22.12,31.06,174.3,6.58
3,1995,1.92,22.03,31.61,113.38,7.18
4,1996,1.92,22.15,31.26,203.42,7.04


In [3]:
data1.isna().sum()

Year        0
Yield       0
Min Temp    0
Max Temp    0
Rainfall    1
Sunshine    1
dtype: int64

In [4]:
data = data1.dropna()
data.isna().sum()

Year        0
Yield       0
Min Temp    0
Max Temp    0
Rainfall    0
Sunshine    0
dtype: int64

In [5]:
X=data.drop(['Year','Yield'], axis='columns')
y=data.Yield

In [6]:
scaler = MinMaxScaler()
scaler.fit(data)
data = scaler.transform(data)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
# Define the base models
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=0)
rf = RandomForestRegressor(random_state=0)
svm = SVR()
nn = MLPRegressor(random_state=0)

In [9]:
# Define the parameter grids for hyperparameter tuning
lr_params = {}
dt_params = {'max_depth': [None, 5, 10, 15],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4]}
rf_params = {'n_estimators': [100, 200, 500],
             'max_depth': [None, 5, 10, 15],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4]}
svm_params = {'C': [0.1, 1, 10],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree': [2, 3, 4]}
nn_params = {'hidden_layer_sizes': [(100,), (50,50), (100,50,25)],
             'activation': ['relu', 'logistic', 'tanh'],
             'alpha': [0.0001, 0.001, 0.01]}

In [10]:
# Define the k-fold cross-validation object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
# Perform hyperparameter tuning for each base model using GridSearchCV and 5-fold cross-validation
lr_gs = GridSearchCV(lr, lr_params, cv=kf, scoring='neg_mean_squared_error')
dt_gs = GridSearchCV(dt, dt_params, cv=kf, scoring='neg_mean_squared_error')
rf_gs = GridSearchCV(rf, rf_params, cv=kf, scoring='neg_mean_squared_error')
svm_gs = GridSearchCV(svm, svm_params, cv=kf, scoring='neg_mean_squared_error')
nn_gs = GridSearchCV(nn, nn_params, cv=kf, scoring='neg_mean_squared_error')

In [12]:
# Fit the base models with hyperparameter tuning on the training data
lr_gs.fit(X_train, y_train)
dt_gs.fit(X_train, y_train)
rf_gs.fit(X_train, y_train)
svm_gs.fit(X_train, y_train)
nn_gs.fit(X_train, y_train)



In [13]:
# Make predictions on the test data using the best hyperparameters for each base model
lr_pred = lr_gs.predict(X_test)
dt_pred = dt_gs.predict(X_test)
rf_pred = rf_gs.predict(X_test)
svm_pred = svm_gs.predict(X_test)
nn_pred = nn_gs.predict(X_test)

In [14]:
# Evaluate the performance of each base model using mean squared error
lr_r2 = r2_score(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_mse = mean_squared_error(y_test, lr_pred)
lr_rmse = mean_squared_error(y_test, lr_pred, squared=False)
dt_r2 = r2_score(y_test, dt_pred)
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_mse = mean_squared_error(y_test, dt_pred)
dt_rmse = mean_squared_error(y_test, dt_pred, squared=False)
rf_r2 = r2_score(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
svm_r2 = r2_score(y_test, svm_pred)
svm_mae = mean_absolute_error(y_test, svm_pred)
svm_mse = mean_squared_error(y_test, svm_pred)
svm_rmse = mean_squared_error(y_test, svm_pred, squared=False)
nn_r2 = r2_score(y_test, nn_pred)
nn_mae = mean_absolute_error(y_test, nn_pred)
nn_mse = mean_squared_error(y_test, nn_pred)
nn_rmse = mean_squared_error(y_test, nn_pred, squared=False)


print('Linear regression R2:', lr_r2)
print('Decision Tree R2:', dt_r2)
print('Random Forest R2:', rf_r2)
print('Support Vector Machine R2:', svm_r2)
print('Neural Network R2:', nn_r2)
print('Linear regression MAE:', lr_mae)
print('Decision Tree MAE:', dt_mae)
print('Random Forest MAE:', rf_mae)
print('Support Vector Machine MAE:', svm_mae)
print('Neural Network MAE:', nn_mae)
print('Linear regression MSE:', lr_mse)
print("Decision Tree MSE: ", dt_mse)
print('Random Forest MSE:', rf_mse)
print('Support Vector Machine MSE:', svm_mse)
print("Neural Network MSE: ", nn_mse)
print('Linear regression RMSE:', lr_rmse)
print('Decision Tree RMSE:', lr_rmse)
print('Random Forest RMSE:', rf_rmse)
print('Support Vector Machine RMSE:', svm_rmse)
print('Neural Network RMSE:', nn_rmse)

Linear regression R2: 0.15267971412397485
Decision Tree R2: 0.7159235853432285
Random Forest R2: 0.7794945629693338
Support Vector Machine R2: 0.14914329838848683
Neural Network R2: -0.06393979932259941
Linear regression MAE: 0.3030550808443197
Decision Tree MAE: 0.1691666666666666
Random Forest MAE: 0.15515937169856317
Support Vector Machine MAE: 0.3090981504762831
Neural Network MAE: 0.3339239210830276
Linear regression MSE: 0.12178816908991404
Decision Tree MSE:  0.040831249999999965
Random Forest MSE: 0.031693981482541086
Support Vector Machine MSE: 0.12229646991162818
Neural Network MSE:  0.152923613822635
Linear regression RMSE: 0.3489816171231861
Decision Tree RMSE: 0.3489816171231861
Random Forest RMSE: 0.17802803566444553
Support Vector Machine RMSE: 0.3497091218593364
Neural Network RMSE: 0.3910544895825069


In [15]:
# Build the Stacking Ensemble model using the 5 base models
stacked = StackingRegressor(regressors=[lr_gs.best_estimator_, dt_gs.best_estimator_,
                                        rf_gs.best_estimator_, svm_gs.best_estimator_, nn_gs.best_estimator_],
                            meta_regressor=lr_gs.best_estimator_)

In [16]:
# Fit the Stacking Ensemble model on the training data
stacked.fit(X_train, y_train)

In [17]:
# Make predictions on the test data using the Stacking Ensemble model
stacked_pred = stacked.predict(X_test)



In [18]:
# Evaluate the performance of the Stacking Ensemble model using mean squared error
stacked_r2 = r2_score(y_test, stacked_pred)
stacked_mae = mean_absolute_error(y_test, stacked_pred)
stacked_mse = mean_squared_error(y_test, stacked_pred)
stacked_rmse = mean_squared_error(y_test, stacked_pred, squared=False)
print('Stacking Ensemble R2:', stacked_r2)
print('Stacking Ensemble MAE:', stacked_mae)
print("Stacking Ensemble MSE: ", stacked_mse)
print('Stacking Ensemble RMSE:', stacked_rmse)

Stacking Ensemble R2: 0.7012391895883912
Stacking Ensemble MAE: 0.172751118841852
Stacking Ensemble MSE:  0.0429418871498286
Stacking Ensemble RMSE: 0.20722424363435038
