In [180]:
import pandas as pd

# This line of code Loads our datasets
corn_data = pd.read_csv('C:/Users/abiok/Downloads/Corn_Yield_Data_Main.csv')


# Display the first few rows of the dataset and basic information
# corn_data.head(), corn_data.info()

In [None]:
import tensorflow as tf
import random

#We set a random seed to ensure omore consistent reslts across the runs
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [181]:
corn_data['Cation_Exchange'] = pd.to_numeric(corn_data['Cation_Exchange'], errors='coerce')

# Check if there are any missing values after the conversion
missing_cation_exchange = corn_data['Cation_Exchange'].isnull().sum()
print(f"Missing values in 'Cation_Exchange': {missing_cation_exchange}")

#imputing the NUll cells with the median value of the cation
median_cation_exchange = corn_data['Cation_Exchange'].median()
corn_data['Cation_Exchange'].fillna(median_cation_exchange, inplace=True)


Missing values in 'Cation_Exchange': 182


In [182]:
#This line of code confirms that there are no more missing values
missing_after_imputation = corn_data['Cation_Exchange'].isnull().sum()
print(f"Missing values in 'Cation_Exchange' after imputation: {missing_after_imputation}")

Missing values in 'Cation_Exchange' after imputation: 0


In [183]:
# This code shows tha 'W_prcp_53' through 'W_vp_53' are weekly measurements
# The cells with NaN represents weeks that do not exist.

# Create a mask for the years with 53 weeks
mask_53 = corn_data['year'].isin([2010, 2015, 2016, 2020, 2021])

# For years with only 52 weeks, set the 53rd week caolumns to zero
for column in corn_data.columns:
    if '53' in column:
        corn_data.loc[~mask_53, column] = 0


In [None]:
pip install --upgrade tensorflow

In [184]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam


In [185]:
print(corn_data.columns)

Index(['loc_ID', 'year', 'Yield', 'W_prcp_1', 'W_prcp_2', 'W_prcp_3',
       'W_prcp_4', 'W_prcp_5', 'W_prcp_6', 'W_prcp_7',
       ...
       'Total_Sand', 'Total_Silt ', 'Total_Clay', 'Org_Matter',
       'Hyd_Conductivity', 'Avail_Water_Cap', 'Satiated_H2O',
       'Cation_Exchange', 'Soil_pH_H2O ', 'Corn_Pro_Pct_Planted'],
      dtype='object', length=331)


In [188]:
corn_data.columns = corn_data.columns.str.strip()

### Data Splitting

In [190]:
# Remove any leading or trailing spaces in column names
X_train.columns = X_train.columns.str.strip()
X_test.columns = X_test.columns.str.strip()


**Standardization and Normalizaton**

In [191]:
# Initialize scalers
scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Define features for each scaling method
weather_features = [col for col in corn_data.columns if col.startswith('W_')]  
soil_management_features = ['Total_Sand', 'Total_Silt', 'Total_Clay', 'Org_Matter',
                            'Hyd_Conductivity', 'Avail_Water_Cap', 'Satiated_H2O', 'Soil_pH_H2O',
                            'Corn_Pro_Pct_Planted']

# Apply StandardScaler to weather features
X_train[weather_features] = standard_scaler.fit_transform(X_train[weather_features])
X_test[weather_features] = standard_scaler.transform(X_test[weather_features])

# Apply MinMaxScaler to soil and management features
X_train[soil_management_features] = minmax_scaler.fit_transform(X_train[soil_management_features])
X_test[soil_management_features] = minmax_scaler.transform(X_test[soil_management_features])


In [193]:
# Splitting data
X = corn_data.drop('Yield', axis=1)
y = corn_data['Yield']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [194]:
# Check the shape of each subset
print("Training Set Size:", X_train.shape)
print("Validation Set Size:", X_val.shape)
print("Testing Set Size:", X_test.shape)

Training Set Size: (709, 330)
Validation Set Size: (237, 330)
Testing Set Size: (237, 330)


In [195]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [196]:
# Reshape data for LSTM if each sample is treated as a sequence of 1 time step with multiple features
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_scaled = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))


### Hyperparameter Tuning

In [200]:
from tensorflow.keras.layers import Input
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

def create_model(layers=[32], activation='relu'):
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))  # Adding an Input layer explicitly
    for layer_size in layers:
        model.add(Dense(layer_size, activation=activation))
    model.add(Dense(1, activation='linear'))  # Change to linear activation for regression
    model.compile(optimizer='adam', loss='mean_squared_error')  # Change loss to MSE for regression
    return model

# Define the grid of hyperparameters
params_grid = {
    'layers': [[32], [64, 32], [32, 16, 8]],
    'activation': ['relu', 'tanh'],
    'batch_size': [16, 32, 64],
    'epochs': [50, 100]
}

best_score = np.inf  # Initialize with infinity, as we want to minimize the loss
best_params = {}

# K-Fold for regression tasks
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over all combinations of parameters
for layers in params_grid['layers']:
    for activation in params_grid['activation']:
        for batch_size in params_grid['batch_size']:
            for epochs in params_grid['epochs']:
                scores = []
                for train_ix, test_ix in kfold.split(X_train, y_train):
                    # Create model with the current set of parameters
                    model = create_model(layers=layers, activation=activation)
                    # Fit the model
                    model.fit(X_train.iloc[train_ix], y_train.iloc[train_ix], 
                              epochs=epochs, batch_size=batch_size, verbose=0)

                    # Evaluate the model using a regression metric like MSE
                    mse = model.evaluate(X_train.iloc[test_ix], y_train.iloc[test_ix], verbose=0)
                    scores.append(mse)
                # Compute average score across all folds
                mean_score = np.mean(scores)
                if mean_score < best_score:  # Note the change here to a less-than check
                    best_score = mean_score
                    best_params = {
                        'layers': layers,
                        'activation': activation,
                        'batch_size': batch_size,
                        'epochs': epochs
                    }

print("Best Score (MSE, lower is better): {:.2f}".format(best_score))
print("Best Parameters:", best_params)


Best Score (MSE, lower is better): 754.41
Best Parameters: {'layers': [32, 16, 8], 'activation': 'relu', 'batch_size': 16, 'epochs': 100}


### Model Building

In [201]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=input_shape))  
    model.add(LSTM(32, return_sequences=False))  
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Create and train the LSTM model
lstm_model = create_lstm_model((1, X_train_scaled.shape[2]))  # Ensure the input shape is correctly passed
lstm_history = lstm_model.fit(X_train_scaled, y_train, epochs=100, batch_size=16, verbose=1)


Epoch 1/100


  super().__init__(**kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 31601.2715
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 30336.8555
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 29029.2559
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 28338.8633
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 27811.7695
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 27344.4746
Epoch 7/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 26909.7402
Epoch 8/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 26496.8047
Epoch 9/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 26098.7559
Epoch 10/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 8588.6611
Epoch 80/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 8431.7910
Epoch 81/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 8277.0449
Epoch 82/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 8124.4048
Epoch 83/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 7973.8613
Epoch 84/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 7825.4033
Epoch 85/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 7679.0171
Epoch 86/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 7534.6895
Epoch 87/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 7392.4097
Epoch 88/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [138]:
# Model training results
training_results = lstm_history.history
print(training_results)

{'loss': [31741.810546875, 30127.7890625, 29168.2421875, 28656.099609375, 27985.67578125, 27641.939453125, 27066.630859375, 26805.814453125, 26350.67578125, 26002.6953125, 25529.150390625, 25109.38671875, 24815.40234375, 24607.427734375, 24074.0078125, 23794.34765625, 23536.4609375, 23151.80859375, 22761.302734375, 22331.638671875, 22182.580078125, 21774.96484375, 21510.71875, 21270.84375, 20895.935546875, 20624.8125, 20223.248046875, 19975.76953125, 19686.361328125, 19404.6171875, 19175.94921875, 18848.419921875, 18697.765625, 18474.44921875, 18135.0078125, 17736.697265625, 17490.30078125, 17274.15625, 17055.63671875, 16892.115234375, 16516.9453125, 16264.9287109375, 15998.9794921875, 15688.6162109375, 15551.0947265625, 15269.716796875, 15047.8974609375, 14832.4072265625, 14668.8193359375, 14319.798828125, 14152.5390625, 13899.2265625, 13651.171875, 13424.111328125, 13215.232421875, 13044.6845703125, 12872.306640625, 12639.4794921875, 12334.5986328125, 12177.142578125, 11985.974609375

In [139]:
# Evaluate the model
loss = lstm_model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'Test loss: {loss}')

Test loss: 5886.2080078125


In [140]:
# Calculate the number of features from the total size
num_features = 78210 // 237

# Reshape X_test_scaled properly
X_test_reshaped = X_test_scaled.reshape((237, 1, num_features))

# Evaluate the LSTM model
lstm_mse = lstm_model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f'LSTM Model MSE: {lstm_mse}')


LSTM Model MSE: 5886.2080078125


In [141]:
# Calculate RMSE from MSE
rmse_lstm = np.sqrt(lstm_mse)
print(f'LSTM Model RMSE: {rmse_lstm}')

LSTM Model RMSE: 76.72162672814297


**so the library is scipy, within that one, there is the stat module import pearsonr**
**from scipy.stats import pearsonr**