<a href="https://colab.research.google.com/github/CHIMAOLEFORO/3MTT-DATA-SCIENCE-UPSKILLING-2025/blob/main/CHIMA_ENSEMBLE_CODES_CROP_YIELD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SAMPLE CODE FOR CHIMA'S AGRICULTURAL YIELD PREDICTION USING AN ENSEMBLE OF MACHINE LEARNING ALGORITHMS

In [None]:
!pip install scikit-learn xgboost lightgbm pandas matplotlib seaborn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb

# Read the raw table from disk.
df = pd.read_csv('agriculture_dataset.csv')  # replace with your dataset path

# Print the first rows as a quick sanity check of what was loaded.
print(df.head())

# Discard every row that contains at least one missing value.
df = df.dropna()

# Separate the target column from the predictors.
y = df['crop_yield']  # Replace 'crop_yield' with actual yield column name
X = df.drop(columns='crop_yield')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Settings shared by all tree-ensemble regressors below.
tree_kwargs = dict(n_estimators=100, random_state=42)

# Candidate regressors, keyed by the label used in the printed report.
# LinearRegression, Ridge and SVR are created with their default settings.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'RandomForest': RandomForestRegressor(**tree_kwargs),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', **tree_kwargs),
    'LightGBM': lgb.LGBMRegressor(**tree_kwargs),
    'GradientBoosting': GradientBoostingRegressor(**tree_kwargs),
}

# Fit each candidate on the scaled training data and report its
# hold-out RMSE and R2, one line per model.
print("\nBase Model Performance:\n" + "-"*30)
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: RMSE = {rmse:.2f}, R2 = {r2:.2f}")

# Create a stacking ensemble: four of the base regressors feed a
# gradient-boosting meta-learner.
stack = StackingRegressor(
    estimators=[
        ('xgb', models['XGBoost']),
        ('lgb', models['LightGBM']),
        ('rf', models['RandomForest']),
        ('ridge', models['Ridge']),
    ],
    # Seed the meta-learner like every other seeded model in this script;
    # without it the ensemble's metrics can change from run to run.
    final_estimator=GradientBoostingRegressor(n_estimators=100, random_state=42),
    # Per the scikit-learn docs, passthrough=True trains the final estimator
    # on the original features as well as the base models' predictions.
    passthrough=True,
)

# Train the stacked model on the scaled training data and predict the hold-out set
stack.fit(X_train_scaled, y_train)
stack_pred = stack.predict(X_test_scaled)

# Ensemble performance on the same hold-out set used for the base models
print("\nEnsemble Model Performance:\n" + "-"*30)
stack_rmse = np.sqrt(mean_squared_error(y_test, stack_pred))
stack_r2 = r2_score(y_test, stack_pred)
print(f"Stacked Model: RMSE = {stack_rmse:.2f}, R2 = {stack_r2:.2f}")

# Scatter the stacked model's predictions against the true hold-out yields.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, stack_pred, alpha=0.7)

# Dashed y = x reference line spanning the full range of observed yields;
# a point on this line is a perfect prediction.
yield_range = [y.min(), y.max()]
plt.plot(yield_range, yield_range, 'k--', lw=2)

plt.xlabel('Actual Yield')
plt.ylabel('Predicted Yield')
plt.title('Actual vs Predicted Crop Yield (Stacked Model)')
plt.show()


Time Series Forecasting with Machine Learning
We'll:

- Aggregate and sort data by time
- Create lag features (previous yield values)
- Use ensemble regressors for forecasting
- Optionally add exogenous variables (like rainfall, temperature, etc.)

In [None]:
!pip install pandas scikit-learn xgboost lightgbm matplotlib seaborn


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the time series from disk.
df = pd.read_csv('crop_yield_timeseries.csv')  # replace with your dataset

# Parse the date column and put the rows in chronological order, so that
# shifting the yield column below really looks back in time.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Example: ['date', 'yield', 'rainfall', 'temperature']
# Add the yield from 1, 2 and 3 rows earlier as extra columns.
for lag in (1, 2, 3):
    df[f'yield_lag_{lag}'] = df['yield'].shift(lag)

# Shifting leaves NaN in the earliest rows; remove every row with a missing value.
df = df.dropna()

# Predictors: every lag column plus the exogenous variables, when present.
exogenous = ['rainfall', 'temperature']
features = [col for col in df.columns if 'lag' in col or col in exogenous]
X = df[features]
y = df['yield']

# Train-test split (preserving time order): the earliest 80% of rows train
# the model, the most recent 20% are held out. .iloc makes the positional
# slicing explicit, independent of the index labels left over from sorting.
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Model (you can swap in RandomForest, XGBoost, LightGBM, etc.)
# A fixed seed keeps the reported forecast metrics repeatable between runs.
model = GradientBoostingRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out (most recent) rows
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Forecast RMSE: {rmse:.2f}")
print(f"Forecast R2 Score: {r2:.2f}")

# Plot the held-out period: actual yields as a solid line, forecasts dashed.
test_dates = df['date'][train_size:]

plt.figure(figsize=(12, 6))
plt.plot(test_dates, y_test.values, label='Actual Yield')
plt.plot(test_dates, y_pred, label='Predicted Yield', linestyle='--')
plt.title('Time Series Forecast: Actual vs Predicted Crop Yield')
plt.xlabel('Date')
plt.ylabel('Crop Yield')
plt.legend()
plt.grid(True)
plt.show()


Next, we build a deep learning LSTM model for long-range crop yield forecasting using Keras (TensorFlow). This model is particularly useful when your data has strong temporal dependencies (like annual or seasonal crop cycles).

In [None]:
!pip install tensorflow pandas matplotlib scikit-learn


LSTM Crop Yield Forecasting Code

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset and put the rows in chronological order
df = pd.read_csv('crop_yield_timeseries.csv')  # ensure this contains a 'date' and 'yield' column
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Normalize yield values.
# Fit the scaler on the earliest 80% of rows only, so the min/max of the
# later period never leaks into preprocessing. With the 12-step windows and
# the 80/20 sequence split used further down, these rows all fall inside
# the training windows. The fitted scaler is then applied to the whole
# series; later values may fall outside [0, 1], which is expected.
n_fit_rows = int(0.8 * len(df))
scaler = MinMaxScaler()
scaler.fit(df[['yield']].iloc[:n_fit_rows])
df['yield_scaled'] = scaler.transform(df[['yield']])
# Create time series sequences
def create_sequences(data, seq_length=12):
    """Slice a series into overlapping windows and their next-step targets.

    Args:
        data: Sequence of values supporting len(), slicing and indexing.
        seq_length: Number of consecutive values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[k]`` holds
        ``data[k:k + seq_length]`` and ``y[k]`` is ``data[k + seq_length]``.
        Both arrays are empty when ``data`` has no more than ``seq_length``
        items.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

sequence_length = 12  # Use past 12 time steps (e.g. months or years)
X, y = create_sequences(df['yield_scaled'].values, seq_length=sequence_length)

# Chronological split: the first 80% of the windows train the network,
# the remaining windows are held out.
train_size = int(0.8 * len(X))
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# The LSTM layers take [samples, time steps, features]; add a trailing
# feature axis of size 1 because yield is the only input variable.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Two stacked 64-unit LSTM layers, each followed by 20% dropout, ending in
# a single linear output. The first LSTM returns the whole sequence so the
# second one receives one vector per time step.
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(sequence_length, 1)))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
model.summary()

# Train for 50 epochs.
# NOTE(review): the held-out windows double as validation data here, so the
# validation loss logged during training is not an independent check.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Predict the held-out windows and map predictions and targets back to
# the original yield units.
y_pred = model.predict(X_test)
y_pred_rescaled = scaler.inverse_transform(y_pred)
y_test_rescaled = scaler.inverse_transform(y_test.reshape(-1, 1))

# Evaluation in original units
rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))
r2 = r2_score(y_test_rescaled, y_pred_rescaled)
print(f"LSTM Forecast RMSE: {rmse:.2f}")
print(f"LSTM Forecast R2 Score: {r2:.2f}")

# Plot the held-out period; its targets are the last len(y_test) dates.
test_dates = df['date'][-len(y_test):]

plt.figure(figsize=(12, 6))
plt.plot(test_dates, y_test_rescaled, label='Actual')
plt.plot(test_dates, y_pred_rescaled, label='Predicted', linestyle='--')
plt.title('LSTM Crop Yield Forecasting')
plt.xlabel('Date')
plt.ylabel('Crop Yield')
plt.legend()
plt.grid(True)
plt.show()
