# Load the Data

In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('CropYieldData.csv')  # Replace with your actual file path

In [2]:
# Display the first few rows
data.head(10)

Unnamed: 0.1,Unnamed: 0,Year,Crop Type,Yield (kg/ha),Avg Temp (°C),Total Rainfall (mm),Avg Sunlight (hours/day),Soil pH,Soil Moisture (%),Fertilizer Used (kg/ha),Pesticide Used (L/ha),Region
0,0,2024,Wheat,7374,17.7,402,10.0,5.8,43.5,134,5.4,South
1,1,2021,Wheat,6980,28.7,370,7.3,6.7,42.2,102,6.2,South
2,2,2022,Rice,7630,25.7,373,6.5,6.4,44.6,150,5.7,West
3,3,2024,Wheat,7722,27.3,352,7.7,6.3,27.8,116,5.0,North
4,4,2020,Wheat,6294,23.1,382,7.8,6.5,33.1,134,6.4,South
5,5,2022,Corn,5963,28.1,382,11.2,6.7,26.9,101,5.4,South
6,6,2022,Wheat,6407,28.3,364,7.8,6.6,34.0,146,6.7,North
7,7,2024,Rice,6605,21.6,381,10.7,6.4,25.8,108,6.3,West
8,8,2019,Wheat,7565,21.9,324,9.5,5.9,38.2,147,5.3,West
9,9,2020,Corn,8299,23.6,368,6.6,6.2,33.4,112,5.7,West


# Data Inspection

In [3]:
# Check the structure and data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                20000 non-null  int64  
 1   Year                      20000 non-null  int64  
 2   Crop Type                 20000 non-null  object 
 3   Yield (kg/ha)             20000 non-null  int64  
 4   Avg Temp (°C)             20000 non-null  float64
 5   Total Rainfall (mm)       20000 non-null  int64  
 6   Avg Sunlight (hours/day)  20000 non-null  float64
 7   Soil pH                   20000 non-null  float64
 8   Soil Moisture (%)         20000 non-null  float64
 9   Fertilizer Used (kg/ha)   20000 non-null  int64  
 10  Pesticide Used (L/ha)     20000 non-null  float64
 11  Region                    20000 non-null  object 
dtypes: float64(5), int64(5), object(2)
memory usage: 1.8+ MB


In [4]:
# Display summary statistics
data.describe()

Unnamed: 0.1,Unnamed: 0,Year,Yield (kg/ha),Avg Temp (°C),Total Rainfall (mm),Avg Sunlight (hours/day),Soil pH,Soil Moisture (%),Fertilizer Used (kg/ha),Pesticide Used (L/ha)
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,9999.5,2021.0022,6507.443,23.532565,344.6975,8.9951,6.300585,34.463495,127.0361,5.751855
std,5773.647028,2.002622,1157.295887,3.742963,37.789076,1.725103,0.291631,6.064777,18.755881,0.721758
min,0.0,2018.0,4500.0,17.0,280.0,6.0,5.8,24.0,95.0,4.5
25%,4999.75,2019.0,5504.0,20.3,312.0,7.5,6.0,29.2,111.0,5.1
50%,9999.5,2021.0,6520.0,23.5,345.0,9.0,6.3,34.4,127.0,5.7
75%,14999.25,2023.0,7510.0,26.8,378.0,10.5,6.5,39.7,143.0,6.4
max,19999.0,2024.0,8499.0,30.0,409.0,12.0,6.8,45.0,159.0,7.0


In [5]:
# Check for missing values
data.isnull().sum()

Unnamed: 0                  0
Year                        0
Crop Type                   0
Yield (kg/ha)               0
Avg Temp (°C)               0
Total Rainfall (mm)         0
Avg Sunlight (hours/day)    0
Soil pH                     0
Soil Moisture (%)           0
Fertilizer Used (kg/ha)     0
Pesticide Used (L/ha)       0
Region                      0
dtype: int64

In [6]:
# Check for duplicates values
print(data.duplicated(keep=False).sum())

0


# Data Preparation

In [7]:
# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['Crop Type', 'Region'], drop_first=True)

In [9]:
from sklearn.preprocessing import StandardScaler

# Selecting features and target variable
X = data.drop(['Unnamed: 0', 'Yield (kg/ha)'], axis=1)  # Dropping ID and target
y = data['Yield (kg/ha)']

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
# Convert the scaled features back into a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Display the first few rows of the scaled data
X_scaled_df.head()

Unnamed: 0,Year,Avg Temp (°C),Total Rainfall (mm),Avg Sunlight (hours/day),Soil pH,Soil Moisture (%),Fertilizer Used (kg/ha),Pesticide Used (L/ha),Crop Type_Rice,Crop Type_Wheat,Region_South,Region_West
0,1.496975,-1.558314,1.516415,0.582531,-1.716547,1.490035,0.371301,-0.487509,-0.708088,1.404354,1.415594,-0.703795
1,-0.001099,1.380608,0.669589,-0.982633,1.369626,1.275677,-1.334873,0.620923,-0.708088,1.404354,1.415594,-0.703795
2,0.498259,0.579084,0.748979,-1.446385,0.340902,1.671415,1.224388,-0.071847,1.412254,-0.712071,-0.706417,1.420869
3,1.496975,1.006563,0.193249,-0.750757,-0.002006,-1.098748,-0.588422,-1.041726,-0.708088,1.404354,-0.706417,-0.703795
4,-0.500456,-0.11557,0.987149,-0.692788,0.68381,-0.224828,0.371301,0.898031,-0.708088,1.404354,1.415594,-0.703795


In [16]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [17]:
# Check the shape of the splits
print(X_train.shape, X_test.shape)

(16000, 12) (4000, 12)


# Train the Model

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make Predictions

In [19]:
# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the Model

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate MAE and MSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 1022.9687574999999
Mean Squared Error (MSE): 1406861.506513475
Root Mean Squared Error (RMSE): 1186.1119283244204




# Hyperparameter Tuning

### Using Grid Search with Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,  # 3-fold cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=2,
                           scoring='neg_mean_squared_error')

# Fit Grid Search on training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Best estimator
best_rf = grid_search.best_estimator_


Fitting 3 folds for each of 216 candidates, totalling 648 fits


### Using Randomized Search for Faster Results

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distribution
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(10, 31, 10)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Initialize Randomized Search
random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions=param_dist,
                                   n_iter=100,  # Number of parameter settings sampled
                                   cv=3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs=-1,
                                   scoring='neg_mean_squared_error')

# Fit Randomized Search on training data
random_search.fit(X_train, y_train)

# Get the best parameters
best_params_random = random_search.best_params_
print("Best Parameters from Randomized Search:", best_params_random)

# Best estimator
best_rf_random = random_search.best_estimator_