### Section 1

Initializing the Libraries into notebook

In [85]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore') #ignoring some deprication warnings

Importing the raw data and checking for missing values

In [86]:
df = pd.read_csv('crop_yield.csv')

df.info()
print(df['Crop'].nunique())
print(df['Region'].nunique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Region                  1000000 non-null  object 
 1   Soil_Type               1000000 non-null  object 
 2   Crop                    1000000 non-null  object 
 3   Rainfall_mm             1000000 non-null  float64
 4   Temperature_Celsius     1000000 non-null  float64
 5   Fertilizer_Used         1000000 non-null  bool   
 6   Irrigation_Used         1000000 non-null  bool   
 7   Weather_Condition       1000000 non-null  object 
 8   Days_to_Harvest         1000000 non-null  int64  
 9   Yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB
6
4


### Check for NaN Values

In [87]:
print("Number of NaN values in each column:")
print(df.isna().sum())
print(df['Yield_tons_per_hectare'].describe())
print(df['Yield_tons_per_hectare'].isnull().sum())  # Ensure no NaNs

Number of NaN values in each column:
Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64
count    1000000.000000
mean           4.649472
std            1.696572
min           -1.147613
25%            3.417637
50%            4.651808
75%            5.879200
max            9.963372
Name: Yield_tons_per_hectare, dtype: float64
0


### Pre-Processing required. 
**Encoding**
* Region
* Soil_Type
* Crop
* Weather_Condition

**Scaling**
* Rainfall_mm
* Temperature_Celsius 
* Days_to_Harvest
* Yield_tons_per_hectare

In [88]:
#Code area for pre-processing

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct_All = ColumnTransformer(
    [("scaling", StandardScaler(), ['Rainfall_mm','Temperature_Celsius']),
     ("onehot", OneHotEncoder(sparse_output=False), ['Region', 'Soil_Type', 'Crop', 'Weather_Condition'])])

ct_encode = ColumnTransformer(
    [("onehot", OneHotEncoder(sparse_output=False), ['Region', 'Soil_Type', 'Crop', 'Weather_Condition'])])

### Initialize Pipeline

In order to evalute our three models we will create a pipeline to allow us to perform the necessary preprocessing steps for each model 

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
pipeline = Pipeline([
    ('preprocessor', ct_encode),
    ('model', Ridge())
])



### Now we must create the param grid for our grid search

In [90]:
param_grid = [
    {
        'preprocessor': [ct_All],
        'model': [Ridge()],
        'model__alpha': np.logspace(-3, 3, 7)
    },
    {
        'preprocessor': [ct_encode],
        'model': [GradientBoostingRegressor(random_state=0)],
        'model__n_estimators': [10, 50, 100, 200, 500],
        'model__max_depth': [3, 5, 7, 9, 11],
    },
    {
        'preprocessor': [ct_encode],
        'model': [RandomForestRegressor(random_state=0)],
        'model__n_estimators': [10, 50, 100, 200, 500],
        'model__max_depth': [3, 5, 7, 9, 11],
    }
]

### Next we must split the the data into test and training splits

In [91]:

df = df.sample(n = 100000)
X = df.drop('Yield_tons_per_hectare', axis = 1)
y = df['Yield_tons_per_hectare']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

### Now we will implement the grid search

In [92]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [93]:
print("Best Parameters:", grid_search.best_params_)
print("Best Train R²:", grid_search.best_score_)
best_cv_test_score = max(grid_search.cv_results_['mean_test_score'])
print("Best Test R²:", best_cv_test_score)
test_score = grid_search.score(X_test, y_test)
print("Test Set R²:", test_score)

Best Parameters: {'model': Ridge(), 'model__alpha': np.float64(100.0), 'preprocessor': ColumnTransformer(transformers=[('scaling', StandardScaler(),
                                 ['Rainfall_mm', 'Temperature_Celsius']),
                                ('onehot', OneHotEncoder(sparse_output=False),
                                 ['Region', 'Soil_Type', 'Crop',
                                  'Weather_Condition'])])}
Best Train R²: 0.59089687048257
Best Test R²: 0.59089687048257
Test Set R²: 0.5915531420413462
