### Section 1

Initializing the Libraries into notebook

In [2]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore') #ignoring some deprication warnings

Importing the raw data and checking for missing values

In [3]:
df = pd.read_csv('crop_yield.csv')

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Region                  1000000 non-null  object 
 1   Soil_Type               1000000 non-null  object 
 2   Crop                    1000000 non-null  object 
 3   Rainfall_mm             1000000 non-null  float64
 4   Temperature_Celsius     1000000 non-null  float64
 5   Fertilizer_Used         1000000 non-null  bool   
 6   Irrigation_Used         1000000 non-null  bool   
 7   Weather_Condition       1000000 non-null  object 
 8   Days_to_Harvest         1000000 non-null  int64  
 9   Yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB


### Check for NaN Values

In [4]:
print("Number of NaN values in each column:")
print(df.isna().sum())

Number of NaN values in each column:
Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64


### Pre-Processing required. 
**Encoding**
* Region
* Soil_Type
* Crop
* Weather_Condition

**Scaling**
* Rainfall_mm
* Temperature_Celsius 
* Days_to_Harvest
* Yield_tons_per_hectare

In [5]:
#Code area for pre-processing

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct_All = ColumnTransformer(
    [("scaling", StandardScaler(), ['Rainfall_mm','Temperature_Celsius']),
     ("onehot", OneHotEncoder(sparse_output=False), ['Region', 'Soil_Type', 'Crop', 'Weather_Condition'])])

ct_encode = ColumnTransformer(
    [("onehot", OneHotEncoder(sparse_output=False), ['Region', 'Soil_Type', 'Crop', 'Weather_Condition'])])

### Initialize Pipeline

In order to evalute our three models we will create a pipeline to allow us to perform the necessary preprocessing steps for each model 

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
pipeline = Pipeline([
    ('preprocessor', ct_All),
    ('model', LogisticRegression(max_iter = 1000))
])



### Now we must create the param grid for our grid search

In [None]:
lr = LogisticRegression(max_iter = 1000)
rf = RandomForestRegressor(random_state=0)
gbr = GradientBoostingRegressor(random_state=0)
param_grid = [
    {
        'preprocessor': [ct_All],
        'model': [lr],
        'model__max_iter': [100,250,500,750,1000]
    },
    {
        'preprocessor': [ct_encode],
        'model': [gbr],
        'model__n_estimators': [10, 50, 100, 200, 500],
        'model__max_depth': [3, 5, 7, 9, 11],
    },
    {
        'preprocessor': [ct_encode],
        'model': [rf],
        'model__n_estimators': [10, 50, 100, 200, 500],
        'model__max_depth': [3, 5, 7, 9, 11],
    }
]

### Next we must split the the data into test and training splits

In [15]:
df['yield_tons_per_day'] = df['Yield_tons_per_hectare'] / df ['Days_to_Harvest']
df.drop('Yield_tons_per_hectare', axis = 1)
df.drop('Days_to_Harvest', axis = 1)
df = df.sample(n = 10000)
X = df.drop('yield_tons_per_day', axis = 1)
y = df['yield_tons_per_day']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

### Now we will implement the grid search

In [16]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

TypeError: Parameter grid for parameter 'preprocessor' needs to be a list or a numpy array, but got ColumnTransformer(transformers=[('scaling', StandardScaler(),
                                 ['Rainfall_mm', 'Temperature_Celsius']),
                                ('onehot', OneHotEncoder(sparse_output=False),
                                 ['Region', 'Soil_Type', 'Crop',
                                  'Weather_Condition'])]) (of type ColumnTransformer) instead. Single values need to be wrapped in a list with one element.

In [1]:
print("Best Parameters:", grid_search.best_params_)
print("Best Train R²:", grid_search.best_score_)
best_cv_test_score = max(grid_search.cv_results_['mean_test_score'])
print("Best Test R²:", best_cv_test_score)
test_score = grid_search.score(X_test, y_test)
print("Test Set R²:", test_score)

NameError: name 'grid_search' is not defined