In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [5]:
# Load the crop-yield table from the Kaggle input directory into a DataFrame.
csv_path = "/kaggle/input/dataset/crop_yield.csv"
df = pd.read_csv(csv_path)

In [6]:
# Preview the top five rows to sanity-check the columns and their values.
df.head(n=5)

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [7]:
# Tidy numeric precision and dtypes before modelling.

# Keep the two continuous weather measurements to two decimal places.
for measurement in ("Rainfall_mm", "Temperature_Celsius"):
    df[measurement] = df[measurement].round(2)

# Re-express the True/False flags as 1/0 integers.
for flag in ("Fertilizer_Used", "Irrigation_Used"):
    df[flag] = df[flag].astype(int)

print(df)


       Region Soil_Type     Crop  Rainfall_mm  Temperature_Celsius  \
0        West     Sandy   Cotton       897.08                27.68   
1       South      Clay     Rice       992.67                18.03   
2       North      Loam   Barley       148.00                29.79   
3       North     Sandy  Soybean       986.87                16.64   
4       South      Silt    Wheat       730.38                31.62   
...       ...       ...      ...          ...                  ...   
999995   West      Silt     Rice       302.81                27.99   
999996  South    Chalky   Barley       932.99                39.66   
999997  North     Peaty   Cotton       867.36                24.37   
999998   West      Silt    Wheat       492.81                33.05   
999999   West     Sandy    Maize       180.94                27.30   

        Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
0                     0                1            Cloudy              122   
1

In [8]:
# Replace each categorical text column with integer codes.
# One fitted LabelEncoder per column is kept in `label_encoders` so the same
# mapping can be reused (or inverted) later.
# Note: the columns are overwritten in `df`, so re-running this cell would fit
# the encoders on the integer codes instead of the original labels.
categorical_cols = ["Region", "Soil_Type", "Crop", "Weather_Condition"]
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [9]:
# Separate the prediction target from the feature columns.
target_col = "Yield_tons_per_hectare"
y = df[target_col]
X = df.drop(columns=[target_col])


In [10]:
# Standardize every feature column to zero mean and unit variance.
# X also contains the label-encoded categorical columns, so those are scaled too.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Split into training and testing sets (80% / 20%).
#
# The split is made on the unscaled features and the scaler is then re-fitted
# on the training rows only, so the mean/variance of the held-out rows never
# leaks into the preprocessing. With the same random_state and row count this
# selects the same train/test rows as splitting the pre-scaled matrix did.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the same transform to the test rows.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [12]:
# Train a Random Forest regressor on the training split and predict the
# held-out rows.
# n_jobs=-1 builds the 100 trees on all available cores; random_state still
# fixes how each tree is seeded, so the fitted forest does not depend on the
# number of workers.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)


In [13]:
# Score the predictions on the held-out rows.
# The printed figure is the R² coefficient of determination scaled by 100,
# not a classification accuracy.
rf_r2 = r2_score(y_true=y_test, y_pred=rf_preds)
print("Prediction accuracy:", 100 * rf_r2)


Prediction accuracy: 90.73748654773166


In [14]:
import joblib

# rf_model was trained on label-encoded, standardized inputs, so the saved
# forest is only usable together with the fitted encoders and scaler.
# Persist them (plus the feature column order) in a companion file.
joblib.dump(
    {
        "scaler": scaler,
        "label_encoders": label_encoders,
        "feature_names": list(X.columns),
    },
    "preprocessing.joblib",
    compress=9,
)

# Save the trained model (maximum compression to keep the download small).
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(rf_model, "random_forest_model.joblib", compress=9)

['random_forest_model.joblib']

In [15]:
from IPython.display import FileLink

In [16]:
# Render a clickable link so the saved model file can be downloaded from the notebook.
model_link = FileLink("random_forest_model.joblib")
model_link

In [1]:
import sklearn

# Record the scikit-learn version of this environment; a joblib-saved model
# should be loaded with a matching version.
sklearn_version = sklearn.__version__
print(sklearn_version)


1.2.2


In [1]:
import xgboost

In [2]:
# Report the installed xgboost version for environment bookkeeping.
xgboost_version = xgboost.__version__
print(xgboost_version)

2.0.3
