In [None]:
# ================================
# 1. IMPORT LIBRARIES
# ================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

import joblib

# ================================
# 2. LOAD DATASET
# ================================
# Read the raw crop-yield table from CSV and show its first rows as a
# quick sanity check of the columns and values.
CSV_PATH = '/crop_yield.csv'

df = pd.read_csv(CSV_PATH)

print("Dataset loaded successfully")
print(df.head(5))

# ================================
# 3. DROP UNWANTED COLUMN
# ================================
# 'Production' is treated as target leakage for 'Yield', so it is removed
# from the frame before any features are built.
df = df.drop('Production', axis=1)

# ================================
# 4. HANDLE CATEGORICAL DATA
# ================================
# Integer-encode every categorical column with its own LabelEncoder and keep
# the fitted encoders (keyed by column name) so the exact same mapping can be
# re-applied when predicting on new inputs.
label_encoders = {}

categorical_cols = ['Crop', 'Season', 'State']

for col in categorical_cols:
    # The raw labels carry stray padding (e.g. 'Coconut ', space-padded Season
    # values). Strip it before fitting; otherwise the saved encoders would only
    # recognise the padded spellings and reject clean strings such as 'Kharif'
    # as unseen labels at inference time. Missing values are left untouched.
    df[col] = df[col].str.strip()

    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("\nCategorical columns encoded")

# ================================
# 5. DEFINE FEATURES & TARGET
# ================================
# 'Yield' is the regression target; every remaining column is a feature.
y = df['Yield']
X = df.drop('Yield', axis=1)

# ================================
# 6. SPLIT DATA
# ================================
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ================================
# 7. TRAIN MODEL
# ================================
# Random forest regressor with 200 trees and a fixed seed, fitted on the
# training split only.
rf_params = {'n_estimators': 200, 'random_state': 42}
model = RandomForestRegressor(**rf_params)

model.fit(X_train, y_train)

print("\nModel training completed")

# ================================
# 8. EVALUATE MODEL
# ================================
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report both metrics under a common heading.
print("\nModel Performance:")
for metric_label, metric_value in (("R2 Score:", r2), ("Mean Absolute Error:", mae)):
    print(metric_label, metric_value)

# ================================
# 9. SAVE MODEL & ENCODERS
# ================================
# Persist the trained model and the fitted label encoders side by side so
# both can be loaded together later.
artifacts = {
    'model.pkl': model,
    'label_encoders.pkl': label_encoders,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\nmodel.pkl and label_encoders.pkl saved successfully")

# ================================
# 10. TEST PREDICTION (SAMPLE)
# ================================
# Smoke test: predict on the first feature row, kept as a one-row DataFrame
# so the model receives the same column layout it was trained on.
sample_input = X.head(1)
sample_prediction = model.predict(sample_input)

first_prediction = sample_prediction[0]
print("\nSample Prediction (Yield):", first_prediction)


Dataset loaded successfully
           Crop  Crop_Year       Season  State     Area  Production  \
0      Arecanut       1997  Whole Year   Assam  73814.0       56708   
1     Arhar/Tur       1997  Kharif       Assam   6637.0        4685   
2   Castor seed       1997  Kharif       Assam    796.0          22   
3      Coconut        1997  Whole Year   Assam  19656.0   126905000   
4  Cotton(lint)       1997  Kharif       Assam   1739.0         794   

   Annual_Rainfall  Fertilizer  Pesticide        Yield  
0           2051.4  7024878.38   22882.34     0.796087  
1           2051.4   631643.29    2057.47     0.710435  
2           2051.4    75755.32     246.76     0.238333  
3           2051.4  1870661.52    6093.36  5238.051739  
4           2051.4   165500.63     539.09     0.420909  

Categorical columns encoded

Model training completed

Model Performance:
R2 Score: 0.9789490743221669
Mean Absolute Error: 10.140026610903465

model.pkl and label_encoders.pkl saved successfully

Sampl

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)