In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("APY.csv")
np.random.seed(42)

df["NDVI_mean"] = np.random.uniform(0.3, 0.8, len(df))
df["rainfall_mm"] = np.random.uniform(50, 300, len(df))
df["temp_avg"] = np.random.uniform(18, 35, len(df))
df["soil_pH"] = np.random.uniform(5.5, 8.0, len(df))

df.to_csv("multimodal_crop_dataset.csv", index=False)


In [2]:
df.columns = df.columns.str.strip()
print(df.columns.tolist())


['State', 'District', 'Crop', 'Crop_Year', 'Season', 'Area', 'Production', 'Yield', 'NDVI_mean', 'rainfall_mm', 'temp_avg', 'soil_pH']


In [3]:
df.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production,Yield,NDVI_mean,rainfall_mm,temp_avg,soil_pH
0,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Kharif,2439.6,3415.0,1.4,0.48727,241.752852,20.643846,5.966527
1,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Rabi,1626.4,2277.0,1.4,0.775357,261.712505,28.169403,6.272131
2,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Autumn,4147.0,3060.0,0.74,0.665997,240.590931,20.29412,7.211945
3,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Summer,4147.0,2660.0,0.64,0.599329,155.305599,29.461618,5.844678
4,Andaman and Nicobar Island,NICOBARS,Arecanut,2009,Autumn,4153.0,3120.0,0.75,0.378009,289.390435,26.343111,6.957586


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Drop rows with NaN in the target variable 'Yield'
df.dropna(subset=['Yield'], inplace=True)

encoders = {}
for col in ["State", "District", "Season", "Crop"]:
    le = LabelEncoder()
    # Fit the encoder, and handle unknown labels by ignoring them
    le.fit(df[col].astype(str))
    df[col] = le.transform(df[col].astype(str))
    encoders[col] = le
    joblib.dump(le, f"{col}_encoder.pkl")  # save encoder


# Prepare features and target
X = df[["State", "District", "Crop","Crop_Year", "Season", "Area"]]

y = df["Yield"]

# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)


pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("R2 Score:", r2_score(y_test, pred))

MAE: 14.83375291307118
R2 Score: 0.9152849373708202


In [5]:
import pandas as pd
import joblib

# Load encoders and model
state_encoder = joblib.load("State_encoder.pkl")
district_encoder = joblib.load("District_encoder.pkl")
season_encoder = joblib.load("Season_encoder.pkl")
crop_encoder = joblib.load("Crop_encoder.pkl")

# Example test input
test_input = pd.DataFrame({
    "State": ["Andhra Pradesh"],
    "District": ["WARANGAL"],
    "Crop": ["Wheat"],
    "Crop_Year": [2012],
    "Season": ["Rabi"],
    "Area": [900]
})

# Encode using same mappings, handling unknown labels
# Use a helper function to handle potential unknown labels during transformation
def safe_transform(encoder, data, default_value=-1):
    try:
        return encoder.transform(data)
    except ValueError:
        # Handle unseen labels by returning a default value (e.g., -1) or the closest known label
        # For simplicity, we'll return a default value here
        print(f"Warning: Unseen label encountered during transformation for {encoder}. Using default value {default_value}.")
        return [default_value] * len(data)


test_input["State"] = safe_transform(state_encoder, test_input["State"])
test_input["District"] = safe_transform(district_encoder, test_input["District"])
test_input["Season"] = safe_transform(season_encoder, test_input["Season"])
test_input["Crop"] = safe_transform(crop_encoder, test_input["Crop"])


# Predict
pred_yield = model.predict(test_input)
print("🌾 Predicted Yield:", pred_yield[0])

🌾 Predicted Yield: 1.6131999999999973


In [6]:
import pandas as pd

# Example single input
test_input = pd.DataFrame({
    "State": [12],
    "District": [45],
    "Crop": [3],
    "Crop_Year": [2025],
    "Season": [2],
    "Area": [850]
})


In [7]:
# Example: if you trained with a pipeline
predicted_yield = model.predict(test_input)
print(f"✅ Predicted Yield: {predicted_yield[0]:.2f} (quintals/ha or as per your dataset unit)")


✅ Predicted Yield: 10.99 (quintals/ha or as per your dataset unit)


In [8]:
# Step 1: Predict yield
pred = model.predict(X_test)

# Step 2: Suppose you have or simulate environmental factors
# (you can use averages, or fetch from APIs)
temp_avg = np.random.uniform(20, 40, size=len(X_test))
ndvi_mean = np.random.uniform(0.2, 0.8, size=len(X_test))

# Step 3: Apply physics-informed correction
def apply_physics_constraints(pred, temp, ndvi):
    correction = np.where(temp > 35, pred * 0.9, pred)  # reduce yield at high temp
    correction = np.where(ndvi < 0.4, correction * 0.85, correction)  # penalize low vegetation
    return correction

corrected_pred = apply_physics_constraints(pred, temp_avg, ndvi_mean)

# Step 4: Evaluate improvement (optional)
from sklearn.metrics import r2_score
print("Original R2:", r2_score(y_test, pred))
print("Corrected R2:", r2_score(y_test, corrected_pred))


Original R2: 0.9152849373708202
Corrected R2: 0.9040060093995514


In [9]:
# Save the best model (Random Forest)
import joblib

joblib.dump(model, "crop_yield_model.pkl")   # rf_best_model = your trained RandomForest model
print("✅ Model saved as crop_yield_model.pkl")


✅ Model saved as crop_yield_model.pkl


In [10]:
from google.colab import files
files.download('crop_yield_model.pkl') # Replace with your model's filename

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install streamlit pyngrok joblib


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m86.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.4.0 streamlit-1.50.0


In [None]:
%%writefile app.py
import streamlit as st
import joblib
import numpy as np
import pandas as pd
import gzip

# Load Model
try:
    with gzip.open("crop_yield_model.pkl.gz", "rb") as f:
        model = joblib.load(f)
except:
    model = joblib.load("crop_yield_model.pkl")

# Load Encoders
encoders = {}
for col in ["State", "District", "Season", "Crop"]:
    try:
        encoders[col] = joblib.load(f"{col}_encoder.pkl")
    except:
        encoders[col] = None

st.set_page_config(page_title="Crop Yield Prediction", page_icon="🌾", layout="centered")

st.title("🌾 AI-Powered Crop Yield Prediction")
st.markdown("Predict expected crop yield using machine learning trained on Indian crop data.")

# --- User Input Section ---
st.subheader("Enter Crop & Location Details")

col1, col2 = st.columns(2)
with col1:
    state = st.text_input("State", "Maharashtra")
    district = st.text_input("District", "Pune")
    crop = st.text_input("Crop", "Wheat")
with col2:
    season = st.selectbox("Season", ["Kharif", "Rabi", "Summer"])
    area = st.number_input("Area (in Hectares)", min_value=0.1, max_value=10000.0, value=10.0)
    year = st.number_input("Crop Year", min_value=2000, max_value=2025, value=2023)

if st.button("🔍 Predict Yield"):
    try:
        # Encode categorical fields
        inputs = []
        for col, val in zip(["State", "District", "Crop", "Season"], [state, district, crop, season]):
            le = encoders[col]
            if le is not None and val in le.classes_:
                encoded_val = le.transform([val])[0]
            else:
                encoded_val = 0  # fallback for unseen
            inputs.append(encoded_val)

        # Add numeric inputs
        inputs.extend([year, area])
        X_input = np.array(inputs).reshape(1, -1)

        # Predict yield
        pred = model.predict(X_input)[0]
        st.success(f"🌱 **Predicted Yield:** {pred:.2f} Quintals")

    except Exception as e:
        st.error(f"Error during prediction: {e}")


Overwriting app.py


In [None]:
from pyngrok import ngrok
!streamlit run app.py &

# Create tunnel for public URL
public_url = ngrok.connect(8501)
print("Streamlit App URL:", public_url)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.80.75.187:8501[0m
[0m
