In [204]:
import pandas as  pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [205]:
# Load the raw crop-production records from the CSV next to the notebook.
df = pd.read_csv('crop_production.csv')

In [206]:
# Preview the first 30 rows of the freshly loaded frame.
df.head(30)

Unnamed: 0,index,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
5,5,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Coconut,18168.0,65100000.0
6,6,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Dry ginger,36.0,100.0
7,7,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Sugarcane,1.0,2.0
8,8,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Sweet potato,5.0,15.0
9,9,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Tapioca,40.0,169.0


In [207]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(246091, 8)

In [208]:
# Count missing values per column before any cleaning.
df.isna().sum()

index               0
State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [209]:
# Keep only the rows that actually report a Production value.
df = df[df['Production'].notna()]

In [210]:
# Re-check missing values: Production should now report zero.
df.isna().sum()

index            0
State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [211]:
# Drop the redundant 'index' column carried over from the CSV.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['index'], errors='ignore')

In [212]:
# Missing-value counts for the remaining columns after removing 'index'.
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [213]:
# Yield = production per unit of area.
# A row with Area == 0 and non-zero Production yields +/-inf here; the training
# cells filter those rows out after reloading the CSV.
df['Yield'] = df['Production'] / df['Area']

# Persist the augmented frame. Later cells read 'crop_data_with_yield.csv', and
# without this write a fresh "Restart & Run All" fails because nothing else in
# the notebook creates that file.
df.to_csv('crop_data_with_yield.csv', index=False)

In [214]:
# Preview the first rows, now including the derived 'Yield' column.
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Yield
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0,1.594896
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0,0.5
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0,3.147059
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0,3.642045
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0,0.229167


In [215]:
# Confirm the derived 'Yield' column introduced no missing values.
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
Yield            0
dtype: int64

In [2]:
import pandas as pd
import pickle
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the dataset that already carries the derived 'Yield' column.
df = pd.read_csv('crop_data_with_yield.csv')

# Discard unusable targets in a single pass: missing values, +/-infinity,
# and anything at or above the 1e6 cut-off.
yield_col = df['Yield']
usable_rows = (
    yield_col.notna()
    & ~yield_col.isin([float('inf'), float('-inf')])
    & (yield_col < 1e6)
)
df = df[usable_rows]

# All four predictors are one-hot encoded, Crop_Year included.
categorical_columns = ['State_Name', 'Crop', 'Season', 'Crop_Year']

X = df[categorical_columns]
y = df['Yield']

# Categories not seen during fit are encoded as all-zeros rather than raising.
encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_columns)]
)

regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Serialise the whole pipeline (encoder + regressor) for later inference.
with open('xgb_crop_yield_pipeline.pkl', 'wb') as f:
    pickle.dump(model, f)

print(" Model retrained and saved WITHOUT Area and Production.")


R2 Score: 0.8336217219797464
Mean Squared Error: 90006.68586981106
✅ Model retrained and saved WITHOUT Area and Production.


In [235]:
import pickle
import pandas as pd

def predict_yield(state, crop, season, crop_year,
                  model_path='xgb_crop_yield_pipeline.pkl', model=None):
    """Predict the yield for one state / crop / season / year combination.

    Parameters
    ----------
    state, crop, season :
        Values for the 'State_Name', 'Crop' and 'Season' feature columns.
        They must match the training values exactly. The data used in this
        notebook contains entries with trailing spaces (e.g. 'Telangana ',
        'Coconut '), and a pipeline built with
        OneHotEncoder(handle_unknown='ignore') silently ignores any value it
        has not seen instead of raising.
    crop_year :
        Value for the 'Crop_Year' feature column.
    model_path : str, optional
        Pickle file to load the fitted pipeline from when ``model`` is not
        given. Defaults to the file written by the training cell.
    model : object, optional
        An already loaded pipeline exposing ``predict``. When supplied, nothing
        is read from disk, which avoids re-unpickling on every call.

    Returns
    -------
    The first element of ``model.predict(...)`` for the single input row.

    Notes
    -----
    NOTE(review): the printed 'tons/hectare' label is fixed text; confirm it
    matches the units of Production and Area in the source data.
    """
    if model is None:
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load pipelines that were produced by a trusted source.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

    # Single-row frame whose column names match the features used for training.
    input_data = pd.DataFrame([{
        'State_Name': state,
        'Crop': crop,
        'Season': season,
        'Crop_Year': crop_year
    }])

    predicted_yield = model.predict(input_data)[0]

    print(f"🌾 Predicted Crop Yield: {predicted_yield:.2f} tons/hectare")
    return predicted_yield


In [274]:
# Example prediction: wheat grown in Bihar during the Rabi season of 1998.
predict_yield(state="Bihar", crop="Wheat", season="Rabi", crop_year=1998)


🌾 Predicted Crop Yield: 1.21 tons/hectare


1.2051084

✅ 'Yield' column added and saved to 'crop_data_with_yield.csv'


In [271]:
import pandas as pd
import json

# Reload the yield-augmented dataset to enumerate the values of each feature.
df = pd.read_csv('crop_data_with_yield.csv')

categorical_columns = ['State_Name', 'Crop', 'Season', 'Crop_Year']

# Map each feature column to the sorted list of its distinct non-null values.
unique_values = {}
for col in categorical_columns:
    distinct = df[col].dropna().unique().tolist()
    unique_values[col] = sorted(distinct)

# Show every column with its number of distinct values and the values themselves.
for col, values in unique_values.items():
    print(f"\n🔹 {col} ({len(values)}):\n{values}")

# Write the mapping to disk as indented JSON.
with open('unique_categorical_values.json', 'w') as f:
    json.dump(unique_values, f, indent=4)

print("\n Unique categorical values extracted and saved to 'unique_categorical_values.json'")



🔹 State_Name (33):
['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal']

🔹 Crop (124):
['Apple', 'Arcanut (Processed)', 'Arecanut', 'Arhar/Tur', 'Ash Gourd', 'Atcanut (Raw)', 'Bajra', 'Banana', 'Barley', 'Bean', 'Beans & Mutter(Vegetable)', 'Beet Root', 'Ber', 'Bhindi', 'Bitter Gourd', 'Black pepper', 'Blackgram', 'Bottle Gourd', 'Brinjal', 'Cabbage', 'Cardamom', 'Carrot', 'Cashewnut', 'Cashewnut Processed', 'Cashewnut Raw', 'Castor seed', 'Cauliflower', 'Citrus Fruit', 'Coconut ', 'Coffee', 'Colocosia', 'Cond-spcs other', 'Coriander', 'Cotton(lint)', 'Cowpea(Lobia)', 'C

In [3]:
import pandas as pd
import pickle
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# NOTE(review): this cell repeats the earlier training cell step for step.
# Load the data and filter the target in one chain: drop missing Yield values,
# then +/-infinity, then anything at or above 1e6.
df = (
    pd.read_csv('crop_data_with_yield.csv')
    .dropna(subset=['Yield'])
    .loc[lambda frame: ~frame['Yield'].isin([float('inf'), float('-inf')])]
    .loc[lambda frame: frame['Yield'] < 1e6]
)

categorical_columns = ['State_Name', 'Crop', 'Season', 'Crop_Year']

# Features are the four categorical columns; the target is the derived Yield.
X, y = df[categorical_columns], df['Yield']

# One-hot encode every feature; unseen categories at predict time are ignored.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
)

xgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**xgb_params)),
    ],
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Overwrite the saved pipeline with the freshly fitted one.
with open('xgb_crop_yield_pipeline.pkl', 'wb') as f:
    pickle.dump(model, f)

print(" Model retrained and saved WITHOUT Area and Production.")


R2 Score: 0.8336217219797464
Mean Squared Error: 90006.68586981106
✅ Model retrained and saved WITHOUT Area and Production.
