In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('all_india_dataset_final.csv')
df.shape

(246091, 17)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,state_names,district_names,crop_year,season_names,crop_names,area,temperature,wind_speed,precipitation,humidity,soil_type,N,P,K,production,pressure
0,0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,292.640631,2.379571,1016.868671,43,clay,598.552,0.0,0.0,2000,1004
1,1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,292.703959,2.906544,1015.245398,44,sandy,7.182,18.354,0.0,1,1004
2,2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,294.087464,2.139227,1015.611196,42,clay,35.742,0.0,0.0,321,1005
3,3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,293.216917,2.328995,1017.6691,42,peaty,8.74,8.74,0.0,641,1006
4,4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,292.782403,2.688837,1015.646975,44,clay,0.615,0.615,0.615,165,1004


In [7]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,state_names,district_names,crop_year,season_names,crop_names,area,temperature,wind_speed,precipitation,humidity,soil_type,N,P,K,production,pressure
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,292.640631,2.379571,1016.868671,43,clay,598.552,0.0,0.0,2000,1004
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,292.703959,2.906544,1015.245398,44,sandy,7.182,18.354,0.0,1,1004
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,294.087464,2.139227,1015.611196,42,clay,35.742,0.0,0.0,321,1005
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,293.216917,2.328995,1017.6691,42,peaty,8.74,8.74,0.0,641,1006
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,292.782403,2.688837,1015.646975,44,clay,0.615,0.615,0.615,165,1004


In [9]:
df.rename(columns={'temperature': 'Temperature', 'humidity': 'Humidity'}, inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   state_names     246091 non-null  object 
 1   district_names  246091 non-null  object 
 2   crop_year       246091 non-null  int64  
 3   season_names    246091 non-null  object 
 4   crop_names      246091 non-null  object 
 5   area            246091 non-null  float64
 6   Temperature     246091 non-null  float64
 7   wind_speed      246091 non-null  float64
 8   precipitation   246091 non-null  float64
 9   Humidity        246091 non-null  int64  
 10  soil_type       246091 non-null  object 
 11  N               246091 non-null  float64
 12  P               246091 non-null  float64
 13  K               246091 non-null  float64
 14  production      246091 non-null  object 
 15  pressure        246091 non-null  int64  
dtypes: float64(7), int64(3), object(6)
memory usage: 30.0+ M

In [13]:
df['state_names'].unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ',
       'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

In [15]:
df = df[df['state_names'] == "Uttar Pradesh"]

In [17]:
df['production'] = pd.to_numeric(df['production'], errors='coerce')

In [19]:
df['yield'] = df['production'] / df['area']

In [21]:
df = df.dropna(subset=['yield'])
df = df[df['yield'] != 0]

In [23]:
df['crop_names'].unique()

array(['Arhar/Tur', 'Bajra', 'Groundnut', 'Jowar', 'Maize',
       'Moong(Green Gram)', 'Moth', 'Oilseeds total', 'Onion', 'Rice',
       'Sesamum', 'Soyabean', 'Total foodgrain', 'Urad', 'Barley', 'Gram',
       'Masoor', 'Peas & beans (Pulses)', 'Rapeseed &Mustard',
       'Sunflower', 'Wheat', 'Potato', 'Sugarcane', 'Cotton(lint)',
       'Castor seed', 'Linseed', 'Sannhamp', 'Jute', 'Turmeric',
       'Dry chillies', 'Garlic', 'Guar seed', 'Sweet potato', 'Coriander',
       'Dry ginger', 'Other Kharif pulses', 'Other  Rabi pulses',
       'Tobacco', 'Banana', 'Small millets', 'Ragi', 'Ginger'],
      dtype=object)

In [25]:
#df = df.join(pd.get_dummies(df['district_names']))
#df = df.join(pd.get_dummies(df['season_names']))
df = df.join(pd.get_dummies(df['crop_names']))
#df = df.join(pd.get_dummies(df['state_names']))
#df = df.join(pd.get_dummies(df['soil_type']))

In [27]:
df = df.drop('district_names', axis=1)
df = df.drop('season_names',axis=1)
df = df.drop('crop_names',axis=1)
df = df.drop('state_names', axis=1)
df = df.drop('soil_type', axis=1)
df = df.drop('production', axis=1)
df = df.drop('area', axis=1)
df = df.drop('crop_year', axis=1)
df = df.drop('wind_speed', axis=1)
df = df.drop('pressure', axis=1)
df = df.drop('precipitation', axis=1)

In [29]:
df.head()

Unnamed: 0,Temperature,Humidity,N,P,K,yield,Arhar/Tur,Bajra,Banana,Barley,...,Small millets,Soyabean,Sugarcane,Sunflower,Sweet potato,Tobacco,Total foodgrain,Turmeric,Urad,Wheat
198276,306.773308,57,7.68,7.68,0.0,1.478128,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
198277,307.583416,55,3.255,3.255,3.255,1.481451,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
198278,306.92938,55,38.25,38.25,38.25,0.814815,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
198279,307.506511,55,0.0,28.048,0.0,0.56,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
198280,306.839781,56,10.5,27.3,27.3,2.289855,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [31]:
df = df.replace({True: 1, False: 0})

  df = df.replace({True: 1, False: 0})


In [33]:
df.head()

Unnamed: 0,Temperature,Humidity,N,P,K,yield,Arhar/Tur,Bajra,Banana,Barley,...,Small millets,Soyabean,Sugarcane,Sunflower,Sweet potato,Tobacco,Total foodgrain,Turmeric,Urad,Wheat
198276,306.773308,57,7.68,7.68,0.0,1.478128,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198277,307.583416,55,3.255,3.255,3.255,1.481451,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
198278,306.92938,55,38.25,38.25,38.25,0.814815,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198279,307.506511,55,0.0,28.048,0.0,0.56,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198280,306.839781,56,10.5,27.3,27.3,2.289855,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
df.isna().values.any()

False

In [37]:
df = df[~(df[['N', 'P', 'K']].eq(0).all(axis=1))]
df.shape

(28054, 48)

In [39]:
X = df.drop('yield', axis=1)
y = df['yield']

In [41]:
X.head()
X.columns

Index(['Temperature', 'Humidity', 'N', 'P', 'K', 'Arhar/Tur', 'Bajra',
       'Banana', 'Barley', 'Castor seed', 'Coriander', 'Cotton(lint)',
       'Dry chillies', 'Dry ginger', 'Garlic', 'Ginger', 'Gram', 'Groundnut',
       'Guar seed', 'Jowar', 'Jute', 'Linseed', 'Maize', 'Masoor',
       'Moong(Green Gram)', 'Moth', 'Oilseeds total', 'Onion',
       'Other  Rabi pulses', 'Other Kharif pulses', 'Peas & beans (Pulses)',
       'Potato', 'Ragi', 'Rapeseed &Mustard', 'Rice', 'Sannhamp', 'Sesamum',
       'Small millets', 'Soyabean', 'Sugarcane', 'Sunflower', 'Sweet potato',
       'Tobacco', 'Total foodgrain', 'Turmeric', 'Urad', 'Wheat'],
      dtype='object')

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train)
X_test_processed = scaler.transform(X_test)
#joblib.dump(scaler, 'scaler.pkl')

In [48]:
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'K-Neighbors': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100, max_depth=2),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(random_state=42),
    'Ridge': Ridge(random_state=42),
    'XGBoost': XGBRegressor(random_state=42) 
}

best_model = None
best_r2 = -float('inf')  # Initialize to a very low value
best_model_name = ""

for name, model in models.items():
    # Train the model
    model.fit(X_train_processed, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_processed)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n{name} Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared Score: {r2}")

    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name


Decision Tree Results:
Mean Squared Error: 10.092281341778524
R-squared Score: 0.9228085038261278

K-Neighbors Results:
Mean Squared Error: 5.388050845052621
R-squared Score: 0.9587891288296949

Random Forest Results:
Mean Squared Error: 24.66537671651914
R-squared Score: 0.8113451985767989

Gradient Boosting Results:
Mean Squared Error: 5.086115448106956
R-squared Score: 0.9610985021268508

Linear Regression Results:
Mean Squared Error: 5.087052411317274
R-squared Score: 0.9610913357003893

Lasso Results:
Mean Squared Error: 11.951696496106656
R-squared Score: 0.908586641304639

Ridge Results:
Mean Squared Error: 5.087012244446957
R-squared Score: 0.9610916429194126

XGBoost Results:
Mean Squared Error: 5.417217127090047
R-squared Score: 0.958566048549622


In [50]:
if best_model is not None:
    joblib.dump(best_model, 'best_model.pkl')
    print(f"\nBest model '{best_model_name}' saved with R-squared score: {best_r2}")


Best model 'Gradient Boosting' saved with R-squared score: 0.9610985021268508
