In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the crop dataset; the path is resolved relative to the working directory.
DATA_FILE = 'all_india_dataset_final.csv'
df = pd.read_csv(DATA_FILE)
# Last expression: show (rows, columns) as a quick sanity check of the load.
df.shape

(246091, 17)

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,state_names,district_names,crop_year,season_names,crop_names,area,temperature,wind_speed,precipitation,humidity,soil_type,N,P,K,production,pressure
0,0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,292.640631,2.379571,1016.868671,43,clay,598.552,0.0,0.0,2000,1004
1,1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,292.703959,2.906544,1015.245398,44,sandy,7.182,18.354,0.0,1,1004
2,2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,294.087464,2.139227,1015.611196,42,clay,35.742,0.0,0.0,321,1005
3,3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,293.216917,2.328995,1017.6691,42,peaty,8.74,8.74,0.0,641,1006
4,4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,292.782403,2.688837,1015.646975,44,clay,0.615,0.615,0.615,165,1004


In [7]:
# Remove the 'Unnamed: 0' column (it appears to be a saved row index: its
# values mirror the index in the preview above).
# errors='ignore' keeps the cell re-runnable: executing it again once the
# column is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,state_names,district_names,crop_year,season_names,crop_names,area,temperature,wind_speed,precipitation,humidity,soil_type,N,P,K,production,pressure
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,292.640631,2.379571,1016.868671,43,clay,598.552,0.0,0.0,2000,1004
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,292.703959,2.906544,1015.245398,44,sandy,7.182,18.354,0.0,1,1004
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,294.087464,2.139227,1015.611196,42,clay,35.742,0.0,0.0,321,1005
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,293.216917,2.328995,1017.6691,42,peaty,8.74,8.74,0.0,641,1006
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,292.782403,2.688837,1015.646975,44,clay,0.615,0.615,0.615,165,1004


In [9]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   state_names     246091 non-null  object 
 1   district_names  246091 non-null  object 
 2   crop_year       246091 non-null  int64  
 3   season_names    246091 non-null  object 
 4   crop_names      246091 non-null  object 
 5   area            246091 non-null  float64
 6   temperature     246091 non-null  float64
 7   wind_speed      246091 non-null  float64
 8   precipitation   246091 non-null  float64
 9   humidity        246091 non-null  int64  
 10  soil_type       246091 non-null  object 
 11  N               246091 non-null  float64
 12  P               246091 non-null  float64
 13  K               246091 non-null  float64
 14  production      246091 non-null  object 
 15  pressure        246091 non-null  int64  
dtypes: float64(7), int64(3), object(6)
memory usage: 30.0+ M

In [11]:
# Number of rows available for each state, as a two-column frame
# ('state_names', 'counts').
rows_per_state = df.groupby('state_names').size()
state_counts = rows_per_state.reset_index(name='counts')
state_counts

Unnamed: 0,state_names,counts
0,Andaman and Nicobar Islands,203
1,Andhra Pradesh,9628
2,Arunachal Pradesh,2546
3,Assam,14628
4,Bihar,18885
5,Chandigarh,90
6,Chhattisgarh,10709
7,Dadra and Nagar Haveli,263
8,Goa,208
9,Gujarat,8436


In [13]:
# 'production' was read as object dtype; convert it to numbers.
# errors='coerce' turns any entry that cannot be parsed into NaN.
df = df.assign(production=pd.to_numeric(df['production'], errors='coerce'))

In [15]:
# Target variable: production per unit of area.
# A zero 'area' makes the division produce +/-inf, which dropna() does not
# remove and which scikit-learn estimators reject. Map those values to NaN so
# they are discarded together with the other missing yields.
df['yield'] = (df['production'] / df['area']).replace([np.inf, -np.inf], np.nan)

In [17]:
# Keep only rows whose yield is present and non-zero.
has_usable_yield = df['yield'].notna() & df['yield'].ne(0)
df = df[has_usable_yield]

In [19]:
# One-hot encode every categorical column and append the indicator columns.
# The source columns are left in place here; the indicator columns are named
# after the category values themselves (no prefix).
for categorical_col in ('district_names', 'season_names', 'crop_names',
                        'state_names', 'soil_type'):
    indicator_cols = pd.get_dummies(df[categorical_col])
    df = df.join(indicator_cols)

In [21]:
# Drop the categorical columns that now have indicator columns, plus
# 'production' and 'area', which were used to compute the 'yield' target.
# 'state_names' is kept on purpose: the per-state loop further down filters
# the frame on it.
df = df.drop(columns=[
    'district_names',
    'season_names',
    'crop_names',
    'soil_type',
    'production',
    'area',
])

In [23]:
# Preview the frame after encoding and column removal.
df.head(n=5)

Unnamed: 0,state_names,crop_year,temperature,wind_speed,precipitation,humidity,N,P,K,pressure,...,Uttar Pradesh,Uttarakhand,West Bengal,chalky,clay,loamy,peaty,sandy,silt,silty
0,Andaman and Nicobar Islands,2000,292.640631,2.379571,1016.868671,43,598.552,0.0,0.0,1004,...,False,False,False,False,True,False,False,False,False,False
1,Andaman and Nicobar Islands,2000,292.703959,2.906544,1015.245398,44,7.182,18.354,0.0,1004,...,False,False,False,False,False,False,False,True,False,False
2,Andaman and Nicobar Islands,2000,294.087464,2.139227,1015.611196,42,35.742,0.0,0.0,1005,...,False,False,False,False,True,False,False,False,False,False
3,Andaman and Nicobar Islands,2000,293.216917,2.328995,1017.6691,42,8.74,8.74,0.0,1006,...,False,False,False,False,False,False,True,False,False,False
4,Andaman and Nicobar Islands,2000,292.782403,2.688837,1015.646975,44,0.615,0.615,0.615,1004,...,False,False,False,False,True,False,False,False,False,False


In [25]:
# Convert the boolean indicator columns to 0/1 integers.
# Only the bool-dtype columns are cast. A frame-wide replace({True: 1, False: 0})
# emits pandas' downcasting FutureWarning and depends on that deprecated
# behaviour to end up with an integer dtype.
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({col: 'int64' for col in bool_cols})

  df = df.replace({True: 1, False: 0})


In [27]:
# Preview the frame again to confirm the indicator columns are now 0/1.
df.head(n=5)

Unnamed: 0,state_names,crop_year,temperature,wind_speed,precipitation,humidity,N,P,K,pressure,...,Uttar Pradesh,Uttarakhand,West Bengal,chalky,clay,loamy,peaty,sandy,silt,silty
0,Andaman and Nicobar Islands,2000,292.640631,2.379571,1016.868671,43,598.552,0.0,0.0,1004,...,0,0,0,0,1,0,0,0,0,0
1,Andaman and Nicobar Islands,2000,292.703959,2.906544,1015.245398,44,7.182,18.354,0.0,1004,...,0,0,0,0,0,0,0,1,0,0
2,Andaman and Nicobar Islands,2000,294.087464,2.139227,1015.611196,42,35.742,0.0,0.0,1005,...,0,0,0,0,1,0,0,0,0,0
3,Andaman and Nicobar Islands,2000,293.216917,2.328995,1017.6691,42,8.74,8.74,0.0,1006,...,0,0,0,0,0,0,1,0,0,0
4,Andaman and Nicobar Islands,2000,292.782403,2.688837,1015.646975,44,0.615,0.615,0.615,1004,...,0,0,0,0,1,0,0,0,0,0


In [29]:
# True if any cell in the frame is still missing.
df.isna().to_numpy().any()

False

In [31]:
# Discard rows where N, P and K are all zero at the same time.
npk_all_zero = (df[['N', 'P', 'K']] == 0).all(axis=1)
df = df.loc[~npk_all_zero]
df.shape

(201956, 808)

In [33]:
# States to model, in order of first appearance in the frame.
states = pd.unique(df['state_names'])

# One record per (state, model) pair is appended here by the training loop.
statewise_results = []

# Running "best so far" trackers. Start from the worst possible score so the
# first evaluated model always replaces them.
best_state = best_model = None
best_state_mse = best_model_mse = None
highest_r2 = float('-inf')
lowest_mse = float('inf')

In [35]:
# Shared seed so the stochastic estimators give repeatable results.
SEED = 42

# Feature scaler; the training loop refits it for each state.
sc = StandardScaler()

# Candidate regressors, keyed by the display name used in the results table.
models = dict([
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('K-Neighbors', KNeighborsRegressor()),
    ('Random Forest', RandomForestRegressor(random_state=SEED, n_estimators=100, max_depth=2)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('Linear Regression', LinearRegression()),
    ('Lasso', Lasso(random_state=SEED)),
    ('Ridge', Ridge(random_state=SEED)),
])

In [37]:
# Train and score every candidate model separately for each state.
#
# Reads `df`, `states`, `sc` and `models` from earlier cells. Produces:
#   statewise_results - list of dicts, one per (state, model) pair
#   results_df        - the same records as a DataFrame
#   best_state / best_model / highest_r2         - pair with the highest test R²
#   best_state_mse / best_model_mse / lowest_mse - pair with the lowest test MSE
#
# The accumulators are (re)initialised here so that re-running this cell does
# not append duplicate records or keep "best" values from a previous run.
statewise_results = []
best_state = None
best_model = None
highest_r2 = float('-inf')
best_state_mse = None
best_model_mse = None
lowest_mse = float('inf')

# With the 80/20 split below, 7 rows is the smallest state that still leaves
# 5 training rows (KNeighborsRegressor's default n_neighbors) and 2 test rows
# (R² is undefined for a single sample). Smaller states are skipped instead of
# aborting the whole run.
MIN_ROWS_PER_STATE = 7

for state in states:
    # Rows belonging to the current state only.
    state_df = df[df['state_names'] == state]
    num_rows = state_df.shape[0]

    if num_rows < MIN_ROWS_PER_STATE:
        print(f"Skipping {state}: only {num_rows} rows (need at least {MIN_ROWS_PER_STATE})")
        continue

    # Features are every remaining column; 'yield' is the target and
    # 'state_names' is constant within the state.
    X_state = state_df.drop(columns=['yield', 'state_names'])
    y_state = state_df['yield']

    X_train_state, X_test_state, y_train_state, y_test_state = train_test_split(
        X_state, y_state, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then apply the same scaling to
    # the test rows so no test-set statistics leak into training.
    X_train_state_processed = sc.fit_transform(X_train_state)
    X_test_state_processed = sc.transform(X_test_state)

    for name, model in models.items():
        # The shared estimator instance is refit on this state's training data.
        model.fit(X_train_state_processed, y_train_state)
        y_pred_state = model.predict(X_test_state_processed)

        mse = mean_squared_error(y_test_state, y_pred_state)
        r2 = r2_score(y_test_state, y_pred_state)

        statewise_results.append({
            'State': state,
            'Model': name,
            'MSE': mse,
            'R²': r2,
            'Num_Rows': num_rows
        })

        if r2 > highest_r2:
            highest_r2 = r2
            best_state = state
            best_model = name

        # NOTE: MSE is in squared yield units, so comparing it across states
        # favours states whose yields are numerically small.
        if mse < lowest_mse:
            lowest_mse = mse
            best_state_mse = state
            best_model_mse = name

results_df = pd.DataFrame(statewise_results)

In [39]:
# Show the results table through the notebook's rich display. Unlike print(),
# it does not wrap the columns into separate text blocks.
results_df

                           State              Model            MSE        R²  \
0    Andaman and Nicobar Islands      Decision Tree    1264.291391  0.996843   
1    Andaman and Nicobar Islands        K-Neighbors  378282.506544  0.055427   
2    Andaman and Nicobar Islands      Random Forest     314.156435  0.999216   
3    Andaman and Nicobar Islands  Gradient Boosting    4701.626387  0.988260   
4    Andaman and Nicobar Islands  Linear Regression   69292.124245  0.826977   
..                           ...                ...            ...       ...   
226                  West Bengal      Random Forest  311939.669923  0.649797   
227                  West Bengal  Gradient Boosting  126792.312613  0.857655   
228                  West Bengal  Linear Regression  756283.588864  0.150949   
229                  West Bengal              Lasso  755814.592795  0.151476   
230                  West Bengal              Ridge  756782.174968  0.150390   

     Num_Rows  
0         170  
1      

In [41]:
# Persist the per-state, per-model metrics; the row index is not written.
RESULTS_FILE = 'statewise_model_results.csv'
results_df.to_csv(RESULTS_FILE, index=False)

In [59]:
# Print a per-state report of every model's metrics.
# `statewise_results` is a flat list of per-(state, model) dicts, so it has no
# .items(). Group the records by state first; dict insertion order keeps the
# states in the order they were evaluated.
records_by_state = {}
for record in statewise_results:
    records_by_state.setdefault(record['State'], []).append(record)

for state, records in records_by_state.items():
    print(f"Results for {state}:")
    for record in records:
        # The metric is stored under the key 'R²' (superscript two).
        print(f"  {record['Model']} - MSE: {record['MSE']:.4f}, R2: {record['R²']:.4f}")
    print()

Results for Andaman and Nicobar Islands:
  Decision Tree - MSE: 1264.2914, R2: 0.9968
  K-Neighbors - MSE: 378282.5065, R2: 0.0554
  Random Forest - MSE: 314.1564, R2: 0.9992
  Gradient Boosting - MSE: 4701.6264, R2: 0.9883
  Linear Regression - MSE: 69292.1242, R2: 0.8270
  Lasso - MSE: 4741.1277, R2: 0.9882
  Ridge - MSE: 5256.4881, R2: 0.9869

Results for Andhra Pradesh:
  Decision Tree - MSE: 420168.7690, R2: 0.7886
  K-Neighbors - MSE: 199117.6844, R2: 0.8998
  Random Forest - MSE: 286056.3276, R2: 0.8560
  Gradient Boosting - MSE: 164552.9519, R2: 0.9172
  Linear Regression - MSE: 245944.4119, R2: 0.8762
  Lasso - MSE: 243483.5561, R2: 0.8775
  Ridge - MSE: 244650.5870, R2: 0.8769

Results for Arunachal Pradesh:
  Decision Tree - MSE: 5.5850, R2: 0.7663
  K-Neighbors - MSE: 5.7959, R2: 0.7575
  Random Forest - MSE: 8.3375, R2: 0.6512
  Gradient Boosting - MSE: 4.2211, R2: 0.8234
  Linear Regression - MSE: 7.8210, R2: 0.6728
  Lasso - MSE: 10.7613, R2: 0.5498
  Ridge - MSE: 7.4386

In [61]:
# Report the best (state, model) pair found by the training loop, once by
# highest R² and once by lowest MSE.
summary_lines = [
    f"The highest accuracy (R2) was achieved in {best_state} with the {best_model} model:",
    f"  R2: {highest_r2:.4f}",
    f"The lowest MSE was achieved in {best_state_mse} with the {best_model_mse} model:",
    f"  MSE: {lowest_mse:.4f}",
]
print("\n".join(summary_lines))

The highest accuracy (R2) was achieved in Goa with the Gradient Boosting model:
  R2: 1.0000
The lowest MSE was achieved in Meghalaya with the Decision Tree model:
  MSE: 0.6926
