In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Load the dataset
file_path = r"C:\Life Projects\DAIICT_hackout\crop_production.csv"  # Update this path if necessary
data = pd.read_csv(file_path)

# Handle missing values by filling them with the median for numerical columns only
data['Area'].fillna(data['Area'].median(), inplace=True)
data['Production'].fillna(data['Production'].median(), inplace=True)

# Encoding categorical variables
label_encoders = {}
categorical_columns = ['State_Name', 'District_Name', 'Season', 'Crop']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Splitting the dataset into features and target variable
X = data.drop('Production', axis=1)
y = data['Production']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to train
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Dictionary to store the performance of each model
model_performance = {}

# Train each model and evaluate
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_performance[model_name] = {'model': model, 'mse': mse, 'r2': r2}
    print(f"{model_name} - MSE: {mse}, R²: {r2}")

# Find the best model based on R-squared
best_model_name = max(model_performance, key=lambda x: model_performance[x]['r2'])
best_model = model_performance[best_model_name]['model']

print(f"\nBest Model: {best_model_name} with R²: {model_performance[best_model_name]['r2']}")

# Save the best model to a file
with open('best_crop_yield_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Save the label encoders to a file
with open('label_encoders_yield.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)

print("Best model and label encoders have been saved.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Area'].fillna(data['Area'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Production'].fillna(data['Production'].median(), inplace=True)


RandomForest - MSE: 29566005617466.15, R²: 0.814710669194388

Best Model: RandomForest with R²: 0.814710669194388
Best model and label encoders have been saved.


ValueError: y contains previously unseen labels: 'Andaman and Nicobar Islands'