In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Read the crop-yield table from disk
data = pd.read_csv('/content/crop_yield.csv')  # Replace with your CSV file path

# Quick sanity check: show the top of the table
print("First few rows of the dataset:")
print(data.head())

# Preprocessing
# Integer-encode every categorical column in place. One fitted encoder is
# kept per column so the identical mapping can be re-applied at prediction time.
categorical_columns = ['Crop', 'Season', 'State']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# 'Yield' is the regression target; 'Production' is left out of the inputs
y = data['Yield']  # Target variable
X = data.drop(columns=['Production', 'Yield'])  # Features

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: the scaler learns its statistics from the training rows
# only and the same transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Regressor
n_estimators = 100  # Number of trees in the forest
print("Training Random Forest model...")

# warm_start=True makes every fit() call keep the trees that are already
# fitted and train only the extra ones requested through n_estimators.
# Without it, each fit() rebuilds the whole forest from scratch, so growing
# the forest one tree at a time costs 1 + 2 + ... + n_estimators tree fits.
rf_model = RandomForestRegressor(
    n_estimators=n_estimators, warm_start=True, random_state=42
)

# Train the model with progress updates.
# Metrics are reported after the first tree, after every 10th tree and at the
# final size; the forest is only grown at those checkpoints. "Epoch" in the
# log line is the number of trees currently in the forest.
checkpoints = [
    k for k in range(1, n_estimators + 1)
    if k % 10 == 0 or k == 1 or k == n_estimators
]
for i in checkpoints:
    rf_model.set_params(n_estimators=i)
    rf_model.fit(X_train, y_train)  # adds only the trees missing up to size i
    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Epoch {i}/{n_estimators} - Test MSE: {mse:.4f}, R²: {r2:.4f}")

# Switch warm start back off so that any later fit() on this object retrains
# from scratch, as it would have with the default settings.
rf_model.set_params(warm_start=False)

# Final evaluation on the held-out split
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\nFinal Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

# Feature Importance: pair each input column with the forest's importance
# score and list the most influential features first
feature_names = X.columns
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)

# Visualize the results: predicted against actual yield for the test rows
plt.scatter(y_test, y_pred)
plt.title('Actual vs Predicted Yield')
plt.xlabel('Actual Yield')
plt.ylabel('Predicted Yield')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the raw table
data = pd.read_csv('/content/crop_yield.csv')

# Preprocessing: replace each categorical column by integer codes and keep
# the fitted encoder for that column
categorical_columns = ['Crop', 'Season', 'State']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Target is 'Yield'; 'Production' is dropped from the feature set as well
y = data['Yield']
X = data.drop(columns=['Production', 'Yield'])

# Remember the feature order (prediction code must present columns the same way)
feature_names = list(X.columns)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: statistics come from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Write the model, scaler, label encoders and feature order to disk
for artifact, filename in (
    (rf_model, 'random_forest_model1.pkl'),
    (scaler, 'scaler1.pkl'),
    (label_encoders, 'label_encoders1.pkl'),
    (feature_names, 'feature_names.pkl'),  # Save feature order
):
    joblib.dump(artifact, filename)

# Score the fitted model on the held-out rows
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Final Model - MSE: {mse:.4f}, R²: {r2:.4f}")

In [None]:
import joblib

# Save the model, label encoders, and scaler.
# Relies on rf_model, label_encoders and scaler already existing in the
# kernel (they are created by an earlier cell), and writes to the same three
# file names the training cell above already dumps to, so running this cell
# simply overwrites those files. The feature-name list is not saved here.
joblib.dump(rf_model, 'random_forest_model1.pkl')
joblib.dump(label_encoders, 'label_encoders1.pkl')
joblib.dump(scaler, 'scaler1.pkl')

In [None]:
def predict_yield(new_data, model, label_encoders, scaler, training_columns):
    """
    Predict crop yield for new rows with the fitted encoders, scaler and model.

    The input is aligned to ``training_columns``, every column that has an
    entry in ``label_encoders`` is integer-encoded with that encoder, the
    result is passed through ``scaler.transform`` and then ``model.predict``.

    Two best-effort fallbacks are applied so that a prediction is always
    produced, and each one emits a ``UserWarning`` because it can silently
    distort the result:

    * columns listed in ``training_columns`` but absent from ``new_data``
      are filled with 0;
    * category values an encoder did not see during fitting are replaced by
      that encoder's first class (``classes_[0]``).

    Labels are compared exactly, so differences in spelling, case or
    surrounding whitespace count as unseen values.

    Parameters:
        new_data (pd.DataFrame): Rows to predict for. Not modified.
        model: Fitted estimator exposing ``predict``.
        label_encoders (dict): Maps a categorical column name to its fitted
            encoder (needs ``classes_`` and ``transform``).
        scaler: Fitted scaler exposing ``transform``.
        training_columns (list): Feature names, in the order used for training.

    Returns:
        The output of ``model.predict`` for the preprocessed rows
        (one prediction per input row).
    """
    import warnings  # local import: this cell has no import block of its own

    training_columns = list(training_columns)

    # Report columns that are about to be invented rather than hide the gap
    absent = [c for c in training_columns if c not in new_data.columns]
    if absent:
        warnings.warn(
            f"predict_yield: input is missing column(s) {absent}; "
            "filling them with 0",
            UserWarning,
            stacklevel=2,
        )

    # reindex returns a new frame (caller's data is left untouched), drops
    # extra columns and puts the rest in training order
    features = new_data.reindex(columns=training_columns, fill_value=0)

    # Encode every column that has a fitted encoder, instead of relying on a
    # hard-coded column list that could drift from what was trained
    for col, le in label_encoders.items():
        if col not in features.columns:
            continue

        default_label = le.classes_[0]
        is_known = features[col].isin(le.classes_)
        if not is_known.all():
            unseen = features.loc[~is_known, col].unique().tolist()
            warnings.warn(
                f"predict_yield: column '{col}' contains value(s) not seen "
                f"during training {unseen}; replacing with '{default_label}'",
                UserWarning,
                stacklevel=2,
            )
            # Replace unknown categories with the default so transform() succeeds
            features[col] = features[col].where(is_known, default_label)

        features[col] = le.transform(features[col])

    # Scale the features using the same scaler, then predict
    features_scaled = scaler.transform(features)
    predictions = model.predict(features_scaled)
    return predictions

In [None]:
# Column order the model was trained on (X comes from the training cell)
training_columns = X.columns.tolist()

# Example new data, written as one record per observation
new_data = pd.DataFrame([
    {
        'Crop': 'Maize',
        'Season': 'Kharif',
        'State': 'Assam',
        'Area': 10000,
        'Annual_Rainfall': 1200,
        'Fertilizer': 50000,
        'Pesticide': 2000,
        'Crop_Year': 1997,
    },
    {
        'Crop': 'Rice',
        'Season': 'Rabi',
        'State': 'Punjab',
        'Area': 15000,
        'Annual_Rainfall': 800,
        'Fertilizer': 75000,
        'Pesticide': 3000,
        'Crop_Year': 1998,
    },
])

# Run the rows through the same preprocessing and the fitted model
predictions = predict_yield(new_data, rf_model, label_encoders, scaler, training_columns)

# Print one numbered line per predicted value
print("\nPredicted Yield:")
for position, value in enumerate(predictions, start=1):
    print(f"Prediction {position}: {value:.4f}")

In [None]:
# Example input for a single crop.
# Categorical values must match a label seen during training exactly;
# predict_yield replaces any other value with the encoder's first class.
# The season was previously misspelled 'Karif', which triggered that
# fallback; it now matches the 'Kharif' spelling used in the example above.
# NOTE(review): confirm the exact spellings against
# label_encoders['Season'].classes_ (and the Crop/State encoders).
single_crop_data = pd.DataFrame({
    'Crop': ['Arecanut'],  # Replace with the crop name
    'Season': ['Kharif'],  # Replace with the season
    'State': ['Telangana'],  # Replace with the state
    'Area': [100],  # Replace with the area
    'Annual_Rainfall': [2021.4],  # Replace with the annual rainfall
    'Fertilizer': [7024878.38],  # Replace with the fertilizer amount
    'Pesticide': [22882.34],  # Replace with the pesticide amount
    'Crop_Year': [2025]  # Replace with the crop year
})

# Get the feature names used during training (X comes from the training cell)
training_columns = X.columns.tolist()

# Make predictions for the single crop
prediction = predict_yield(single_crop_data, rf_model, label_encoders, scaler, training_columns)

# Display the prediction (predict_yield returns one value per input row)
print("\nPredicted Yield for the Single Crop:")
print(f"Predicted Yield: {prediction[0]:.4f}")