In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.over_sampling import SMOTE
import joblib

# Load Dataset
# Read the raw table from the CSV file sitting next to this script.
file_path = "crop_yield2.csv"
df = pd.read_csv(file_path)

# Handle Missing Data
# Every numeric column has its gaps filled with that column's own median.
numeric_cols = df.select_dtypes(include=[np.number]).columns
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Remove Outliers using IQR
# A row is kept only when none of its numeric values falls below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR for the corresponding column.
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
not_below = ~(df[numeric_cols] < lower_fence)
not_above = ~(df[numeric_cols] > upper_fence)
df = df[(not_below & not_above).all(axis=1)]

# Encode Categorical Variables
# Fit one LabelEncoder per categorical column, then replace each column with
# its integer codes. The fitted encoders stay in `label_encoders`, keyed by
# column name, so the same mapping can be reused later.
categorical_columns = ("Crop", "Season", "State")
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

# Prepare Data for Regression
# Features are every column except the "Yield" target, standardized.
X_reg = df.drop(columns=["Yield"])
y_reg = df["Yield"]
scaler_reg = StandardScaler()
X_reg_scaled = scaler_reg.fit_transform(X_reg)

# Prepare Data for Classification
# Same features as above minus "Crop", which is the label being predicted.
X_class = X_reg.drop(columns=["Crop"])
y_class = df["Crop"]
scaler_class = StandardScaler()
X_class_scaled = scaler_class.fit_transform(X_class)

# Handle Class Imbalance
# SMOTE synthesizes extra rows for the under-represented crop labels.
# NOTE(review): the scalers and SMOTE are fitted on the full dataset before
# the split below, so held-out rows influence the training data. Consider
# splitting first and fitting these on the training portion only.
smote = SMOTE(random_state=42, k_neighbors=1)
X_class_resampled, y_class_resampled = smote.fit_resample(X_class_scaled, y_class)

# Split Data
# 80/20 split with a fixed seed for both tasks; the classification split is
# additionally stratified on the (resampled) label.
split_options = {"test_size": 0.2, "random_state": 42}
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_class_resampled,
    y_class_resampled,
    stratify=y_class_resampled,
    **split_options,
)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg_scaled,
    y_reg,
    **split_options,
)

# Hyperparameter Tuning for Classification
# Candidate values the randomized search samples from.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, "balanced"]
}

clf_base = RandomForestClassifier(random_state=42)
# 15 sampled configurations, each scored with 5-fold cross-validation,
# using all available CPU cores.
grid_search = RandomizedSearchCV(clf_base, param_grid, n_iter=15, cv=5, random_state=42, n_jobs=-1)
grid_search.fit(Xc_train, yc_train)

# Train Classification Model
# With the default refit=True the search has already retrained the winning
# configuration on all of Xc_train / yc_train, so best_estimator_ is the
# final fitted model. Calling fit() on it again would repeat the same
# training (same data, same random_state) for no change in the result.
clf = grid_search.best_estimator_

# Train Regression Model
# The regressor reuses the tree count and depth selected by the
# classification search above; it is not tuned separately.
best_params = grid_search.best_params_
reg = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    random_state=42,
)
reg.fit(Xr_train, yr_train)

# Save Models and Preprocessing Objects
# Output file -> fitted object written to it (saved in this order).
artifacts = {
    'classification_model.pkl': clf,        # Classification model
    'regression_model.pkl': reg,            # Regression model
    'scaler_class.pkl': scaler_class,       # Scaler for classification
    'scaler_reg.pkl': scaler_reg,           # Scaler for regression
    'label_encoders.pkl': label_encoders,   # Label encoders for categorical variables
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("All model files have been saved successfully!")

All model files have been saved successfully!
