In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    mean_squared_error,
    mean_absolute_error,
)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings

# Silence every warning category for the rest of the session. This keeps cell
# output short, but it also hides deprecation/FutureWarning notices raised by
# the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw CSV (malformed lines are skipped), remove columns that are not
# used as features, turn +/-inf into NaN, force the target to numeric and keep
# only rows whose target could be parsed.
df = (
    pd.read_csv("EuroCrop_agricultural_logistics_dataset.csv", on_bad_lines='skip')
    .drop(
        columns=['Unnamed: 0', 'Harvest_Date', 'Inventory_Levels', 'Vehicle_Load_Capacity'],
        errors='ignore',  # tolerate files where some of these columns are absent
    )
    .replace([np.inf, -np.inf], np.nan)
    .assign(
        # Unparseable target values become NaN and are dropped in the next step.
        Spoilage_Risk=lambda frame: pd.to_numeric(frame['Spoilage_Risk'], errors='coerce')
    )
    .dropna(subset=['Spoilage_Risk'])
)

In [None]:
# Clip every numeric column to its 0.1% / 99.9% quantiles and fill remaining
# NaNs with the (post-clip) column median.
#
# Results are assigned back to `df[col]` rather than calling
# `df[col].fillna(..., inplace=True)`: an in-place method on `df[col]` operates
# on a temporary object, which pandas warns about and which no longer updates
# `df` at all under Copy-on-Write.
#
# NOTE(review): `numeric_cols` also contains the target `Spoilage_Risk`, and the
# quantiles/medians are computed on the full frame before any train/test split
# -- confirm both are intended.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
for col in numeric_cols:
    q_low = df[col].quantile(0.001)
    q_high = df[col].quantile(0.999)
    clipped = np.clip(df[col], q_low, q_high)
    df[col] = clipped.fillna(clipped.median())

# Derived feature: temperature relative to humidity. The small epsilon guards
# against division by zero; any +/-inf that still appears is treated as missing
# and imputed with the median of the ratio.
if 'Temperature' in df.columns and 'Humidity' in df.columns:
    temp_humidity_ratio = df['Temperature'] / (df['Humidity'] + 1e-5)
    temp_humidity_ratio = temp_humidity_ratio.replace([np.inf, -np.inf], np.nan)
    df['Temp_Humidity_Ratio'] = temp_humidity_ratio.fillna(temp_humidity_ratio.median())


In [None]:
# One-hot encode whichever of the known categorical columns are present
# (first level dropped).
categorical_cols = [name for name in ('Vehicle_Type', 'Crop_Type') if name in df.columns]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate the regression target from the feature matrix.
X = df.drop(columns=['Spoilage_Risk'])
y = df['Spoilage_Risk']

# 80/20 random hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _finite_and_bounded(values):
    """Replace NaN/inf with finite numbers, then bound everything to [-1e5, 1e5]."""
    finite = np.nan_to_num(values, nan=0.0, posinf=1e10, neginf=-1e10)
    return np.clip(finite, -1e5, 1e5)


# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = RobustScaler()
X_train_scaled = _finite_and_bounded(scaler.fit_transform(X_train))
X_test_scaled = _finite_and_bounded(scaler.transform(X_test))

# Randomized hyper-parameter search for the XGBoost regressor.
# The grid has 8 combinations and only 4 are sampled, so the search itself is
# random; `random_state` is fixed on the search (not only on the estimator) so
# the same candidates and CV shuffling are used on every run.
xgb = XGBRegressor(random_state=42, missing=np.nan)
xgb_params = {
    'n_estimators': [100, 300],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1]
}
xgb_search = RandomizedSearchCV(
    xgb,
    xgb_params,
    n_iter=4,
    cv=3,
    scoring='neg_root_mean_squared_error',  # higher (closer to 0) is better
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
xgb_search.fit(X_train_scaled, y_train)

# Randomized hyper-parameter search for the random-forest regressor.
# Only 4 of the 8 grid combinations are sampled, so `random_state` is fixed on
# the search as well as on the estimator to make the selected model repeatable.
rf = RandomForestRegressor(random_state=42)
rf_params = {
    'n_estimators': [100, 300],
    'max_depth': [10, 30],
    'min_samples_split': [2, 10]
}
rf_search = RandomizedSearchCV(
    rf,
    rf_params,
    n_iter=4,
    cv=3,
    scoring='neg_root_mean_squared_error',  # higher (closer to 0) is better
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rf_search.fit(X_train_scaled, y_train)

def risk_label(val, high_threshold=1.3, low_threshold=1.1):
    """Map a numeric spoilage-risk score to a categorical risk label.

    Args:
        val: The numeric score to classify.
        high_threshold: Scores strictly greater than this are 'High Risk'.
            Defaults to 1.3.
        low_threshold: Scores strictly less than this are 'Low Risk'.
            Defaults to 1.1.

    Returns:
        'High Risk', 'Low Risk' or 'Medium Risk'. Scores equal to either
        threshold, and NaN (which compares False against both), are
        'Medium Risk'.
    """
    if val > high_threshold:
        return 'High Risk'
    if val < low_threshold:
        return 'Low Risk'
    return 'Medium Risk'

# Bucket the held-out target values into risk classes with `risk_label`; these
# serve as the ground truth for the classification-style metrics computed later.
y_true_labels = y_test.apply(risk_label)

In [None]:
def evaluate_regression_model(model, name):
    """Print regression and bucketed-classification metrics for a fitted model.

    Predictions are made on the notebook-level `X_test_scaled`, compared to
    `y_test` for MSE/MAE, and bucketed with `risk_label` for comparison against
    `y_true_labels` (accuracy, weighted F1, classification report, confusion
    matrix heatmap).

    Args:
        model: Fitted estimator exposing `predict`.
        name: Display name used in the printed header and the plot title.
    """
    predictions = model.predict(X_test_scaled)
    predicted_classes = pd.Series(predictions).apply(risk_label)

    accuracy = accuracy_score(y_true_labels, predicted_classes)
    weighted_f1 = f1_score(y_true_labels, predicted_classes, average='weighted')
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)

    header_lines = (
        f"{name} Results:",
        f"Accuracy: {accuracy:.4f}",
        f"F1 Score (weighted): {weighted_f1:.4f}",
        f"MSE: {squared_error:.4f}",
        f"MAE: {absolute_error:.4f}",
        "Classification Report:",
    )
    for line in header_lines:
        print(line)
    print(classification_report(y_true_labels, predicted_classes))

    # Fix the class order so rows/columns line up with the short tick labels.
    class_order = ["High Risk", "Medium Risk", "Low Risk"]
    tick_names = ["High", "Medium", "Low"]
    matrix = confusion_matrix(y_true_labels, predicted_classes, labels=class_order)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=tick_names,
        yticklabels=tick_names,
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix: {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()

# Report the best estimator found by each randomized search, XGBoost first.
for fitted_search, display_name in (
    (xgb_search, "XGBoost Regressor"),
    (rf_search, "Random Forest Regressor"),
):
    evaluate_regression_model(fitted_search.best_estimator_, display_name)

In [None]:
# Build sliding-window sequences for the LSTM: each sample is the previous
# `time_steps` feature rows, and its target is the value at the following row.
time_steps = 20

n_rows = len(X)
if n_rows <= time_steps:
    raise ValueError(
        f"Need more than {time_steps} rows to build LSTM windows, got {n_rows}."
    )

# Decide the ordered 80/20 split *before* scaling so the scalers can be fitted
# on training rows only. Training windows/targets use rows [0, train_row_end).
split = int(0.8 * (n_rows - time_steps))
train_row_end = time_steps + split

scaler_x_lstm = MinMaxScaler()
scaler_y_lstm = MinMaxScaler()
y_column = y.values.reshape(-1, 1)

# Fit on the training portion only, then transform everything. Fitting on the
# full data would leak the test rows' min/max into training. Test values may
# therefore fall slightly outside [0, 1], which is expected.
scaler_x_lstm.fit(X.iloc[:train_row_end])
scaler_y_lstm.fit(y_column[:train_row_end])
X_scaled_lstm = scaler_x_lstm.transform(X)
y_scaled_lstm = scaler_y_lstm.transform(y_column)

# Window i covers rows [i - time_steps, i) and is paired with the target at row i.
X_seq = np.array(
    [X_scaled_lstm[i - time_steps:i] for i in range(time_steps, n_rows)]
)
y_seq = y_scaled_lstm[time_steps:]

# Ordered (non-shuffled) split: first 80% of windows train, the rest test.
X_train_lstm, X_test_lstm = X_seq[:split], X_seq[split:]
y_train_lstm, y_test_lstm = y_seq[:split], y_seq[split:]

# Fix TensorFlow's global seed so weight initialisation and dropout are
# repeatable between runs (best effort; some ops may still be nondeterministic).
tf.random.set_seed(42)

# Two stacked LSTM layers followed by a single linear output unit.
model = Sequential([
    Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    BatchNormalization(),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# validation_split holds out the last 10% of the training windows.
# restore_best_weights=True makes the model keep the weights from the best
# validation epoch instead of those from up to `patience` epochs later.
# NOTE(review): ReduceLROnPlateau is left at its defaults; its default patience
# is longer than EarlyStopping's 5 epochs -- confirm that is intended.
model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
    callbacks=[
        EarlyStopping(patience=5, restore_best_weights=True),
        ReduceLROnPlateau(),
    ],
)

# Predict on the held-out windows (still in scaled target units).
y_pred_lstm = model.predict(X_test_lstm).flatten()


def _to_original_units(scaled_values):
    """Undo the target scaling and return a flat 1-D array."""
    as_column = scaled_values.reshape(-1, 1)
    return scaler_y_lstm.inverse_transform(as_column).flatten()


y_pred_lstm_real = _to_original_units(y_pred_lstm)
y_test_lstm_real = _to_original_units(y_test_lstm)

# Bucket predictions and ground truth into risk classes.
y_lstm_true_labels = pd.Series(y_test_lstm_real).apply(risk_label)
y_lstm_labels = pd.Series(y_pred_lstm_real).apply(risk_label)

# Classification-style metrics on the buckets, regression metrics on raw values.
acc_lstm = accuracy_score(y_lstm_true_labels, y_lstm_labels)
f1_lstm = f1_score(y_lstm_true_labels, y_lstm_labels, average='weighted')
mse_lstm = mean_squared_error(y_test_lstm_real, y_pred_lstm_real)
mae_lstm = mean_absolute_error(y_test_lstm_real, y_pred_lstm_real)

In [None]:
# Print the LSTM metrics in the same layout used for the tree-based models.
lstm_summary_lines = (
    "\nLSTM Results:",
    f"Accuracy: {acc_lstm:.4f}",
    f"F1 Score (weighted): {f1_lstm:.4f}",
    f"MSE: {mse_lstm:.4f}",
    f"MAE: {mae_lstm:.4f}",
    "Classification Report:",
)
for summary_line in lstm_summary_lines:
    print(summary_line)
print(classification_report(y_lstm_true_labels, y_lstm_labels))

# Confusion matrix with a fixed class order so it matches the short tick labels.
lstm_class_order = ["High Risk", "Medium Risk", "Low Risk"]
lstm_tick_names = ["High", "Medium", "Low"]
cm_lstm = confusion_matrix(y_lstm_true_labels, y_lstm_labels, labels=lstm_class_order)

fig_lstm, ax_lstm = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_lstm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=lstm_tick_names,
    yticklabels=lstm_tick_names,
    ax=ax_lstm,
)
ax_lstm.set_title("Confusion Matrix: LSTM")
ax_lstm.set_xlabel("Predicted")
ax_lstm.set_ylabel("Actual")
fig_lstm.tight_layout()
plt.show()
