In [None]:
# !pip install tensorflow # Uncomment if needed

# --- Standard Imports ---
import pandas as pd
import numpy as np
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler # <--- For LSTM scaling
from IPython.display import display
import traceback
import warnings
import os

# --- Keras / TensorFlow Imports ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Keep the notebook output readable by silencing library warnings.
warnings.filterwarnings("ignore")
# Optional reproducibility seeds (disabled by default):
# tf.random.set_seed(42)
# np.random.seed(42)

# --- Configuration ---
# Input CSV locations
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

# Price columns that must be present; the model focuses on PRIMARY_TARGET only.
TARGET_COLUMNS = ["avg_min_price", "avg_max_price", "avg_modal_price"]
PRIMARY_TARGET = "avg_modal_price"

# Name of the assembled date column and of the columns it is built from.
DATE_COLUMN = "full_date"
YEAR_COL = "year"
MONTH_COL = "month"
DAY_COL = "date"
VALIDATION_YEAR = 2024

# Human-readable filter selections (looked up in the encoding maps below).
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# Frequency encoding maps: lowercase name -> encoded value (CONFIRM THESE ARE CORRECT)
state_name_encoding_map = {"maharashtra": 6291}
district_name_encoding_map = {"nashik": 6291}
commodity_name_encoding_map = {"wheat": 6291}

# --- LSTM Configuration ---
# Number of past days used to predict the next day - NEEDS TUNING
SEQUENCE_LENGTH = 60
# Number of units in the LSTM layer - NEEDS TUNING
LSTM_UNITS = 50
# Dropout rate used for regularization
DROPOUT_RATE = 0.2
# Maximum number of epochs (EarlyStopping may end training sooner) - NEEDS TUNING
EPOCHS = 50
# Batch size for training - NEEDS TUNING
BATCH_SIZE = 32

# --- Helper Functions (Outlier removal, Loading - Minor changes needed) ---
def remove_outliers_iqr(df, columns_to_check):
    """Return a copy of ``df`` without rows flagged as IQR outliers.

    A row is dropped when, in any checked column, its value is below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.  Names in
    ``columns_to_check`` that are not columns of ``df`` or are not numeric
    are ignored; if none remain, an unfiltered copy is returned.  The input
    frame is never modified.
    """
    frame = df.copy()
    rows_before = len(frame)

    numeric_cols = []
    for name in columns_to_check:
        if name in frame.columns and pd.api.types.is_numeric_dtype(frame[name]):
            numeric_cols.append(name)
    if not numeric_cols:
        return frame

    values = frame[numeric_cols]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Strict </> comparisons: NaN compares False on both sides, so rows with
    # missing values are not treated as outliers.
    too_low = values < (lower_quartile - 1.5 * spread)
    too_high = values > (upper_quartile + 1.5 * spread)
    keep_rows = ~(too_low | too_high).any(axis=1)

    frame = frame[keep_rows]
    rows_removed = rows_before - len(frame)
    if rows_removed > 0:
        print(f"Removed {rows_removed} rows via IQR.")
    return frame

# --- Data Loading and Preprocessing Function (Simplified for LSTM base) ---
def load_and_preprocess_base_data(path, date_col_name, year_col, month_col, day_col, all_potential_targets, dataset_name="Training"):
    """Read a price CSV and reduce it to a clean, date-sorted base frame.

    The date column ``date_col_name`` is assembled from the year/month/day
    columns, only the date, target and filter columns are kept, targets are
    coerced to numeric, and rows failing any conversion are dropped.
    Progress and error messages are printed.

    Returns the processed DataFrame, or ``None`` when the file is missing, a
    required column is absent, a filter column is not numeric, or any other
    exception is raised while loading.
    """
    banner = "-" * 30
    print(banner)
    print(f"Processing {dataset_name} Dataset")
    print(banner)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        frame = pd.read_csv(path)
        print(f"Loaded {len(frame)} rows.")

        # 1. Construct Date
        date_parts = [year_col, month_col, day_col]
        absent_parts = [c for c in date_parts if c not in frame.columns]
        if absent_parts:
            print(f"Error: Date component cols missing: {absent_parts}")
            return None
        for part in date_parts:
            frame[part] = pd.to_numeric(frame[part], errors='coerce')
        frame.dropna(subset=date_parts, inplace=True)
        print(f"Constructing '{date_col_name}'...")
        frame[date_col_name] = pd.to_datetime(
            {'year': frame[year_col], 'month': frame[month_col], 'day': frame[day_col]},
            errors='coerce',
        )
        rows_before_date_drop = len(frame)
        frame.dropna(subset=[date_col_name], inplace=True)
        rows_dropped = rows_before_date_drop - len(frame)
        if rows_dropped > 0:
            print(f"Dropped {rows_dropped} rows due to invalid date components.")
        print(f"{len(frame)} rows after date construction.")

        # 2. Keep ONLY necessary columns for filtering + target + date
        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        keep_cols = [date_col_name] + all_potential_targets + required_numeric_filter_cols
        absent_required = [c for c in keep_cols if c not in frame.columns]
        if absent_required:
            print(f"Error: Required columns missing: {absent_required}")
            print(f"Available: {frame.columns.tolist()}")
            return None
        frame = frame[keep_cols]

        # 3. Ensure Price/Target columns are numeric
        for target in all_potential_targets:
            frame[target] = pd.to_numeric(frame[target], errors='coerce')
        frame.dropna(subset=all_potential_targets, how='any', inplace=True)
        print(f"{len(frame)} rows after ensuring price columns numeric.")

        # 4. Ensure Filter columns are numeric (encoded)
        for filter_col in required_numeric_filter_cols:
            if not pd.api.types.is_numeric_dtype(frame[filter_col]):
                print(f"Error: Col '{filter_col}' expected numeric but isn't.")
                return None

        # 5. Apply IQR Outlier Removal (Optional, on targets)
        # frame = remove_outliers_iqr(frame, all_potential_targets)

        frame.sort_values(date_col_name, inplace=True)
        print(f"{dataset_name} base data loaded. {len(frame)} rows.")
        return frame

    except FileNotFoundError:
        print(f"Error: {dataset_name} file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading/preprocessing {dataset_name}: {e}")
        traceback.print_exc()
        return None


# --- Sequence Creation Function ---
def create_sequences(data, sequence_length):
    """Build sliding-window samples for next-step prediction.

    Window ``i`` holds ``data[i : i + sequence_length]`` and its label is
    ``data[i + sequence_length]``.

    Args:
        data: Array-like ordered along axis 0, e.g. shape ``(n, features)``
            or ``(n,)``.
        sequence_length: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n - sequence_length, sequence_length, *feature_dims)`` and
        ``(n - sequence_length, *feature_dims)``.  When ``data`` has no more
        than ``sequence_length`` rows, both arrays have zero samples but keep
        the trailing dimensions, so ``X.shape[0] == 0`` still signals
        "not enough data" while the array layout stays consistent.
    """
    data = np.asarray(data)
    feature_dims = data.shape[1:]
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empties
        # instead of a flat shape-(0,) array.
        empty_X = np.empty((0, sequence_length) + feature_dims, dtype=data.dtype)
        empty_y = np.empty((0,) + feature_dims, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + sequence_length] for start in range(n_windows)])
    # The label of window i is row i + sequence_length, i.e. the tail of data.
    y = data[sequence_length:].copy()
    return X, y

# --- Evaluation Metrics Function (No changes) ---
def calculate_metrics(y_true, y_pred):
    """Return ``(r2, mae, mse)`` for the given actual and predicted values.

    Both inputs are flattened to 1-D, and positions where either value is
    NaN are excluded.  If nothing remains, or if a metric computation
    raises, a message is printed and ``(nan, nan, nan)`` is returned.
    """
    actual = np.array(y_true).flatten()
    predicted = np.array(y_pred).flatten()

    # Keep only positions where both the actual and the prediction are present.
    usable = ~(np.isnan(actual) | np.isnan(predicted))
    actual, predicted = actual[usable], predicted[usable]

    if len(actual) == 0:
        print("Warning: No valid points for metric calculation.")
        return np.nan, np.nan, np.nan

    try:
        return (
            r2_score(actual, predicted),
            mean_absolute_error(actual, predicted),
            mean_squared_error(actual, predicted),
        )
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return np.nan, np.nan, np.nan


# --- Plotting Function for Validation (Adapted for LSTM Output) ---
def plot_lstm_validation_results(dates_val, actuals_inv, preds_inv, target_column, title):
    """Build a Plotly figure comparing actual and predicted validation prices.

    ``actuals_inv`` and ``preds_inv`` are flattened with ``.flatten()`` and
    plotted against ``dates_val``; ``target_column`` supplies the label used
    in the legend and y-axis.  The figure is returned, not shown.
    """
    # Imported inside the function so plotly is only required when plotting.
    import plotly.graph_objects as go

    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    actual_trace = go.Scatter(
        x=dates_val,
        y=actuals_inv.flatten(),
        mode='lines+markers',
        name=f'Actual {target_label} ({VALIDATION_YEAR})',
        line=dict(color='blue'),
        marker=dict(size=4),
    )
    predicted_trace = go.Scatter(
        x=dates_val,
        y=preds_inv.flatten(),
        mode='lines',
        name=f'Predicted {target_label} ({VALIDATION_YEAR})',
        line=dict(color='red'),
    )

    fig = go.Figure()
    fig.add_trace(actual_trace)
    fig.add_trace(predicted_trace)
    fig.update_layout(
        title=title,
        xaxis_title=f'Date ({VALIDATION_YEAR})',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend',
    )
    return fig


# --- Main Execution Block ---
print("--- LSTM Forecasting & Validation ---")
print(f"--- (Nashik/Wheat: 2002-2023 Train, {VALIDATION_YEAR} Validate) ---")

# 1. Load Base Data (each call returns None on failure)
df_train_base = load_and_preprocess_base_data(
    path=DATA_PATH_TRAIN,
    date_col_name=DATE_COLUMN,
    year_col=YEAR_COL,
    month_col=MONTH_COL,
    day_col=DAY_COL,
    all_potential_targets=TARGET_COLUMNS,
    dataset_name="Training (2002-2023)",
)
df_val_base = load_and_preprocess_base_data(
    path=DATA_PATH_VALIDATION,
    date_col_name=DATE_COLUMN,
    year_col=YEAR_COL,
    month_col=MONTH_COL,
    day_col=DAY_COL,
    all_potential_targets=TARGET_COLUMNS,
    dataset_name=f"Validation ({VALIDATION_YEAR})",
)

# The model stays None until it has been built and trained further below.
model = None

if df_train_base is not None and df_val_base is not None:
    # 2. Get Encoded Values for Filtering
    try:
        # Normalise the selections to the lowercase keys used by the encoding maps.
        selected_state_key = SELECTED_STATE_STR.strip().lower()
        selected_district_key = SELECTED_DISTRICT_STR.strip().lower()
        selected_commodity_key = SELECTED_COMMODITY_STR.strip().lower()

        encoded_state = state_name_encoding_map.get(selected_state_key)
        encoded_district = district_name_encoding_map.get(selected_district_key)
        encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)

        # Report every missing mapping before deciding whether to continue.
        lookup_failed = False
        if encoded_state is None:
            print(f"Error: State '{SELECTED_STATE_STR}' missing map.")
            lookup_failed = True
        if encoded_district is None:
            print(f"Error: District '{SELECTED_DISTRICT_STR}' missing map.")
            lookup_failed = True
        if encoded_commodity is None:
            print(f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing map.")
            lookup_failed = True

        if not lookup_failed:
            print(f"\nSelected: {SELECTED_STATE_STR}/{SELECTED_DISTRICT_STR}/{SELECTED_COMMODITY_STR} -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")
        else:
            print("Check maps.")
            # Clearing both frames makes the following stage skip itself.
            df_train_base = df_val_base = None
    except Exception as e:
        print(f"Error mapping lookup: {e}")
        df_train_base = df_val_base = None

# Stages 3-8: filter both datasets to the selected state/district/commodity, scale
# the target series, build sliding windows, train the LSTM, then evaluate and plot
# on the validation windows. Runs only if loading and the encoding lookup succeeded.
if df_train_base is not None and df_val_base is not None:
    # 3. Filtering Data
    # Keep only rows whose three encoded columns all equal the selected codes;
    # an empty DataFrame stands in when the filter columns are missing.
    print(f"\nFiltering datasets using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']
    if not all(col in df_train_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Training."); filtered_df_train = pd.DataFrame()
    else: filtered_df_train = df_train_base[(df_train_base['state_name'] == encoded_state) & (df_train_base['district_name'] == encoded_district) & (df_train_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)
    if not all(col in df_val_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Validation."); filtered_df_val = pd.DataFrame()
    else: filtered_df_val = df_val_base[(df_val_base['state_name'] == encoded_state) & (df_val_base['district_name'] == encoded_district) & (df_val_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df_train.empty: print("\nWarning: No training data after filtering.")
    if filtered_df_val.empty: print("\nWarning: No validation data after filtering.")

    # 4. Prepare Data for LSTM (Focusing on PRIMARY_TARGET)
    # If either filtered frame is empty this stage is skipped, `model` stays None
    # and stage 7 prints the "skipping" message instead.
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nPreparing data for LSTM (Target: {PRIMARY_TARGET})...")

        # Select target series and ensure correct shape (n_samples, 1)
        train_series = filtered_df_train[PRIMARY_TARGET].values.reshape(-1, 1)
        val_series = filtered_df_val[PRIMARY_TARGET].values.reshape(-1, 1)
        val_dates = filtered_df_val[DATE_COLUMN].values # Keep dates for plotting

        # Scale data (Fit only on training data)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_train_data = scaler.fit_transform(train_series)
        scaled_val_data = scaler.transform(val_series) # Use same scaler

        # Create sequences
        # NOTE(review): validation windows are built from validation rows only, so
        # the first SEQUENCE_LENGTH validation rows never receive a prediction.
        print(f"Creating sequences with length {SEQUENCE_LENGTH}...")
        X_train, y_train = create_sequences(scaled_train_data, SEQUENCE_LENGTH)
        X_val, y_val = create_sequences(scaled_val_data, SEQUENCE_LENGTH)

        # Validation dates need adjustment: remove first `SEQUENCE_LENGTH` dates
        # as they don't have a corresponding prediction in this simple setup.
        dates_val_for_plotting = val_dates[SEQUENCE_LENGTH:]
        # Actual validation values corresponding to predictions
        y_val_actual_unscaled = val_series[SEQUENCE_LENGTH:] # Unscaled actuals

        # A dataset with no more than SEQUENCE_LENGTH rows yields zero windows.
        if X_train.shape[0] == 0 or X_val.shape[0] == 0:
             print("Error: Not enough data to create sequences for training or validation after filtering.")
             model = None # Prevent proceeding
        else:
             print(f"Training sequences shape: X={X_train.shape}, y={y_train.shape}")
             print(f"Validation sequences shape: X={X_val.shape}, y={y_val.shape}")

             # --- 5. Build LSTM Model ---
             # Single LSTM layer -> dropout -> one-unit dense output, trained on MSE.
             print("\nBuilding LSTM model...")
             model = Sequential()
             model.add(LSTM(LSTM_UNITS, activation='relu', # Or 'tanh'
                            input_shape=(SEQUENCE_LENGTH, 1))) # Input: sequence_length time steps, 1 feature
             model.add(Dropout(DROPOUT_RATE))
             # Add more LSTM layers if needed:
             # model.add(LSTM(LSTM_UNITS // 2, activation='relu', return_sequences=True))
             # model.add(Dropout(DROPOUT_RATE))
             model.add(Dense(1)) # Output layer: predicts 1 value

             model.compile(optimizer='adam', loss='mean_squared_error')
             model.summary()

             # --- 6. Train LSTM Model ---
             # NOTE(review): early stopping monitors val_loss on (X_val, y_val), the
             # same windows scored in stage 7, so the "FINAL" metrics are measured on
             # data that also selected the restored weights.
             print("\nStarting LSTM model training...")
             early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

             history = model.fit(
                 X_train, y_train,
                 epochs=EPOCHS,
                 batch_size=BATCH_SIZE,
                 validation_data=(X_val, y_val),
                 callbacks=[early_stopping],
                 verbose=1 # Set to 1 or 2 for progress, 0 for silent
             )
             print("Training finished.")

    # --- 7. Evaluate on Validation Set ---
    # The locals() check covers runs where stage 4 was skipped and X_val was never assigned.
    if model is not None and 'X_val' in locals() and X_val.shape[0] > 0:
        print(f"\n--- Evaluating FINAL LSTM Model Performance on {VALIDATION_YEAR} Validation Data ---")
        print("Predicting on validation data...")
        predictions_scaled = model.predict(X_val)

        # Inverse transform predictions and actuals
        print("Inverse transforming scaled predictions and actuals...")
        try:
             # Predictions go back to price units; the actuals were kept unscaled.
             predictions_inv = scaler.inverse_transform(predictions_scaled)
             # y_val_actual_unscaled was stored earlier
             actuals_inv = y_val_actual_unscaled

             # Ensure shapes match for metric calculation
             # Defensive: both arrays are cut to the shorter length, dates included.
             min_len_eval = min(len(actuals_inv), len(predictions_inv))
             if len(actuals_inv) != len(predictions_inv):
                 print(f"Warning: Length mismatch after prediction/scaling. Actuals: {len(actuals_inv)}, Preds: {len(predictions_inv)}. Truncating.")
             actuals_inv = actuals_inv[:min_len_eval]
             predictions_inv = predictions_inv[:min_len_eval]
             dates_val_for_plotting = dates_val_for_plotting[:min_len_eval] # Adjust dates too

             # Calculate metrics
             if len(actuals_inv) > 0:
                 r2_val, mae_val, mse_val = calculate_metrics(actuals_inv, predictions_inv)
                 print(f"FINAL Validation R-squared (R2): {r2_val:.4f}")
                 print(f"FINAL Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                 print(f"FINAL Validation Mean Squared Error (MSE): {mse_val:.2f}")

                 # --- 8. Plot Validation Results ---
                 print(f"\n--- Plotting FINAL Validation Results for {PRIMARY_TARGET} (Actual vs. Predicted {VALIDATION_YEAR}) ---")
                 plot_title_val = f'LSTM Validation (Nashik/Wheat): {PRIMARY_TARGET.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted {VALIDATION_YEAR})'
                 fig_val = plot_lstm_validation_results(dates_val_for_plotting, actuals_inv, predictions_inv, PRIMARY_TARGET, plot_title_val)
                 fig_val.show()
             else:
                 print("Skipping metrics and plotting: No valid data points after alignment/scaling.")

        except Exception as e:
             print(f"Error during prediction, scaling or evaluation: {e}")
             traceback.print_exc()

    elif model is None:
         print("\nSkipping evaluation and plotting because model training failed or insufficient data.")
    # NOTE(review): this branch needs a model together with a missing or empty X_val;
    # the model is only built when X_val is non-empty, so it looks unreachable in a fresh run.
    else: # Handle cases where filtering worked but sequence creation failed
         print("\nCannot proceed: lack of data after filtering or sequence creation.")
else:
    print("\nFailed during data loading, preprocessing, or mapping lookup.")

print("\nProcess finished.")


--- LSTM Forecasting & Validation ---
--- (Nashik/Wheat: 2002-2023 Train, 2024 Validate) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Loaded 6246 rows.
Constructing 'full_date'...
6246 rows after date construction.
6246 rows after ensuring price columns numeric.
Training (2002-2023) base data loaded. 6246 rows.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Loaded 278 rows.
Constructing 'full_date'...
278 rows after date construction.
278 rows after ensuring price columns numeric.
Validation (2024) base data loaded. 278 rows.

Selected: Maharashtra/Nashik/Wheat -> Encoded: St=6291, Di=6291, Co=6291

Filtering da


Process finished.


## LSTM with hyperparameter tuning

In [2]:
# !pip install tensorflow keras-tuner # Uncomment if needed

# --- Standard Imports ---
import pandas as pd
import numpy as np
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
import traceback
import warnings
import os

# --- Keras / TensorFlow Imports ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam # For tuning learning rate
# Or alternatively, sometimes just:
# from tensorflow.keras import optimizers # And then use optimizers.Adam later
import keras_tuner as kt # <--- Import KerasTuner

# Keep the notebook output readable: silence Python warnings and TF logs.
warnings.filterwarnings("ignore")
# Suppress TensorFlow INFO messages
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Optional reproducibility seeds (disabled by default):
# tf.random.set_seed(42)
# np.random.seed(42)

# --- Configuration ---
# Input CSV locations
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

# Price columns that must be present; the focus is on PRIMARY_TARGET only.
TARGET_COLUMNS = ["avg_min_price", "avg_max_price", "avg_modal_price"]
PRIMARY_TARGET = "avg_modal_price"

# Name of the assembled date column and of the columns it is built from.
DATE_COLUMN = "full_date"
YEAR_COL = "year"
MONTH_COL = "month"
DAY_COL = "date"
VALIDATION_YEAR = 2024

# Human-readable filter selections (looked up in the encoding maps below).
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# Frequency encoding maps: lowercase name -> encoded value (CONFIRM THESE ARE CORRECT)
state_name_encoding_map = {"maharashtra": 6291}
district_name_encoding_map = {"nashik": 6291}
commodity_name_encoding_map = {"wheat": 6291}

# --- LSTM & Tuner Configuration ---
# Number of past days per input window - kept fixed during tuning for simplicity
SEQUENCE_LENGTH = 60
# The hyperparameters below are tuned by KerasTuner; their ranges are
# defined in the model-building function rather than here.
# LSTM_UNITS = 50
# DROPOUT_RATE = 0.2
# LEARNING_RATE = 0.001 # Example default, will be tuned
# How many different hyperparameter sets to try
KERAS_TUNER_MAX_TRIALS = 20
# How many times to train each set (for stability)
KERAS_TUNER_EXECUTIONS = 2
# Folder in which the tuner saves its results
KERAS_TUNER_PROJECT_NAME = "lstm_wheat_price_tuning"
# Max epochs *per trial* during the search (EarlyStopping recommended)
SEARCH_EPOCHS = 20
# Max epochs for training the *final* best model
FINAL_EPOCHS = 100
BATCH_SIZE = 32

# --- Helper Functions (remove_outliers_iqr, Loading, Metrics, Plotting - NO CHANGES needed) ---
def remove_outliers_iqr(df, columns_to_check):
    """Drop rows that are IQR outliers in any checked column; return a copy.

    Only names in ``columns_to_check`` that are numeric columns of ``df``
    take part.  A value counts as an outlier when it is below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` of its column.  With no
    usable column the copy is returned unfiltered; ``df`` is left untouched.
    """
    result = df.copy()
    original_count = len(result)

    usable_cols = []
    for candidate in columns_to_check:
        if candidate in result.columns and pd.api.types.is_numeric_dtype(result[candidate]):
            usable_cols.append(candidate)
    if not usable_cols:
        return result

    checked = result[usable_cols]
    q_low = checked.quantile(0.25)
    q_high = checked.quantile(0.75)
    iqr = q_high - q_low

    # Strict </> comparisons: NaN compares False on both sides, so rows with
    # missing values are not treated as outliers.
    below_fence = checked < (q_low - 1.5 * iqr)
    above_fence = checked > (q_high + 1.5 * iqr)
    inlier_rows = ~(below_fence | above_fence).any(axis=1)

    result = result[inlier_rows]
    rows_removed = original_count - len(result)
    if rows_removed > 0:
        print(f"Removed {rows_removed} rows via IQR.")
    return result

def load_and_preprocess_base_data(path, date_col_name, year_col, month_col, day_col, all_potential_targets, dataset_name="Training"):
    """Read a price CSV and reduce it to a clean, date-sorted base frame.

    The date column ``date_col_name`` is assembled from the year/month/day
    columns, only the date, target and filter columns are kept, targets are
    coerced to numeric, and rows failing any conversion are dropped.
    Progress and error messages are printed.

    Returns the processed DataFrame, or ``None`` when the file is missing, a
    required column is absent, a filter column is not numeric, or any other
    exception is raised while loading.
    """
    separator = "-" * 30
    print(separator)
    print(f"Processing {dataset_name} Dataset")
    print(separator)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        table = pd.read_csv(path)
        print(f"Loaded {len(table)} rows.")

        # 1. Construct Date
        date_parts = [year_col, month_col, day_col]
        absent_parts = [c for c in date_parts if c not in table.columns]
        if absent_parts:
            print(f"Error: Date component cols missing: {absent_parts}")
            return None
        for part in date_parts:
            table[part] = pd.to_numeric(table[part], errors='coerce')
        table.dropna(subset=date_parts, inplace=True)
        print(f"Constructing '{date_col_name}'...")
        table[date_col_name] = pd.to_datetime(
            {'year': table[year_col], 'month': table[month_col], 'day': table[day_col]},
            errors='coerce',
        )
        count_before_date_drop = len(table)
        table.dropna(subset=[date_col_name], inplace=True)
        invalid_dates = count_before_date_drop - len(table)
        if invalid_dates > 0:
            print(f"Dropped {invalid_dates} rows due to invalid date components.")
        print(f"{len(table)} rows after date construction.")

        # 2. Keep ONLY necessary columns
        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        keep_cols = [date_col_name] + all_potential_targets + required_numeric_filter_cols
        absent_required = [c for c in keep_cols if c not in table.columns]
        if absent_required:
            print(f"Error: Required columns missing: {absent_required}")
            print(f"Available: {table.columns.tolist()}")
            return None
        table = table[keep_cols]

        # 3. Ensure Price/Target columns are numeric
        for target in all_potential_targets:
            table[target] = pd.to_numeric(table[target], errors='coerce')
        table.dropna(subset=all_potential_targets, how='any', inplace=True)
        print(f"{len(table)} rows after ensuring price columns numeric.")

        # 4. Ensure Filter columns are numeric
        for filter_col in required_numeric_filter_cols:
            if not pd.api.types.is_numeric_dtype(table[filter_col]):
                print(f"Error: Col '{filter_col}' expected numeric but isn't.")
                return None

        # 5. Apply IQR Outlier Removal (Optional)
        # table = remove_outliers_iqr(table, all_potential_targets)

        table.sort_values(date_col_name, inplace=True)
        print(f"{dataset_name} base data loaded. {len(table)} rows.")
        return table
    except FileNotFoundError:
        print(f"Error: {dataset_name} file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading/preprocessing {dataset_name}: {e}")
        traceback.print_exc()
        return None

def create_sequences(data, sequence_length):
    """Build sliding-window samples for next-step prediction.

    Window ``i`` holds ``data[i : i + sequence_length]`` and its label is
    ``data[i + sequence_length]``.

    Args:
        data: Array-like ordered along axis 0, e.g. shape ``(n, features)``
            or ``(n,)``.
        sequence_length: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n - sequence_length, sequence_length, *feature_dims)`` and
        ``(n - sequence_length, *feature_dims)``.  When ``data`` has no more
        than ``sequence_length`` rows, both arrays have zero samples but keep
        the trailing dimensions, so ``X.shape[0] == 0`` still signals
        "not enough data" while the array layout stays consistent.
    """
    data = np.asarray(data)
    feature_dims = data.shape[1:]
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Handle insufficient data with correctly shaped empties instead of a
        # flat shape-(0,) array.
        empty_X = np.empty((0, sequence_length) + feature_dims, dtype=data.dtype)
        empty_y = np.empty((0,) + feature_dims, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + sequence_length] for start in range(n_windows)])
    # The label of window i is row i + sequence_length, i.e. the tail of data.
    y = data[sequence_length:].copy()
    return X, y

def calculate_metrics(y_true, y_pred):
    """Return ``(r2, mae, mse)`` for the given actual and predicted values.

    Both inputs are flattened to 1-D, and positions where either value is
    NaN are excluded.  If nothing remains, or if a metric computation
    raises, a message is printed and ``(nan, nan, nan)`` is returned.
    """
    observed = np.array(y_true).flatten()
    forecast = np.array(y_pred).flatten()

    # Keep only positions where both the actual and the prediction are present.
    both_present = ~(np.isnan(observed) | np.isnan(forecast))
    observed, forecast = observed[both_present], forecast[both_present]

    if len(observed) == 0:
        print("Warning: No valid points for metric calculation.")
        return np.nan, np.nan, np.nan

    try:
        return (
            r2_score(observed, forecast),
            mean_absolute_error(observed, forecast),
            mean_squared_error(observed, forecast),
        )
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return np.nan, np.nan, np.nan

def plot_lstm_validation_results(dates_val, actuals_inv, preds_inv, target_column, title):
    """Build a Plotly figure comparing actual and predicted validation prices.

    ``actuals_inv`` and ``preds_inv`` are flattened with ``.flatten()`` and
    plotted against ``dates_val``; ``target_column`` supplies the label used
    in the legend and y-axis.  The figure is returned, not shown.
    """
    # Imported inside the function so plotly is only required when plotting.
    import plotly.graph_objects as go

    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    actual_trace = go.Scatter(
        x=dates_val,
        y=actuals_inv.flatten(),
        mode='lines+markers',
        name=f'Actual {target_label} ({VALIDATION_YEAR})',
        line=dict(color='blue'),
        marker=dict(size=4),
    )
    predicted_trace = go.Scatter(
        x=dates_val,
        y=preds_inv.flatten(),
        mode='lines',
        name=f'Predicted {target_label} ({VALIDATION_YEAR})',
        line=dict(color='red'),
    )

    fig = go.Figure()
    fig.add_trace(actual_trace)
    fig.add_trace(predicted_trace)
    fig.update_layout(
        title=title,
        xaxis_title=f'Date ({VALIDATION_YEAR})',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend',
    )
    return fig


# --- NEW: KerasTuner Model Building Function ---
def build_lstm_model(hp):
    """Create and compile the single-layer LSTM regressor for one tuner trial.

    `hp` is the KerasTuner hyperparameter object; it supplies the LSTM unit
    count, the dropout rate and the Adam learning rate. The network takes
    windows of shape (SEQUENCE_LENGTH, 1) and emits one value, trained with
    mean squared error.
    """
    # Search space, requested in a fixed order: units, dropout, learning rate.
    n_units = hp.Int('units', min_value=32, max_value=128, step=32)
    drop_rate = hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Input shape is (timesteps, features). The activation could also be
    # tuned, e.g. hp.Choice('activation', ['relu', 'tanh']).
    recurrent_layer = LSTM(units=n_units, activation='relu',
                           input_shape=(SEQUENCE_LENGTH, 1))
    dropout_layer = Dropout(rate=drop_rate)
    # Further LSTM/Dropout pairs (with their own tuned units/dropout) could be
    # stacked here before the output layer.
    output_layer = Dense(1)

    model = Sequential()
    for layer in (recurrent_layer, dropout_layer, output_layer):
        model.add(layer)

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model


# --- Main Execution Block ---
print("--- LSTM Forecasting & Validation with KerasTuner ---")
print(f"--- (Nashik/Wheat: 2002-2023 Train, {VALIDATION_YEAR} Validate) ---")

# 1. Load both base datasets; a failed load leaves the variable as None.
df_train_base = load_and_preprocess_base_data(
    DATA_PATH_TRAIN,
    DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL,
    TARGET_COLUMNS,
    "Training (2002-2023)",
)
df_val_base = load_and_preprocess_base_data(
    DATA_PATH_VALIDATION,
    DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL,
    TARGET_COLUMNS,
    f"Validation ({VALIDATION_YEAR})",
)

# Start from a clean slate so later steps can tell whether training happened.
final_model, scaler = None, None

# 2. Translate the selected state/district/commodity names into the encoded
#    values used by the filter columns. Any failure clears both DataFrames so
#    the remaining steps are skipped.
if not (df_train_base is None or df_val_base is None):
    try:
        selected_state_key = SELECTED_STATE_STR.strip().lower()
        selected_district_key = SELECTED_DISTRICT_STR.strip().lower()
        selected_commodity_key = SELECTED_COMMODITY_STR.strip().lower()

        encoded_state = state_name_encoding_map.get(selected_state_key)
        encoded_district = district_name_encoding_map.get(selected_district_key)
        encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)

        lookup_failed = False
        if encoded_state is None:
            print(f"Error: State '{SELECTED_STATE_STR}' missing map.")
            lookup_failed = True
        if encoded_district is None:
            print(f"Error: District '{SELECTED_DISTRICT_STR}' missing map.")
            lookup_failed = True
        if encoded_commodity is None:
            print(f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing map.")
            lookup_failed = True

        if not lookup_failed:
            print(f"\nSelected: {SELECTED_STATE_STR}/{SELECTED_DISTRICT_STR}/{SELECTED_COMMODITY_STR} -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")
        else:
            print("Check maps.")
            df_train_base = df_val_base = None
    except Exception as e:
        print(f"Error mapping lookup: {e}")
        df_train_base = df_val_base = None

# Steps 3-8 of the pipeline: filter both datasets to the selected
# state/district/commodity, scale and window the target series, run the
# KerasTuner search, train the final model and evaluate/plot it on the
# validation data. Skipped entirely when loading or the encoding lookup failed.
# NOTE(review): `kt`, `Adam`, the KERAS_TUNER_* constants, SEARCH_EPOCHS and
# FINAL_EPOCHS are not defined in this block - confirm they are imported and
# configured earlier in the cell.
if df_train_base is not None and df_val_base is not None:
    # 3. Filtering Data
    print(f"\nFiltering datasets using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']
    # Keep rows whose three encoded columns all equal the looked-up values, then
    # order them by date. If a filter column is missing, fall back to an empty
    # DataFrame so the emptiness checks below skip the rest.
    if not all(col in df_train_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Training."); filtered_df_train = pd.DataFrame()
    else: filtered_df_train = df_train_base[(df_train_base['state_name'] == encoded_state) & (df_train_base['district_name'] == encoded_district) & (df_train_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)
    if not all(col in df_val_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Validation."); filtered_df_val = pd.DataFrame()
    else: filtered_df_val = df_val_base[(df_val_base['state_name'] == encoded_state) & (df_val_base['district_name'] == encoded_district) & (df_val_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df_train.empty: print("\nWarning: No training data after filtering.")
    if filtered_df_val.empty: print("\nWarning: No validation data after filtering.")

    # 4. Prepare Data for LSTM (Target: PRIMARY_TARGET)
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nPreparing data for LSTM (Target: {PRIMARY_TARGET})...")
        # Single-feature series reshaped to (n_rows, 1) as required by the scaler.
        train_series = filtered_df_train[PRIMARY_TARGET].values.reshape(-1, 1)
        val_series = filtered_df_val[PRIMARY_TARGET].values.reshape(-1, 1)
        val_dates = filtered_df_val[DATE_COLUMN].values # Keep dates

        # Scale data: the scaler is fitted on the training series only and then
        # applied unchanged to the validation series.
        scaler = MinMaxScaler(feature_range=(0, 1)) # Assign to global scope
        scaled_train_data = scaler.fit_transform(train_series)
        scaled_val_data = scaler.transform(val_series)

        # Create sequences
        print(f"Creating sequences with length {SEQUENCE_LENGTH}...")
        X_train, y_train = create_sequences(scaled_train_data, SEQUENCE_LENGTH)
        X_val, y_val = create_sequences(scaled_val_data, SEQUENCE_LENGTH)
        # The first SEQUENCE_LENGTH validation rows only serve as input history,
        # so dates and unscaled actuals start at index SEQUENCE_LENGTH to line up
        # with y_val.
        dates_val_for_plotting = val_dates[SEQUENCE_LENGTH:] # Adjusted dates
        y_val_actual_unscaled = val_series[SEQUENCE_LENGTH:] # Adjusted unscaled actuals

        if X_train.shape[0] == 0 or X_val.shape[0] == 0:
             print("Error: Not enough data to create sequences. Try shorter SEQUENCE_LENGTH or check filters.")
        else:
             print(f"Training sequences shape: X={X_train.shape}, y={y_train.shape}")
             print(f"Validation sequences shape: X={X_val.shape}, y={y_val.shape}")

             # --- 5. Hyperparameter Tuning with KerasTuner ---
             # NOTE(review): (X_val, y_val) is used for the tuner objective, for
             # early stopping in the final fit and for the metrics reported in
             # step 7, so those metrics are not from untouched data.
             print("\nStarting KerasTuner hyperparameter search...")
             tuner = kt.BayesianOptimization(
                 hypermodel=build_lstm_model,
                 objective='val_loss', # Minimize validation loss
                 max_trials=KERAS_TUNER_MAX_TRIALS, # How many hyperparameter combinations to test
                 executions_per_trial=KERAS_TUNER_EXECUTIONS, # Train each combination multiple times for stability
                 directory='keras_tuner_dir', # Directory to store results
                 project_name=KERAS_TUNER_PROJECT_NAME,
                 overwrite=True # Overwrite previous logs for this project
             )

             # Define EarlyStopping for the search phase
             search_early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0) # Stop trials early

             # Run the search
             tuner.search(X_train, y_train,
                          epochs=SEARCH_EPOCHS, # Max epochs per trial
                          batch_size=BATCH_SIZE,
                          validation_data=(X_val, y_val),
                          callbacks=[search_early_stopping],
                          verbose=0 # Set to 1 to see trial progress
                         )

             print("\nHyperparameter search finished.")
             tuner.results_summary(num_trials=1) # Show the best trial summary

             # Get the optimal hyperparameters
             best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
             print(f"""
The hyperparameter search is complete. The optimal hyperparameters found are:
- Units: {best_hps.get('units')}
- Dropout: {best_hps.get('dropout'):.2f}
- Learning Rate: {best_hps.get('learning_rate')}
             """)

             # --- 6. Build and Train FINAL Model with Best Hyperparameters ---
             print("\nBuilding and training final model with best hyperparameters...")
             # Build a fresh (untrained) model with the best hyperparameters
             final_model = tuner.hypermodel.build(best_hps) # Recommended way

             # Define EarlyStopping for the final training phase
             final_early_stopping = EarlyStopping(monitor='val_loss', patience=10, # Possibly more patience for final run
                                                  restore_best_weights=True, verbose=1)

             # Train the final model
             history = final_model.fit(
                 X_train, y_train,
                 epochs=FINAL_EPOCHS, # Use potentially more epochs for final training
                 batch_size=BATCH_SIZE,
                 validation_data=(X_val, y_val),
                 callbacks=[final_early_stopping],
                 verbose=1
             )
             print("Final model training finished.")


    # --- 7. Evaluate FINAL Model on Validation Set ---
    # X_val is only assigned inside step 4, hence the name check before using it.
    if final_model is not None and scaler is not None and 'X_val' in locals() and X_val.shape[0] > 0:
        print(f"\n--- Evaluating FINAL Tuned LSTM Model on {VALIDATION_YEAR} Validation Data ---")
        print("Predicting...")
        predictions_scaled = final_model.predict(X_val)

        print("Inverse transforming...")
        try:
             # Predictions go back to price units; actuals were never scaled.
             predictions_inv = scaler.inverse_transform(predictions_scaled)
             actuals_inv = y_val_actual_unscaled # Stored earlier

             # Align lengths if necessary (e.g., if prediction output length differs slightly)
             min_len_eval = min(len(actuals_inv), len(predictions_inv))
             if len(actuals_inv) != len(predictions_inv):
                 print(f"Warning: Length mismatch final eval. Truncating to {min_len_eval}.")
             actuals_inv = actuals_inv[:min_len_eval]
             predictions_inv = predictions_inv[:min_len_eval]
             dates_val_plot = dates_val_for_plotting[:min_len_eval]

             # Calculate metrics
             if len(actuals_inv) > 0:
                 r2_val, mae_val, mse_val = calculate_metrics(actuals_inv, predictions_inv)
                 print(f"FINAL Tuned Validation R-squared (R2): {r2_val:.4f}")
                 print(f"FINAL Tuned Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                 print(f"FINAL Tuned Validation Mean Squared Error (MSE): {mse_val:.2f}")

                 # --- 8. Plot Validation Results ---
                 print(f"\n--- Plotting FINAL Validation Results for {PRIMARY_TARGET} (Actual vs. Predicted {VALIDATION_YEAR}) ---")
                 plot_title_val = f'TUNED LSTM Validation (Nashik/Wheat): {PRIMARY_TARGET.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted {VALIDATION_YEAR})'
                 fig_val = plot_lstm_validation_results(dates_val_plot, actuals_inv, predictions_inv, PRIMARY_TARGET, plot_title_val)
                 fig_val.show()
             else: print("Skipping metrics/plotting: No valid aligned data.")
        except Exception as e: print(f"Error during final prediction/scaling/eval: {e}"); traceback.print_exc()
    elif final_model is None: print("\nSkipping final evaluation (Model not trained).")
    else: print("\nCannot proceed: lack of data after filtering/sequencing.")
# Reached when loading failed or the encoding lookup cleared the DataFrames.
else: print("\nFailed: check data loading, preprocessing, or mapping lookup.")

print("\nProcess finished.")

--- LSTM Forecasting & Validation with KerasTuner ---
--- (Nashik/Wheat: 2002-2023 Train, 2024 Validate) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Loaded 6246 rows.
Constructing 'full_date'...
6246 rows after date construction.
6246 rows after ensuring price columns numeric.
Training (2002-2023) base data loaded. 6246 rows.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Loaded 278 rows.
Constructing 'full_date'...
278 rows after date construction.
278 rows after ensuring price columns numeric.
Validation (2024) base data loaded. 278 rows.

Selected: Maharashtra/Nashik/Wheat -> Encoded: St=6291, Di=6291, Co=629

KeyboardInterrupt: 

# new iteration

In [3]:
# !pip install tensorflow keras-tuner pandas numpy plotly scikit-learn ipython # Uncomment if needed

# --- Standard Imports ---
import pandas as pd
import numpy as np
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
import traceback
import warnings
import os
import plotly.graph_objects as go # Moved import here

# TensorFlow reads TF_CPP_MIN_LOG_LEVEL when its native library is loaded, so
# the variable has to be set BEFORE `import tensorflow`; setting it afterwards
# does not quieten the C++ logs. Level '2' hides INFO and WARNING messages
# ('1' hides INFO only, '3' additionally hides ERROR).
# In a notebook kernel where TensorFlow was already imported by an earlier
# cell, restart the kernel for this to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# --- Keras / TensorFlow Imports ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam # <--- Using standard TF Keras path
import keras_tuner as kt

# Suppress common Python warnings (applies to everything executed after this line)
warnings.filterwarnings("ignore")
# tf.random.set_seed(42) # Optional reproducibility
# np.random.seed(42)

# --- Configuration ---
# Data Paths (absolute, machine-specific paths; adjust before running elsewhere)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

# Target Columns & Date Construction Columns
# All TARGET_COLUMNS must be present and numeric in both CSVs; rows with a
# non-numeric value in any of them are dropped during loading.
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
PRIMARY_TARGET = 'avg_modal_price' # The single series the LSTM is trained and evaluated on
# DATE_COLUMN is the datetime column built from the year/month/day columns
# named here (the day-of-month column in the CSV is called 'date').
DATE_COLUMN = 'full_date'; YEAR_COL = 'year'; MONTH_COL = 'month'; DAY_COL = 'date'
# Used only in log messages and plot labels in this cell; no rows are filtered by year.
VALIDATION_YEAR = 2024

# Filter Selections (stripped and lower-cased before being looked up in the maps below)
SELECTED_STATE_STR = "Maharashtra"; SELECTED_DISTRICT_STR = "Nashik"; SELECTED_COMMODITY_STR = "Wheat"

# Frequency Encoding Maps (CONFIRM THESE ARE CORRECT)
# Keys are lower-case names; values are compared with == against the numeric
# state_name / district_name / commodity_name columns of both datasets, so a
# wrong value here silently filters out every row.
state_name_encoding_map = {"maharashtra": 6291}
district_name_encoding_map = {"nashik": 6291}
commodity_name_encoding_map = {"wheat": 6291}

# --- LSTM & Tuner Configuration ---
SEQUENCE_LENGTH = 60     # Number of past rows per input window (also the LSTM input length) - kept fixed during tuning for simplicity
KERAS_TUNER_MAX_TRIALS = 20 # How many different hyperparameter sets to try
KERAS_TUNER_EXECUTIONS = 2  # How many times to train each set (for stability)
KERAS_TUNER_PROJECT_NAME = 'lstm_wheat_price_tuning' # Project name passed to the tuner (results go under 'keras_tuner_dir')
SEARCH_EPOCHS = 20          # Max epochs *per trial* during search (EarlyStopping recommended)
FINAL_EPOCHS = 100         # Max epochs for training the *final* best model
BATCH_SIZE = 32            # Batch size for both the search and the final fit

# --- Helper Functions ---
def remove_outliers_iqr(df, columns_to_check):
    """Return a copy of `df` without rows that are IQR outliers.

    Only the entries of `columns_to_check` that exist in `df` and are numeric
    are examined. A row is dropped when any examined value lies below
    Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its column. The input frame is not
    modified; the number of dropped rows is printed when it is non-zero.
    """
    result = df.copy()

    numeric_cols = []
    for name in columns_to_check:
        if name in result.columns and pd.api.types.is_numeric_dtype(result[name]):
            numeric_cols.append(name)
    if len(numeric_cols) == 0:
        return result

    values = result[numeric_cols]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    # Comparisons with NaN are False, so rows with missing values are kept.
    too_low = values < low_fence
    too_high = values > high_fence
    is_outlier_row = (too_low | too_high).any(axis=1)

    kept = result[~is_outlier_row]
    n_dropped = len(result) - len(kept)
    if n_dropped > 0:
        print(f"Removed {n_dropped} rows via IQR.")
    return kept

def load_and_preprocess_base_data(path, date_col_name, year_col, month_col, day_col, all_potential_targets, dataset_name="Training"):
    """Load a price CSV, build a date column and keep only the essential columns.

    Args:
        path: Location of the CSV file.
        date_col_name: Name of the datetime column to create.
        year_col, month_col, day_col: Names of the columns holding the date
            components; non-numeric or missing components drop the row.
        all_potential_targets: Names of the price columns to keep (any
            sequence of strings). Rows with a non-numeric value in any of
            them are dropped.
        dataset_name: Label used in the progress messages.

    Returns:
        A DataFrame sorted by `date_col_name` with the columns
        ``[date_col_name, *targets, 'state_name', 'district_name',
        'commodity_name']``, or ``None`` when the file is missing, a required
        column is absent, a filter column is not numeric, or any other error
        occurs (the error is printed).
    """
    print("-" * 30); print(f"Processing {dataset_name} Dataset"); print("-" * 30)
    try:
        # Accept tuples and other sequences as well as lists; list
        # concatenation below would otherwise fail and return None.
        target_cols = list(all_potential_targets)

        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Loaded {len(df)} rows.")

        # 1. Construct Date
        date_components_cols = [year_col, month_col, day_col]
        missing_date_cols = [c for c in date_components_cols if c not in df.columns]
        if missing_date_cols:
            print(f"Error: Date component cols missing: {missing_date_cols}")
            return None
        for col in date_components_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=date_components_cols, inplace=True)
        print(f"Constructing '{date_col_name}'...")
        # Impossible dates (e.g. month 13) become NaT and are dropped next.
        df[date_col_name] = pd.to_datetime({'year': df[year_col], 'month': df[month_col], 'day': df[day_col]}, errors='coerce')
        initial_rows_date = len(df)
        df.dropna(subset=[date_col_name], inplace=True)
        if initial_rows_date > len(df):
            print(f"Dropped {initial_rows_date - len(df)} rows due to invalid date components.")
        print(f"{len(df)} rows after date construction.")

        # 2. Keep ONLY necessary columns
        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        keep_cols = [date_col_name] + target_cols + required_numeric_filter_cols
        missing_req_cols = [col for col in keep_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing: {missing_req_cols}")
            print(f"Available: {df.columns.tolist()}")
            return None
        # Explicit copy: the frame is modified in place below, which should
        # never act on a selection that pandas may treat as a view.
        df = df[keep_cols].copy()

        # 3. Ensure Price/Target columns are numeric
        for col in target_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=target_cols, how='any', inplace=True)
        print(f"{len(df)} rows after ensuring price columns numeric.")

        # 4. Ensure Filter columns are numeric (they are compared to encoded values later)
        for col in required_numeric_filter_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                print(f"Error: Col '{col}' expected numeric but isn't.")
                return None

        # 5. Apply IQR Outlier Removal (Optional)
        # df = remove_outliers_iqr(df, target_cols)
        df.sort_values(date_col_name, inplace=True)
        print(f"{dataset_name} base data loaded. {len(df)} rows.")
        return df
    except FileNotFoundError:
        print(f"Error: {dataset_name} file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading/preprocessing {dataset_name}: {e}")
        traceback.print_exc()
        return None

def create_sequences(data, sequence_length):
    """Build (inputs, targets) arrays of sliding windows for the LSTM.

    Window k covers data[k : k + sequence_length] and its target is
    data[k + sequence_length]. If `data` holds no more than
    `sequence_length` items, two empty arrays are returned.
    """
    if len(data) <= sequence_length:
        # Too short to form a single window plus target.
        return np.array([]), np.array([])

    window_starts = range(len(data) - sequence_length)
    inputs = [data[start:start + sequence_length] for start in window_starts]
    targets = [data[start + sequence_length] for start in window_starts]
    return np.array(inputs), np.array(targets)

def calculate_metrics(y_true, y_pred):
    """Calculate R2, MAE and MSE for paired actual/predicted values.

    Both inputs are converted to flat float arrays, so nested arrays are
    accepted and ``None`` entries count as NaN. Pairs in which either value
    is NaN are excluded before scoring.

    Returns:
        ``(r2, mae, mse)``. Three NaNs are returned, with a printed message,
        when the inputs cannot be converted to numbers, when their flattened
        lengths differ (scoring misaligned pairs would be meaningless), when
        no valid pair remains, or when a metric call fails.
    """
    nan_result = (np.nan, np.nan, np.nan)

    # Coerce up front: np.isnan raises on object arrays (e.g. containing None).
    try:
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
    except (TypeError, ValueError) as e:
        print(f"Error calculating metrics: {e}")
        return nan_result

    # The element-wise mask below requires equal lengths.
    if len(y_true) != len(y_pred):
        print(f"Warning: Length mismatch in metric inputs ({len(y_true)} vs {len(y_pred)}).")
        return nan_result

    valid_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
    y_true = y_true[valid_mask]
    y_pred = y_pred[valid_mask]
    if len(y_true) == 0:
        print("Warning: No valid points for metric calculation.")
        return nan_result

    try:
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return nan_result

def plot_lstm_validation_results(dates_val, actuals_inv, preds_inv, target_column, title):
    """Return a Plotly figure comparing actual and predicted validation prices.

    Both value arrays are flattened and drawn against `dates_val`: actuals as
    a blue line with markers, predictions as a red line. The label shown in
    the legend and y axis is `target_column` without its "avg_" prefix and
    "_price" suffix, capitalized.
    """
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    # One keyword set per trace, added in order: actuals first, then predictions.
    trace_specs = (
        dict(y=actuals_inv.flatten(), mode='lines+markers',
             name=f'Actual {target_label} ({VALIDATION_YEAR})',
             line=dict(color='blue'), marker=dict(size=4)),
        dict(y=preds_inv.flatten(), mode='lines',
             name=f'Predicted {target_label} ({VALIDATION_YEAR})',
             line=dict(color='red')),
    )

    fig = go.Figure()
    for spec in trace_specs:
        fig.add_trace(go.Scatter(x=dates_val, **spec))

    fig.update_layout(
        title=title,
        xaxis_title=f'Date ({VALIDATION_YEAR})',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend',
    )
    return fig

# --- KerasTuner Model Building Function ---
def build_lstm_model(hp):
    """Return a compiled LSTM regressor configured from KerasTuner's `hp`.

    Tuned values: LSTM units (32-128, step 32), dropout rate (0.1-0.5,
    step 0.1) and Adam learning rate (1e-2, 1e-3 or 1e-4). The model reads
    windows of shape (SEQUENCE_LENGTH, 1), outputs a single value and is
    trained with mean squared error.
    """
    units = hp.Int('units', min_value=32, max_value=128, step=32)
    dropout_rate = hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential([
        LSTM(units=units, activation='relu', input_shape=(SEQUENCE_LENGTH, 1)),
        Dropout(rate=dropout_rate),
        Dense(1),  # Output layer
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model


# --- Main Execution Block ---
print("--- LSTM Forecasting & Validation with KerasTuner ---")
print(f"--- (Nashik/Wheat: 2002-2023 Train, {VALIDATION_YEAR} Validate) ---")

# 1. Load Base Data (each loader call returns None on failure)
df_train_base = load_and_preprocess_base_data(
    DATA_PATH_TRAIN, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS,
    dataset_name="Training (2002-2023)",
)
df_val_base = load_and_preprocess_base_data(
    DATA_PATH_VALIDATION, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS,
    dataset_name=f"Validation ({VALIDATION_YEAR})",
)

# Both stay None until the corresponding step succeeds; later steps test them.
final_model = None
scaler = None

both_loaded = df_train_base is not None and df_val_base is not None
if both_loaded:
    # 2. Get Encoded Values for Filtering. On any failure both DataFrames are
    #    set to None, which skips every later step.
    try:
        selected_state_key = SELECTED_STATE_STR.strip().lower()
        selected_district_key = SELECTED_DISTRICT_STR.strip().lower()
        selected_commodity_key = SELECTED_COMMODITY_STR.strip().lower()

        encoded_state = state_name_encoding_map.get(selected_state_key)
        encoded_district = district_name_encoding_map.get(selected_district_key)
        encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)

        # Collect one message per selection that has no entry in its map.
        missing_messages = [
            message
            for encoded_value, message in (
                (encoded_state, f"Error: State '{SELECTED_STATE_STR}' missing map."),
                (encoded_district, f"Error: District '{SELECTED_DISTRICT_STR}' missing map."),
                (encoded_commodity, f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing map."),
            )
            if encoded_value is None
        ]
        lookup_failed = bool(missing_messages)

        if lookup_failed:
            print(*missing_messages, sep="\n")
            print("Check maps.")
            df_train_base = df_val_base = None
        else:
            print(f"\nSelected: {SELECTED_STATE_STR}/{SELECTED_DISTRICT_STR}/{SELECTED_COMMODITY_STR} -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")
    except Exception as e:
        print(f"Error mapping lookup: {e}")
        df_train_base = df_val_base = None

# Steps 3-8 of the pipeline: filter both datasets to the selected
# state/district/commodity, scale and window the target series, run the
# KerasTuner search, train the final model and evaluate/plot it on the
# validation data. Skipped entirely when loading or the encoding lookup failed.
if df_train_base is not None and df_val_base is not None:
    # 3. Filtering Data
    print(f"\nFiltering datasets using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']
    # Keep rows whose three encoded columns all equal the looked-up values, then
    # order them by date. If a filter column is missing, fall back to an empty
    # DataFrame so the emptiness checks below skip the rest.
    if not all(col in df_train_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Training."); filtered_df_train = pd.DataFrame()
    else: filtered_df_train = df_train_base[(df_train_base['state_name'] == encoded_state) & (df_train_base['district_name'] == encoded_district) & (df_train_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)
    if not all(col in df_val_base.columns for col in filter_cols_num): print("Error: Encoded filter cols missing Validation."); filtered_df_val = pd.DataFrame()
    else: filtered_df_val = df_val_base[(df_val_base['state_name'] == encoded_state) & (df_val_base['district_name'] == encoded_district) & (df_val_base['commodity_name'] == encoded_commodity)].copy(); filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df_train.empty: print("\nWarning: No training data after filtering.")
    if filtered_df_val.empty: print("\nWarning: No validation data after filtering.")

    # 4. Prepare Data for LSTM (Target: PRIMARY_TARGET)
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nPreparing data for LSTM (Target: {PRIMARY_TARGET})...")
        # Single-feature series reshaped to (n_rows, 1) as required by the scaler.
        train_series = filtered_df_train[PRIMARY_TARGET].values.reshape(-1, 1)
        val_series = filtered_df_val[PRIMARY_TARGET].values.reshape(-1, 1)
        val_dates = filtered_df_val[DATE_COLUMN].values

        # Scale data: the scaler is fitted on the training series only and then
        # applied unchanged to the validation series.
        scaler = MinMaxScaler(feature_range=(0, 1)) # Rebinds the module-level `scaler` initialised to None earlier
        scaled_train_data = scaler.fit_transform(train_series)
        scaled_val_data = scaler.transform(val_series)

        # Create sequences
        print(f"Creating sequences with length {SEQUENCE_LENGTH}...")
        X_train, y_train = create_sequences(scaled_train_data, SEQUENCE_LENGTH)
        X_val, y_val = create_sequences(scaled_val_data, SEQUENCE_LENGTH)
        # The first SEQUENCE_LENGTH validation rows only serve as input history,
        # so dates and unscaled actuals start at index SEQUENCE_LENGTH to line up
        # with y_val.
        dates_val_for_plotting = val_dates[SEQUENCE_LENGTH:]
        y_val_actual_unscaled = val_series[SEQUENCE_LENGTH:]

        if X_train.shape[0] == 0 or X_val.shape[0] == 0:
             print("Error: Not enough data to create sequences. Try shorter SEQUENCE_LENGTH or check filters.")
        else:
             print(f"Training sequences shape: X={X_train.shape}, y={y_train.shape}")
             print(f"Validation sequences shape: X={X_val.shape}, y={y_val.shape}")

             # --- 5. Hyperparameter Tuning with KerasTuner ---
             # NOTE(review): (X_val, y_val) is used for the tuner objective, for
             # early stopping in the final fit and for the metrics reported in
             # step 7, so those metrics are not from untouched data.
             print("\nStarting KerasTuner hyperparameter search...")
             tuner = kt.BayesianOptimization(
                 hypermodel=build_lstm_model,
                 objective=kt.Objective("val_loss", direction="min"), # Explicitly define objective
                 max_trials=KERAS_TUNER_MAX_TRIALS,
                 executions_per_trial=KERAS_TUNER_EXECUTIONS,
                 directory='keras_tuner_dir', # Saves logs/checkpoints here
                 project_name=KERAS_TUNER_PROJECT_NAME,
                 overwrite=True
             )

             # Define EarlyStopping for the search phase
             search_early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0)

             # Run the search
             print(f"Running KerasTuner search for {KERAS_TUNER_MAX_TRIALS} trials...")
             tuner.search(X_train, y_train,
                          epochs=SEARCH_EPOCHS, # Max epochs PER TRIAL
                          batch_size=BATCH_SIZE,
                          validation_data=(X_val, y_val),
                          callbacks=[search_early_stopping],
                          verbose=0 # Set to 1 for detailed trial logs
                         )

             print("\nHyperparameter search finished.")
             # Show summary of top trial(s)
             tuner.results_summary(num_trials=1)

             # Get the optimal hyperparameters
             try:
                best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
                print(f"""
The hyperparameter search is complete. The optimal hyperparameters found are:
- Units: {best_hps.get('units')}
- Dropout: {best_hps.get('dropout'):.2f}
- Learning Rate: {best_hps.get('learning_rate')}
                """)

                # --- 6. Build and Train FINAL Model with Best Hyperparameters ---
                print("\nBuilding and training final model with best hyperparameters...")
                final_model = tuner.hypermodel.build(best_hps) # Build a fresh (untrained) model with best HPs

                # Define EarlyStopping for the final training phase
                final_early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                                      restore_best_weights=True, verbose=1)

                # Train the final model
                history = final_model.fit(
                    X_train, y_train,
                    epochs=FINAL_EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_val, y_val),
                    callbacks=[final_early_stopping],
                    verbose=1
                )
                print("Final model training finished.")

             # Only the message is printed here (no traceback); resetting
             # final_model makes step 7 report that the model was not trained.
             except Exception as e:
                print(f"Error retrieving best HPs or building/training final model: {e}")
                final_model = None # Ensure model is None if this fails


    # --- 7. Evaluate FINAL Model on Validation Set ---
    # Check scaler and final_model exist, and validation data exists.
    # X_val is only assigned inside step 4, hence the name check before using it.
    if final_model is not None and scaler is not None and 'X_val' in locals() and X_val.shape[0] > 0:
        print(f"\n--- Evaluating FINAL Tuned LSTM Model on {VALIDATION_YEAR} Validation Data ---")
        print("Predicting...")
        # Ensure X_val has data before predicting
        # NOTE(review): this repeats a condition of the enclosing `if`, so the
        # matching `else` branch below cannot be reached.
        if X_val.shape[0] > 0:
            predictions_scaled = final_model.predict(X_val)
            print("Inverse transforming...")
            try:
                 # Predictions go back to price units; actuals were never scaled.
                 predictions_inv = scaler.inverse_transform(predictions_scaled)
                 actuals_inv = y_val_actual_unscaled

                 # Truncate all three arrays to the common length so they stay aligned.
                 min_len_eval = min(len(actuals_inv), len(predictions_inv))
                 if len(actuals_inv) != len(predictions_inv): print(f"Warn: Length mismatch final eval. Truncating to {min_len_eval}.")
                 actuals_inv = actuals_inv[:min_len_eval]
                 predictions_inv = predictions_inv[:min_len_eval]
                 dates_val_plot = dates_val_for_plotting[:min_len_eval]

                 # Calculate metrics
                 if len(actuals_inv) > 0:
                     r2_val, mae_val, mse_val = calculate_metrics(actuals_inv, predictions_inv)
                     print(f"FINAL Tuned Validation R-squared (R2): {r2_val:.4f}")
                     print(f"FINAL Tuned Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                     print(f"FINAL Tuned Validation Mean Squared Error (MSE): {mse_val:.2f}")

                     # --- 8. Plot Validation Results ---
                     print(f"\n--- Plotting FINAL Validation Results for {PRIMARY_TARGET} (Actual vs. Predicted {VALIDATION_YEAR}) ---")
                     plot_title_val = f'TUNED LSTM Validation (Nashik/Wheat): {PRIMARY_TARGET.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted {VALIDATION_YEAR})'
                     fig_val = plot_lstm_validation_results(dates_val_plot, actuals_inv, predictions_inv, PRIMARY_TARGET, plot_title_val)
                     fig_val.show()
                 else: print("Skipping metrics/plotting: No valid aligned data after inverse transform.")
            except Exception as e: print(f"Error during final prediction/scaling/eval: {e}"); traceback.print_exc()
        else:
            print("Skipping prediction and evaluation as X_val is empty.")

    elif final_model is None: print("\nSkipping final evaluation (Model not trained or failed).")
    else: print("\nCannot proceed: lack of data after filtering/sequencing or scaler missing.")
# Reached when loading failed or the encoding lookup cleared the DataFrames.
else: print("\nFailed: check data loading, preprocessing, or mapping lookup.")

print("\nProcess finished.")

--- LSTM Forecasting & Validation with KerasTuner ---
--- (Nashik/Wheat: 2002-2023 Train, 2024 Validate) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Loaded 6246 rows.
Constructing 'full_date'...
6246 rows after date construction.
6246 rows after ensuring price columns numeric.
Training (2002-2023) base data loaded. 6246 rows.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Loaded 278 rows.
Constructing 'full_date'...
278 rows after date construction.
278 rows after ensuring price columns numeric.
Validation (2024) base data loaded. 278 rows.

Selected: Maharashtra/Nashik/Wheat -> Encoded: St=6291, Di=6291, Co=629


Process finished.
