# Notebook cell In [2]:
# === OFFLINE TRAINING SCRIPT ===
# (Run this in your notebook/Python environment once)

# --- Standard Imports ---
import pandas as pd
import numpy as np
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import traceback
import warnings
import os
import joblib # <--- For saving scaler

# --- Keras / TensorFlow Imports ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam # Using TF Keras path


# Silence Python warnings and reduce TensorFlow's native log output.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is set after `import tensorflow` above;
# it may need to be set before that import to take effect - confirm.
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Seeding is currently disabled; enable both lines for repeatable runs.
# tf.random.set_seed(42)
# np.random.seed(42)

# --- Configuration ---
# Location of the training CSV.
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
# The validation CSV is not used by this training-only script.
# DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

# One model is trained per price column listed here.
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']

# Name of the datetime column that gets constructed, and the CSV columns
# holding its year / month / day components.
DATE_COLUMN = 'full_date'
YEAR_COL = 'year'
MONTH_COL = 'month'
DAY_COL = 'date'

# Human-readable filter selections.
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# Lookup tables: lower-cased name -> encoded value stored in the CSV.
state_name_encoding_map = {"maharashtra": 6291}
district_name_encoding_map = {"nashik": 6291}
commodity_name_encoding_map = {"wheat": 6291}

# LSTM hyperparameters (fixed values: tuned or reasonable defaults).
SEQUENCE_LENGTH = 60
LSTM_UNITS = 50
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 32

# Output directory for trained artifacts; created up front if missing.
MODEL_SAVE_DIR = "saved_lstm_models"
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# --- Helper Functions (remove_outliers_iqr, load_and_preprocess_base_data, create_sequences) ---
# (Copy these functions from the previous script - no changes needed in them)
def remove_outliers_iqr(df, columns_to_check):
    """Return a copy of ``df`` without rows flagged as IQR outliers.

    Only entries of ``columns_to_check`` that exist in ``df`` and have a
    numeric dtype are examined. A row is dropped when its value in any
    examined column is below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``
    for that column. If no column qualifies, an unfiltered copy is returned.

    Prints the number of removed rows when at least one row was removed.
    The input frame itself is not modified.
    """
    working = df.copy()
    rows_before = len(working)

    numeric_cols = []
    for name in columns_to_check:
        if name in working.columns and pd.api.types.is_numeric_dtype(working[name]):
            numeric_cols.append(name)
    if len(numeric_cols) == 0:
        return working

    values = working[numeric_cols]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = values < (lower_quartile - 1.5 * spread)
    too_high = values > (upper_quartile + 1.5 * spread)
    # Comparisons against NaN evaluate to False, so rows with missing
    # values are never flagged here and therefore stay in the result.
    is_outlier_row = (too_low | too_high).any(axis=1)
    working = working[~is_outlier_row]

    dropped = rows_before - len(working)
    if dropped > 0:
        print(f"Removed {dropped} rows via IQR.")
    return working

def load_and_preprocess_base_data(path, date_col_name, year_col, month_col, day_col, all_potential_targets, dataset_name="Training"):
    """Load a price CSV and return a cleaned frame sorted by date.

    Steps, in order:
      1. Read ``path`` with ``pd.read_csv``.
      2. Coerce the year/month/day columns to numeric and drop rows where any
         of them is missing or non-numeric.
      3. Build ``date_col_name`` from those components; rows that do not form
         a valid date are dropped.
      4. Keep only the date column, the target columns and the three filter
         columns ``state_name``, ``district_name`` and ``commodity_name``.
      5. Coerce the targets to numeric and drop rows where any target is
         missing.
      6. Require the filter columns to already be numeric, then sort by date.

    Args:
        path: Location of the CSV file.
        date_col_name: Name of the datetime column to create.
        year_col, month_col, day_col: Names of the date component columns.
        all_potential_targets: Sequence of price column names to keep.
        dataset_name: Label used only in progress and error messages.

    Returns:
        The cleaned DataFrame, or ``None`` when the file is missing, a
        required column is absent, a filter column is not numeric, or any
        other error occurs (the error is printed, not raised).
    """
    print("-" * 30)
    print(f"Processing {dataset_name} Dataset")
    print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Loaded {len(df)} rows.")

        date_components_cols = [year_col, month_col, day_col]
        missing_date_cols = [c for c in date_components_cols if c not in df.columns]
        if missing_date_cols:
            print(f"Error: Date component cols missing: {missing_date_cols}")
            return None
        for col in date_components_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=date_components_cols, inplace=True)

        print(f"Constructing '{date_col_name}'...")
        df[date_col_name] = pd.to_datetime(
            {'year': df[year_col], 'month': df[month_col], 'day': df[day_col]},
            errors='coerce',
        )
        initial_rows_date = len(df)
        df.dropna(subset=[date_col_name], inplace=True)
        if initial_rows_date > len(df):
            print(f"Dropped {initial_rows_date - len(df)} rows due to invalid date components.")
        print(f"{len(df)} rows after date construction.")

        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        # list(...) lets callers pass a tuple as well as a list.
        target_cols = list(all_potential_targets)
        keep_cols = [date_col_name] + target_cols + required_numeric_filter_cols
        missing_req_cols = [col for col in keep_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing: {missing_req_cols}")
            print(f"Available: {df.columns.tolist()}")
            return None

        # Take an explicit copy: the assignments and in-place operations below
        # must act on a frame we own, not on a selection derived from the
        # loaded data (avoids chained-assignment / SettingWithCopyWarning).
        df = df[keep_cols].copy()

        for col in target_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=target_cols, how='any', inplace=True)
        print(f"{len(df)} rows after ensuring price columns numeric.")

        # The filter columns are compared against encoded numbers later on,
        # so they must already be numeric in the file.
        for col in required_numeric_filter_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                print(f"Error: Col '{col}' expected numeric but isn't.")
                return None

        # df = remove_outliers_iqr(df, target_cols)  # Optional
        df.sort_values(date_col_name, inplace=True)
        print(f"{dataset_name} base data loaded. {len(df)} rows.")
        return df
    except FileNotFoundError:
        print(f"Error: {dataset_name} file not found at {path}")
        return None
    except Exception as e:
        # Best-effort loader: callers test for None instead of catching.
        print(f"Error loading/preprocessing {dataset_name}: {e}")
        traceback.print_exc()
        return None

def create_sequences(data, sequence_length):
    """Split ``data`` into sliding windows and the value following each one.

    Window ``k`` is ``data[k:k + sequence_length]`` and its target is
    ``data[k + sequence_length]``.

    Returns:
        ``(X, y)`` as NumPy arrays built from the windows and the targets.
        When ``data`` has no more than ``sequence_length`` items, two empty
        arrays are returned.
    """
    window_count = len(data) - sequence_length
    if window_count <= 0:
        return np.array([]), np.array([])

    windows = [data[start:start + sequence_length] for start in range(window_count)]
    targets = [data[start + sequence_length] for start in range(window_count)]
    return np.array(windows), np.array(targets)


# --- Main Training & Saving Logic ---
# Flow: load the training CSV -> translate the selected state/district/
# commodity into their encoded values -> filter the data -> for every target
# column, scale the series, build sequences, train one LSTM and save the
# model, the fitted scaler and the last scaled input window.
print("--- LSTM Offline Training & Saving ---")

# 1. Load Training Data Only (returns None on any loading problem)
df_train_base = load_and_preprocess_base_data(
    DATA_PATH_TRAIN, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL,
    TARGET_COLUMNS, "Training (2002-2023)",
)

if df_train_base is not None:
    # 2. Get Encoded Values
    try:
        # Map keys are lower-case, so normalise the selections first.
        selected_state_key = SELECTED_STATE_STR.strip().lower()
        selected_district_key = SELECTED_DISTRICT_STR.strip().lower()
        selected_commodity_key = SELECTED_COMMODITY_STR.strip().lower()

        encoded_state = state_name_encoding_map.get(selected_state_key)
        encoded_district = district_name_encoding_map.get(selected_district_key)
        encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)

        # Report every missing mapping, not just the first one.
        lookup_failed = False
        if encoded_state is None:
            print(f"Error: State '{SELECTED_STATE_STR}' missing map.")
            lookup_failed = True
        if encoded_district is None:
            print(f"Error: District '{SELECTED_DISTRICT_STR}' missing map.")
            lookup_failed = True
        if encoded_commodity is None:
            print(f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing map.")
            lookup_failed = True

        if lookup_failed:
            print("Check maps.")
            # Setting the frame to None routes execution to the failure branch below.
            df_train_base = None
        else:
            print(f"\nSelected: {SELECTED_STATE_STR}/{SELECTED_DISTRICT_STR}/{SELECTED_COMMODITY_STR} -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")

    except Exception as e:
        print(f"Error mapping lookup: {e}")
        df_train_base = None

if df_train_base is not None:
    # 3. Filtering Data
    print(f"\nFiltering dataset using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']
    if not all(col in df_train_base.columns for col in filter_cols_num):
        print("Error: Encoded filter cols missing Training.")
        filtered_df_train = pd.DataFrame()
    else:
        selection_mask = (
            (df_train_base['state_name'] == encoded_state)
            & (df_train_base['district_name'] == encoded_district)
            & (df_train_base['commodity_name'] == encoded_commodity)
        )
        filtered_df_train = df_train_base[selection_mask].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df_train.empty:
        print("\nError: No training data found after filtering. Cannot train models.")
    else:
        print(f"Filtered training data shape: {filtered_df_train.shape}")

        # Per-target outcome, used for an accurate final report.
        saved_targets = []
        skipped_targets = []
        failed_targets = []

        # --- Loop through targets to train and save ---
        for target in TARGET_COLUMNS:
            print("=" * 60)
            print(f"Processing and training for target: {target}")
            print("=" * 60)

            # 4. Prepare Data for LSTM: a single feature column, shape (n, 1)
            train_series = filtered_df_train[target].values.reshape(-1, 1)
            if len(train_series) <= SEQUENCE_LENGTH:
                print(f"Skipping {target}: Not enough data points ({len(train_series)}) after filtering to create sequences.")
                skipped_targets.append(target)
                continue

            # Scale data to [0, 1]; the fitted scaler is saved with the model
            # so the same transform can be reused at prediction time.
            scaler = MinMaxScaler(feature_range=(0, 1))
            scaled_train_data = scaler.fit_transform(train_series)

            # Create sequences
            X_train, y_train = create_sequences(scaled_train_data, SEQUENCE_LENGTH)

            if X_train.shape[0] == 0:
                print(f"Skipping {target}: Not enough data to create sequences ({len(scaled_train_data)} points).")
                skipped_targets.append(target)
                continue

            print(f"Training sequences shape for {target}: X={X_train.shape}, y={y_train.shape}")

            # --- 5. Build & Train LSTM Model ---
            print(f"\nBuilding LSTM model for {target}...")
            # Keras keeps global state for every model it builds. Release the
            # previous target's state so memory does not pile up across the
            # loop; that model has already been written to disk by this point.
            keras.backend.clear_session()
            model = Sequential()
            model.add(LSTM(LSTM_UNITS, activation='relu', input_shape=(SEQUENCE_LENGTH, 1)))
            model.add(Dropout(DROPOUT_RATE))
            model.add(Dense(1))
            optimizer = Adam(learning_rate=LEARNING_RATE)  # Use configured LR
            model.compile(optimizer=optimizer, loss='mean_squared_error')
            # model.summary() # Optional summary

            print(f"\nStarting LSTM model training for {target}...")
            # Training runs for the full EPOCHS with no validation split.
            # Early stopping is intentionally left disabled; to enable it,
            # create the callback and pass validation_split / callbacks:
            # early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
            history = model.fit(
                X_train, y_train,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                # validation_split=0.1,
                # callbacks=[early_stopping],
                verbose=1
            )
            print(f"Training finished for {target}.")

            # --- 6. Save Model, Scaler, Last Sequence ---
            print(f"\nSaving artifacts for {target}...")
            try:
                # Save Model
                model_path = os.path.join(MODEL_SAVE_DIR, f"lstm_model_{target}.h5")
                model.save(model_path)
                print(f"   Model saved to: {model_path}")

                # Save Scaler
                scaler_path = os.path.join(MODEL_SAVE_DIR, f"scaler_{target}.joblib")
                joblib.dump(scaler, scaler_path)
                print(f"   Scaler saved to: {scaler_path}")

                # Save Last Sequence (Needed to start predictions)
                last_sequence = scaled_train_data[-SEQUENCE_LENGTH:]
                sequence_path = os.path.join(MODEL_SAVE_DIR, f"last_sequence_{target}.npy")
                np.save(sequence_path, last_sequence)
                print(f"   Last sequence saved to: {sequence_path} (Shape: {last_sequence.shape})")

                saved_targets.append(target)

            except Exception as e:
                # A failed save must not stop the remaining targets, but it is
                # recorded so the final report does not claim success.
                print(f"Error saving artifacts for {target}: {e}")
                traceback.print_exc()
                failed_targets.append(target)

        print("=" * 60)
        if len(saved_targets) == len(TARGET_COLUMNS):
            print("Offline training and artifact saving complete.")
        else:
            # Only claim completion when every target produced its artifacts.
            print(
                "Offline training finished with problems. "
                f"Saved: {saved_targets}; "
                f"skipped (not enough data): {skipped_targets}; "
                f"save failed: {failed_targets}"
            )

else:
    print("\nFailed during data loading, preprocessing, or mapping lookup. Cannot train.")

print("\nProcess finished.")

# Recorded notebook output from the last run of this cell:
# AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)