In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# ============================
# 0️⃣ Load & prepare data (KEEPING YOUR EXACT WORKING VERSION)
# ============================
merged_train_df = pd.read_csv('merged_train_df.csv')
test_df = pd.read_csv("test_df.csv")

merged_train_df['date'] = pd.to_datetime(merged_train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

merged_train_df = merged_train_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)
test_df = test_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)

merged_train_df['store_nbr'] = merged_train_df['store_nbr'].astype(int)
test_df['store_nbr'] = test_df['store_nbr'].astype(int)

# ============================
# 1️⃣ Advanced data preprocessing with LOG TRANSFORM
# ============================
print(">>> Advanced data preprocessing with log transform...")

# Handle sales data quality
print(f"Train sales - Min: {merged_train_df['sales'].min():.2f}, Max: {merged_train_df['sales'].max():.2f}")
print(f"Negative sales: {(merged_train_df['sales'] < 0).sum()}, Zero sales: {(merged_train_df['sales'] == 0).sum()}")

# Clip negative values and add small epsilon for log
merged_train_df['sales'] = np.maximum(merged_train_df['sales'], 0.01)
merged_train_df['log_sales'] = np.log1p(merged_train_df['sales'])  # This will be our target

# Get date ranges
train_start = merged_train_df['date'].min()
train_end = merged_train_df['date'].max()
test_start = test_df['date'].min()
test_end = test_df['date'].max()

print(f">>> Train period: {train_start} to {train_end}")
print(f">>> Test period: {test_start} to {test_end}")
print(f">>> Log sales range: {merged_train_df['log_sales'].min():.3f} to {merged_train_df['log_sales'].max():.3f}")

# ============================
# 2️⃣ Simplified feature engineering (KEEPING YOUR EXACT VERSION)
# ============================
def create_time_features(df):
    """Create time-based features"""
    df = df.copy()
    
    # Basic time features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['week'] = df['date'].dt.isocalendar().week
    df['dayofyear'] = df['date'].dt.dayofyear
    
    # Advanced time features
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_monday'] = (df['dayofweek'] == 0).astype(int)
    df['is_friday'] = (df['dayofweek'] == 4).astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['date'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['date'].dt.is_quarter_end.astype(int)
    
    # Cyclical encoding
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
    
    return df

def add_other_features(df):
    """Add non-time features"""
    df = df.copy()
    
    # Store-family identifier
    df['store_family'] = df['store_nbr'].astype(str) + "_" + df['family'].astype(str)
    
    # Promotion features
    if 'onpromotion' in df.columns:
        df['onpromotion'] = df['onpromotion'].fillna(0).astype(int)
    else:
        df['onpromotion'] = 0
    
    # Holiday features
    if 'isHoliday' in df.columns:
        df['isHoliday'] = df['isHoliday'].fillna(0).astype(int)
        df['promo_holiday'] = df['onpromotion'] * df['isHoliday']
    else:
        df['isHoliday'] = 0
        df['promo_holiday'] = 0
    
    # Oil price features
    if 'oil_price' in df.columns:
        df['oil_price'] = df['oil_price'].fillna(method='ffill').fillna(method='bfill').fillna(100)
    else:
        df['oil_price'] = 100
    
    # Special event features
    for col in ['earthquake_impact', 'salary_day_impact']:
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0
    
    return df

def create_lag_features_safe(df_list):
    """Create lag features safely for list of dataframes (one per store-family) - WITH LOG TRANSFORM"""
    
    result_dfs = []
    
    for df_group in df_list:
        if len(df_group) == 0:
            continue
            
        df_group = df_group.copy().sort_values('date').reset_index(drop=True)
        
        # Use LOG SALES for lag features
        if 'log_sales' in df_group.columns:
            target_col = 'log_sales'
        else:
            target_col = 'sales'  # fallback for test data
        
        # Simple lag features
        df_group['sales_lag_1'] = df_group[target_col].shift(1).fillna(df_group[target_col].mean())
        df_group['sales_lag_7'] = df_group[target_col].shift(7).fillna(df_group[target_col].mean())
        df_group['sales_lag_14'] = df_group[target_col].shift(14).fillna(df_group[target_col].mean())
        
        # Simple rolling features - avoid groupby issues
        df_group['sales_roll_7_mean'] = df_group[target_col].rolling(7, min_periods=1).mean()
        df_group['sales_roll_14_mean'] = df_group[target_col].rolling(14, min_periods=1).mean()
        df_group['sales_roll_7_std'] = df_group[target_col].rolling(7, min_periods=1).std().fillna(0)
        
        # Simple trend feature
        if len(df_group) >= 7:
            df_group['sales_trend_7'] = (
                df_group[target_col].rolling(7, min_periods=1).mean() / 
                df_group[target_col].rolling(14, min_periods=1).mean().shift(7).fillna(df_group[target_col].mean())
            ).fillna(1.0)
        else:
            df_group['sales_trend_7'] = 1.0
        
        result_dfs.append(df_group)
    
    return result_dfs

# Apply basic feature engineering
print(">>> Creating time features...")
train_fe = create_time_features(merged_train_df)
test_fe = create_time_features(test_df)

print(">>> Adding other features...")
train_fe = add_other_features(train_fe)
test_fe = add_other_features(test_fe)

# Create lag features by processing each store-family group separately
print(">>> Creating lag features with log transform...")

# Split training data by store-family
train_groups = []
for (store, family), group in train_fe.groupby(['store_nbr', 'family']):
    train_groups.append(group)

# Process lag features
train_groups_with_lags = create_lag_features_safe(train_groups)

# Recombine training data
if train_groups_with_lags:
    train_fe = pd.concat(train_groups_with_lags, ignore_index=True)
    train_fe = train_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# For test data, create lag features using training data statistics (in log space)
print(">>> Creating lag features for test...")

test_with_lags = []
for (store, family), test_group in test_fe.groupby(['store_nbr', 'family']):
    test_group = test_group.copy().reset_index(drop=True)
    
    # Find corresponding training group
    train_group = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    
    if len(train_group) > 0:
        # Use last values from training for lag initialization (in log space)
        recent_log_sales = train_group['log_sales'].tail(14)
        
        # Initialize lag features
        if len(recent_log_sales) >= 1:
            test_group['sales_lag_1'] = recent_log_sales.iloc[-1]
        else:
            test_group['sales_lag_1'] = train_group['log_sales'].mean()
            
        if len(recent_log_sales) >= 7:
            test_group['sales_lag_7'] = recent_log_sales.iloc[-7]
        else:
            test_group['sales_lag_7'] = train_group['log_sales'].mean()
            
        if len(recent_log_sales) >= 14:
            test_group['sales_lag_14'] = recent_log_sales.iloc[-14]
        else:
            test_group['sales_lag_14'] = train_group['log_sales'].mean()
        
        # Rolling features - use recent statistics
        test_group['sales_roll_7_mean'] = recent_log_sales.tail(7).mean() if len(recent_log_sales) >= 7 else train_group['log_sales'].mean()
        test_group['sales_roll_14_mean'] = recent_log_sales.mean()
        test_group['sales_roll_7_std'] = recent_log_sales.tail(7).std() if len(recent_log_sales) >= 7 else train_group['log_sales'].std()
        
        if pd.isna(test_group['sales_roll_7_std'].iloc[0]):
            test_group['sales_roll_7_std'] = 0
            
        # Trend feature
        test_group['sales_trend_7'] = 1.0
        
    else:
        # No training data - use defaults (use log space defaults)
        default_log_sales = np.log1p(1.0)
        for col in ['sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_roll_7_mean', 'sales_roll_14_mean']:
            test_group[col] = default_log_sales
        test_group['sales_roll_7_std'] = 0.0
        test_group['sales_trend_7'] = 1.0
    
    test_with_lags.append(test_group)

if test_with_lags:
    test_fe = pd.concat(test_with_lags, ignore_index=True)
    test_fe = test_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# ============================
# 3️⃣ Add aggregate features (KEEPING YOUR EXACT VERSION - but with log sales)
# ============================
print(">>> Creating aggregate features (log space)...")

# Store level statistics from training data (using log_sales)
store_stats = train_fe.groupby('store_nbr')['log_sales'].agg(['mean', 'std', 'median']).reset_index()
store_stats.columns = ['store_nbr', 'store_sales_mean', 'store_sales_std', 'store_sales_median']
store_stats['store_sales_std'] = store_stats['store_sales_std'].fillna(0)

# Family level statistics
family_stats = train_fe.groupby('family')['log_sales'].agg(['mean', 'std', 'median']).reset_index()
family_stats.columns = ['family', 'family_sales_mean', 'family_sales_std', 'family_sales_median']
family_stats['family_sales_std'] = family_stats['family_sales_std'].fillna(0)

# Add to training data
train_fe = train_fe.merge(store_stats, on='store_nbr', how='left')
train_fe = train_fe.merge(family_stats, on='family', how='left')

# Add to test data
test_fe = test_fe.merge(store_stats, on='store_nbr', how='left')
test_fe = test_fe.merge(family_stats, on='family', how='left')

# Fill missing aggregate features
agg_cols = ['store_sales_mean', 'store_sales_std', 'store_sales_median', 
            'family_sales_mean', 'family_sales_std', 'family_sales_median']

for col in agg_cols:
    overall_median = train_fe[col].median()
    train_fe[col] = train_fe[col].fillna(overall_median)
    test_fe[col] = test_fe[col].fillna(overall_median)

# ============================
# 4️⃣ Encode categoricals (KEEPING YOUR EXACT VERSION)
# ============================
cat_cols = ['family']
if all(col in train_fe.columns and col in test_fe.columns for col in ['city', 'state', 'type']):
    cat_cols.extend(['city', 'state', 'type'])

for col in cat_cols:
    le = LabelEncoder()
    combined_values = pd.concat([train_fe[col].astype(str), test_fe[col].astype(str)]).unique()
    le.fit(combined_values)
    train_fe[f'{col}_encoded'] = le.transform(train_fe[col].astype(str))
    test_fe[f'{col}_encoded'] = le.transform(test_fe[col].astype(str))

# ============================
# 5️⃣ Feature selection (KEEPING YOUR EXACT VERSION)
# ============================
base_features = [
    'year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'dayofyear',
    'is_weekend', 'is_monday', 'is_friday', 'is_month_start', 'is_month_end',
    'is_quarter_start', 'is_quarter_end',
    'month_sin', 'month_cos', 'dayofweek_sin', 'dayofweek_cos', 'day_sin', 'day_cos',
    'onpromotion', 'isHoliday', 'promo_holiday', 'oil_price', 'earthquake_impact', 'salary_day_impact'
]

lag_features = ['sales_lag_1', 'sales_lag_7', 'sales_lag_14', 
                'sales_roll_7_mean', 'sales_roll_14_mean', 'sales_roll_7_std', 'sales_trend_7']

aggregate_features = ['store_sales_mean', 'store_sales_std', 'store_sales_median',
                     'family_sales_mean', 'family_sales_std', 'family_sales_median']

encoded_features = [f'{col}_encoded' for col in cat_cols]

all_features = base_features + lag_features + aggregate_features + encoded_features
available_features = [col for col in all_features if col in train_fe.columns and col in test_fe.columns]

print(f">>> Using {len(available_features)} features")

# ============================
# 6️⃣ Random Forest training with REDUCED OVERFITTING + LOG TARGET
# ============================
print(">>> Training Random Forest model (log space, reduced overfitting)...")

# Clean training data
train_clean = train_fe.dropna(subset=lag_features[:3])  # Only require basic lags
print(f">>> Clean training samples: {len(train_clean)}")

if len(train_clean) > 500:
    # Prepare data
    X_train = train_clean[available_features].fillna(0)
    y_train = train_clean['log_sales']  # TARGET IS NOW LOG_SALES!
    X_test = test_fe[available_features].fillna(0)
    
    # Train Random Forest - REDUCED OVERFITTING PARAMETERS
    rf_model = RandomForestRegressor(
        n_estimators=150,      # Increased from 100 for better generalization
        max_depth=10,          # Reduced from 12 to prevent overfitting
        min_samples_split=20,  # Increased from 10 to prevent overfitting
        min_samples_leaf=10,   # Increased from 5 to prevent overfitting
        max_features='sqrt',   # Added to reduce overfitting
        bootstrap=True,        # Ensure bootstrap sampling
        max_samples=0.8,       # Use 80% of data per tree
        random_state=42,
        n_jobs=-1
    )
    
    rf_model.fit(X_train, y_train)
    
    # Make predictions (in log space)
    rf_log_predictions = rf_model.predict(X_test)
    
    # Convert back to original space
    rf_predictions = np.expm1(rf_log_predictions)  # Inverse of log1p
    rf_predictions = np.maximum(rf_predictions, 0.01)
    
    print(">>> Random Forest training completed (with log transform)")
    
    # Feature importance
    importance_df = pd.DataFrame({
        'feature': available_features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(">>> Top 10 RF features:")
    print(importance_df.head(10).to_string(index=False))
    
else:
    rf_predictions = None
    rf_log_predictions = None
    print(">>> Insufficient training data for Random Forest")

# ============================
# 7️⃣ ARIMA Model - ENHANCED WITH LOG TRANSFORM
# ============================
print(">>> Adding ARIMA component (enhanced, log space)...")

def fit_enhanced_arima(train_data, forecast_periods, max_data_points=120):
    """Enhanced ARIMA model for time series patterns (log space)"""
    
    if len(train_data) < 28:
        return None
    
    try:
        # Use recent data and work in log space
        recent_data = train_data.sort_values('date').tail(max_data_points)
        log_sales_series = recent_data['log_sales'].values
        
        # Simple parameter selection with more configurations
        best_model = None
        best_aic = float('inf')
        
        # Try more ARIMA configurations
        configs = [
            (1,1,1), (2,1,1), (1,1,2), (0,1,1), (1,1,0), (2,1,2),
            (3,1,1), (1,1,3), (2,1,0), (0,1,2), (1,0,1), (2,0,2)  # Additional configs
        ]
        
        for p, d, q in configs:
            try:
                model = ARIMA(log_sales_series, order=(p, d, q))
                fitted_model = model.fit()
                
                if fitted_model.aic < best_aic:
                    best_aic = fitted_model.aic
                    best_model = fitted_model
            except:
                continue
        
        if best_model is not None:
            log_forecast = best_model.forecast(steps=forecast_periods)
            # Convert back to original space
            forecast = np.expm1(log_forecast)
            forecast = np.maximum(forecast, 0.01)
            return forecast
        else:
            return None
            
    except:
        return None

# Select MORE store-family combinations for ARIMA (increased impact)
print(">>> Selecting combinations for ARIMA (increased coverage)...")
arima_combinations = []

for (store, family), group in train_fe.groupby(['store_nbr', 'family']):
    if len(group) >= 42:  # Reduced threshold from 50 to get more combinations
        volume = group['sales'].sum()
        data_points = len(group)
        score = volume * np.log(data_points)  # Simple scoring
        arima_combinations.append((store, family, score))

# Sort and take top 200 combinations (increased from 100)
arima_combinations.sort(key=lambda x: x[2], reverse=True)
top_arima_combinations = [x[:2] for x in arima_combinations[:200]]

print(f">>> Processing ARIMA for {len(top_arima_combinations)} combinations...")

arima_results = []
for i, (store, family) in enumerate(top_arima_combinations):
    if i % 40 == 0:
        print(f">>> ARIMA progress: {i+1}/{len(top_arima_combinations)}")
        
    train_subset = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    test_subset = test_fe[(test_fe['store_nbr'] == store) & (test_fe['family'] == family)]
    
    if len(test_subset) == 0:
        continue
        
    arima_pred = fit_enhanced_arima(train_subset, len(test_subset))
    
    if arima_pred is not None:
        result_df = test_subset[['id']].copy()
        result_df['sales_arima'] = arima_pred
        arima_results.append(result_df)

# Combine ARIMA results
if arima_results:
    arima_df = pd.concat(arima_results, ignore_index=True)
    print(f">>> ARIMA predictions generated for {len(arima_df)} test samples")
else:
    arima_df = pd.DataFrame(columns=['id', 'sales_arima'])

# ============================
# 8️⃣ Store-family level predictions (KEEPING YOUR EXACT VERSION but with log awareness)
# ============================
print(">>> Creating store-family level predictions...")

store_family_preds = []
store_family_combinations = set(
    zip(train_fe['store_nbr'], train_fe['family'])
).intersection(set(zip(test_fe['store_nbr'], test_fe['family'])))

for i, (store_num, family) in enumerate(sorted(store_family_combinations)):
    if i % 300 == 0:
        print(f">>> Processing {i+1}/{len(store_family_combinations)}")
    
    train_subset = train_fe[
        (train_fe['store_nbr'] == store_num) & (train_fe['family'] == family)
    ].sort_values('date').tail(100)  # Use recent data
    
    test_subset = test_fe[
        (test_fe['store_nbr'] == store_num) & (test_fe['family'] == family)
    ].sort_values('date')
    
    if len(train_subset) < 10 or len(test_subset) == 0:
        continue
    
    try:
        recent_sales = train_subset['sales'].values
        
        # Advanced prediction combining multiple signals
        base_pred = np.median(recent_sales[-21:]) if len(recent_sales) >= 21 else np.median(recent_sales)
        
        # Trend (last month vs previous month)
        if len(recent_sales) >= 28:
            recent_avg = np.mean(recent_sales[-14:])
            prev_avg = np.mean(recent_sales[-28:-14])
            trend_factor = recent_avg / (prev_avg + 0.01)
            trend_factor = np.clip(trend_factor, 0.8, 1.3)
        else:
            trend_factor = 1.0
        
        # Seasonality (day of week)
        seasonal_factors = []
        train_df_subset = train_subset.copy()
        
        for _, row in test_subset.iterrows():
            dow = row['dayofweek']
            dow_sales = train_df_subset[train_df_subset['dayofweek'] == dow]['sales']
            
            if len(dow_sales) >= 3:
                dow_factor = dow_sales.mean() / (train_df_subset['sales'].mean() + 0.01)
                seasonal_factors.append(np.clip(dow_factor, 0.7, 1.5))
            else:
                seasonal_factors.append(1.0)
        
        # Promotion effect
        promo_factors = []
        if 'onpromotion' in test_subset.columns:
            # Calculate promotion lift from training data
            promo_sales = train_df_subset[train_df_subset['onpromotion'] == 1]['sales']
            normal_sales = train_df_subset[train_df_subset['onpromotion'] == 0]['sales']
            
            if len(promo_sales) > 0 and len(normal_sales) > 0:
                promo_lift = promo_sales.mean() / (normal_sales.mean() + 0.01)
                promo_lift = np.clip(promo_lift, 1.0, 1.8)
            else:
                promo_lift = 1.15  # Default 15% lift
            
            for _, row in test_subset.iterrows():
                if row['onpromotion'] == 1:
                    promo_factors.append(promo_lift)
                else:
                    promo_factors.append(1.0)
        else:
            promo_factors = [1.0] * len(test_subset)
        
        # Combine all factors
        predictions_array = (
            base_pred * trend_factor * 
            np.array(seasonal_factors) * 
            np.array(promo_factors)
        )
        
        predictions_array = np.maximum(predictions_array, 0.01)
        
    except:
        predictions_array = np.full(len(test_subset), max(np.median(recent_sales), 0.01))
    
    # Store results
    output_df = test_subset[['id']].copy()
    output_df['sales'] = predictions_array
    store_family_preds.append(output_df)

# ============================
# 9️⃣ Final RF+ARIMA ensemble - INCREASED ARIMA IMPACT
# ============================
print(">>> Creating RF+ARIMA ensemble (increased ARIMA impact)...")

if store_family_preds:
    sf_pred_df = pd.concat(store_family_preds, ignore_index=True)
    
    if rf_predictions is not None:
        # Create RF prediction dataframe
        rf_pred_df = test_fe[['id']].copy()
        rf_pred_df['sales'] = rf_predictions
        
        # Merge all predictions
        ensemble_df = sf_pred_df.merge(rf_pred_df, on='id', how='outer', suffixes=('_sf', '_rf'))
        
        # Add ARIMA if available
        if len(arima_df) > 0:
            ensemble_df = ensemble_df.merge(arima_df[['id', 'sales_arima']], on='id', how='left')
        else:
            ensemble_df['sales_arima'] = np.nan
        
        # Fill missing values
        ensemble_df['sales_sf'] = ensemble_df['sales_sf'].fillna(ensemble_df['sales_rf'])
        ensemble_df['sales_rf'] = ensemble_df['sales_rf'].fillna(ensemble_df['sales_sf'])
        
        # RF+SF ensemble - slightly more conservative
        ensemble_df['sales'] = ensemble_df['sales_rf'] * 0.55 + ensemble_df['sales_sf'] * 0.45  # More balanced
        
        # Add ARIMA component where available - INCREASED IMPACT
        arima_mask = ~pd.isna(ensemble_df['sales_arima'])
        if arima_mask.sum() > 0:
            # Increased ARIMA weight: 50% base ensemble + 50% ARIMA (was 70%/30%)
            ensemble_df.loc[arima_mask, 'sales'] = (
                ensemble_df.loc[arima_mask, 'sales'] * 0.5 + 
                ensemble_df.loc[arima_mask, 'sales_arima'] * 0.5
            )
        
        final_submission = ensemble_df[['id', 'sales']].copy()
        
    else:
        final_submission = sf_pred_df.copy()
    
    # Handle missing predictions
    all_test_ids = set(test_fe['id'])
    predicted_ids = set(final_submission['id'])
    missing_ids = all_test_ids - predicted_ids
    
    if missing_ids:
        print(f">>> Filling {len(missing_ids)} missing predictions...")
        median_sales = train_fe['sales'].median()
        missing_df = pd.DataFrame({'id': list(missing_ids), 'sales': [median_sales] * len(missing_ids)})
        final_submission = pd.concat([final_submission, missing_df], ignore_index=True)
    
    # Final cleanup
    final_submission['sales'] = final_submission['sales'].fillna(train_fe['sales'].median())
    final_submission['sales'] = np.maximum(final_submission['sales'], 0.01)
    final_submission = final_submission.sort_values('id').reset_index(drop=True)
    
    # Save
    final_submission.to_csv("enhanced_rf_arima_log_submission.csv", index=False)
    
    print(f"\n>>> ENHANCED RF+ARIMA ENSEMBLE RESULTS (LOG TRANSFORM):")
    print(f">>> Submission shape: {final_submission.shape}")
    print(f">>> Sales range: {final_submission['sales'].min():.3f} - {final_submission['sales'].max():.3f}")
    print(f">>> Sales median: {final_submission['sales'].median():.3f}")
    print(f">>> Random Forest coverage: 100% (trained on log_sales)")
    print(f">>> ARIMA coverage: {len(arima_df) / len(final_submission) * 100:.1f}%")
    print(f">>> Store-family coverage: {len(sf_pred_df) / len(final_submission) * 100:.1f}%")
    
    print(f"\n>>> ENHANCED ENSEMBLE COMPOSITION:")
    arima_coverage = len(arima_df) / len(final_submission) if len(final_submission) > 0 else 0
    print(f">>> For {arima_coverage*100:.1f}% of predictions: 50% (RF+SF) + 50% ARIMA (INCREASED)")
    print(f">>> For remaining predictions: 55% RF + 45% Store-Family")
    print(f">>> Key improvements:")
    print(f"    - Random Forest trained on log-transformed sales (better for skewed data)")
    print(f"    - Reduced overfitting: max_depth=10, min_samples_split=20, min_samples_leaf=10")
    print(f"    - Added max_features='sqrt' and max_samples=0.8 for regularization")
    print(f"    - ARIMA coverage increased from 100 to 200 top combinations")
    print(f"    - ARIMA weight increased from 30% to 50% where available")
    print(f"    - All lag features computed in log space for consistency")
    print(f"    - Enhanced ARIMA with more parameter configurations")
    
    print(">>> Submission saved as 'enhanced_rf_arima_log_submission.csv'")
    print(">>> Expected improvements:")
    print(">>>   1. Better handling of sales distribution via log transform")
    print(">>>   2. Reduced overfitting in Random Forest")
    print(">>>   3. Stronger time series patterns from enhanced ARIMA")
    print(">>>   4. More robust ensemble with increased ARIMA influence")
    
else:
    print(">>> ERROR: No predictions generated!")

# ============================
# 🔟 Additional diagnostic outputs
# ============================
if rf_predictions is not None and len(arima_df) > 0:
    print(f"\n>>> MODEL DIAGNOSTICS:")
    print(f">>> Training log_sales stats: mean={train_fe['log_sales'].mean():.3f}, std={train_fe['log_sales'].std():.3f}")
    
    # Check prediction quality
    rf_pred_stats = f"RF predictions: mean={np.mean(rf_predictions):.2f}, median={np.median(rf_predictions):.2f}"
    arima_pred_stats = f"ARIMA predictions: mean={arima_df['sales_arima'].mean():.2f}, median={arima_df['sales_arima'].median():.2f}"
    
    print(f">>> {rf_pred_stats}")
    print(f">>> {arima_pred_stats}")
    
    # Check for extreme predictions
    extreme_high = (final_submission['sales'] > final_submission['sales'].quantile(0.99)).sum()
    extreme_low = (final_submission['sales'] < 0.1).sum()
    print(f">>> Extreme predictions: {extreme_high} high outliers, {extreme_low} very low")
    
print(">>> Enhanced pipeline complete!")

>>> Enhanced data preprocessing with validation...
Train sales - Min: 0.00, Max: 124717.00
Negative sales: 0, Zero sales: 939130
>>> Train period: 2013-01-01 00:00:00 to 2017-07-31 00:00:00
>>> Validation period: 2017-08-01 00:00:00 to 2017-08-15 00:00:00
>>> Test period: 2017-08-16 00:00:00 to 2017-08-31 00:00:00
>>> Creating advanced features...
>>> Creating enhanced lag features...
>>> Creating lag features for validation...
>>> Creating lag features for test...
>>> Creating aggregate features...
>>> Using 70 features
>>> Train samples: 2974158
>>> Validation samples: 26730
>>> Test samples: 28512
>>> Training Random Forest...
>>> Random Forest validation RMSLE: 0.5066
>>> Training XGBoost...
>>> XGBoost validation RMSLE: 0.6172
>>> Training Neural Network...
>>> Neural Network validation RMSLE: 0.6511
>>> Enhanced ARIMA component...
>>> Selecting combinations for enhanced time series...
>>> Processing time series for 300 combinations...
>>> Time series progress: 1/300


In [5]:
merged_train_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
sales                       float64
onpromotion                   int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
isHoliday                     int64
earthquake_impact             int64
salary_day_impact             int64
transactions                float64
dtype: object

In [6]:
test_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
onpromotion                   int64
earthquake_impact             int64
salary_day_impact             int64
isHoliday                     int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
transactions                float64
dtype: object

In [10]:
submission = pd.read_csv("my_submission_clean.csv")

# Check for NaN
print("NaN values per column:\n", submission.isna().sum())

# Check for inf
print("Infinite values per column:\n", np.isinf(submission).sum())

NaN values per column:
 id       0
sales    0
dtype: int64
Infinite values per column:
 id       0
sales    0
dtype: int64


In [9]:
import numpy as np
import pandas as pd

submission = pd.read_csv("my_submission_clean.csv")

# Find rows where sales is infinite
mask = np.isinf(submission["sales"])

# Show the offending rows
print(submission.loc[mask])


Empty DataFrame
Columns: [id, sales]
Index: []


In [8]:
import pandas as pd
import numpy as np

submission = pd.read_csv("my_submission.csv")

# Replace inf with NaN first (so ffill works)
submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward fill
submission["sales"].fillna(method="ffill", inplace=True)

# (Optional) if the very first row is NaN/inf, ffill won’t work — so backfill as fallback
submission["sales"].fillna(method="bfill", inplace=True)

# Save cleaned file
submission.to_csv("my_submission_clean.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].fillna(method="ffill", inplace=True)
  submission["sales"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This in

In [6]:
import pandas as pd
import numpy as np

# Load your files
test_df = pd.read_csv("test_df.csv")
submission = pd.read_csv("my_submission.csv")

# Check the range of IDs
print("▶️ Test set ID range:", test_df["id"].min(), "to", test_df["id"].max())
print("▶️ Submission ID range:", submission["id"].min(), "to", submission["id"].max())

# Also check for IDs in submission but not in test
extra_ids = set(submission["id"]) - set(test_df["id"])
if extra_ids:
    print("⚠️ IDs present in submission but not in test:", list(extra_ids)[:10], "...")
else:
    print("✅ All submission IDs are inside test_df.")


▶️ Test set ID range: 3000888 to 3029399
▶️ Submission ID range: 3000888 to 3029399
✅ All submission IDs are inside test_df.
