In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# ============================
# 0️⃣ Load & prepare data
# ============================
merged_train_df = pd.read_csv('merged_train_df.csv')
test_df = pd.read_csv("test_df.csv")

merged_train_df['date'] = pd.to_datetime(merged_train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

merged_train_df = merged_train_df.sort_values("date").reset_index(drop=True)
test_df = test_df.sort_values("date").reset_index(drop=True)

merged_train_df['store_nbr'] = merged_train_df['store_nbr'].astype(int)
test_df['store_nbr'] = test_df['store_nbr'].astype(int)

# ============================
# CRITICAL FIX: Check for negative or zero sales
# ============================
print(">>> Checking sales data quality:")
print(f"Train sales - Min: {merged_train_df['sales'].min():.2f}, Max: {merged_train_df['sales'].max():.2f}")
print(f"Negative sales count: {(merged_train_df['sales'] < 0).sum()}")
print(f"Zero sales count: {(merged_train_df['sales'] == 0).sum()}")

# Remove or handle negative sales (critical for RMSLE)
if (merged_train_df['sales'] < 0).any():
    print(">>> WARNING: Found negative sales values - replacing with 0")
    merged_train_df['sales'] = merged_train_df['sales'].clip(lower=0)

# Add small constant to avoid log(0) in RMSLE calculation
merged_train_df['sales'] = merged_train_df['sales'] + 0.001

# ============================
# 1️⃣ Simplified approach - use store-family level modeling
# ============================
def create_time_features(df):
    """Add time-based features"""
    df = df.copy()
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    return df

def create_lag_features(group_df, target_col='sales', lags=[1, 7, 14]):
    """Create lag features"""
    df = group_df.copy().sort_values('date')
    
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = df[target_col].shift(lag)
    
    # Rolling means
    for window in [7, 14, 30]:
        if len(df) > window:
            df[f'{target_col}_rolling_mean_{window}'] = df[target_col].rolling(window, min_periods=1).mean()
        else:
            df[f'{target_col}_rolling_mean_{window}'] = df[target_col].mean()
    
    return df

print(">>> Creating time features...")
train_fe = create_time_features(merged_train_df)
test_fe = create_time_features(test_df)

# ============================
# 2️⃣ Create store-family combinations and lag features
# ============================
print(">>> Creating lag features per store-family...")

train_with_lags = []
for (store, family), group in train_fe.groupby(['store_nbr', 'family']):
    group_with_lags = create_lag_features(group)
    train_with_lags.append(group_with_lags)

train_fe = pd.concat(train_with_lags, ignore_index=True).sort_values(['store_nbr', 'family', 'date'])

# For test set, we need to be careful with lag features
# Use last known values from training data
print(">>> Creating lag features for test set...")

test_with_lags = []
for (store, family), group in test_fe.groupby(['store_nbr', 'family']):
    # Get corresponding training data for this store-family
    train_group = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    
    if len(train_group) == 0:
        # No training data for this combination - use simple approach
        group_with_lags = create_time_features(group.copy())
        # Fill lag features with zeros or means
        for lag in [1, 7, 14]:
            group_with_lags[f'sales_lag_{lag}'] = 0
        for window in [7, 14, 30]:
            group_with_lags[f'sales_rolling_mean_{window}'] = 0
        test_with_lags.append(group_with_lags)
        continue
    
    # Use last values from training for initial lags
    last_sales = train_group['sales'].tail(14).values  # Get last 14 values
    
    group_with_lags = group.copy()
    
    # Initialize lag features with last known values
    for lag in [1, 7, 14]:
        if lag <= len(last_sales):
            initial_value = last_sales[-lag]
        else:
            initial_value = train_group['sales'].mean()
        group_with_lags[f'sales_lag_{lag}'] = initial_value
    
    # Initialize rolling means
    for window in [7, 14, 30]:
        if len(last_sales) >= window:
            initial_value = last_sales[-window:].mean()
        else:
            initial_value = train_group['sales'].mean()
        group_with_lags[f'sales_rolling_mean_{window}'] = initial_value
    
    test_with_lags.append(group_with_lags)

test_fe = pd.concat(test_with_lags, ignore_index=True).sort_values(['store_nbr', 'family', 'date'])

# ============================
# 3️⃣ Feature engineering
# ============================
def feature_engineering_simple(df):
    df = df.copy()
    
    # Basic interaction features
    df['promo_holiday'] = df['onpromotion'] * df['isHoliday'] if 'onpromotion' in df.columns and 'isHoliday' in df.columns else 0
    
    # Oil price features (if available)
    if 'oil_price' in df.columns:
        df['oil_price'] = df['oil_price'].fillna(df['oil_price'].mean())
        df['oil_price_diff'] = df.groupby('store_nbr')['oil_price'].diff().fillna(0)
    else:
        df['oil_price'] = 0
        df['oil_price_diff'] = 0
    
    return df

train_fe = feature_engineering_simple(train_fe)
test_fe = feature_engineering_simple(test_fe)

# ============================
# 4️⃣ Encode categoricals
# ============================
cat_cols = ['family', 'city', 'state', 'type'] if all(col in train_fe.columns for col in ['city', 'state', 'type']) else ['family']

for col in cat_cols:
    if col in train_fe.columns and col in test_fe.columns:
        le = LabelEncoder()
        combined_values = pd.concat([train_fe[col].astype(str), test_fe[col].astype(str)]).unique()
        le.fit(combined_values)
        train_fe[col + '_encoded'] = le.transform(train_fe[col].astype(str))
        test_fe[col + '_encoded'] = le.transform(test_fe[col].astype(str))

# ============================
# 5️⃣ Simplified modeling approach
# ============================
feature_cols = ['onpromotion', 'year', 'month', 'day', 'dayofweek', 'quarter', 
                'is_weekend', 'is_month_start', 'is_month_end', 'promo_holiday',
                'oil_price', 'oil_price_diff']

# Add lag features
lag_cols = [col for col in train_fe.columns if 'sales_lag_' in col or 'sales_rolling_mean_' in col]
feature_cols.extend(lag_cols)

# Add encoded categorical features
encoded_cols = [col for col in train_fe.columns if col.endswith('_encoded')]
feature_cols.extend(encoded_cols)

# Keep only available features
feature_cols = [col for col in feature_cols if col in train_fe.columns and col in test_fe.columns]

print(f">>> Using {len(feature_cols)} features: {feature_cols}")

# ============================
# 6️⃣ Train models per store-family combination
# ============================
predictions = []

store_family_combinations = set(
    zip(train_fe['store_nbr'], train_fe['family'])
).intersection(set(zip(test_fe['store_nbr'], test_fe['family'])))

print(f">>> Training models for {len(store_family_combinations)} store-family combinations...")

for i, (store_num, family) in enumerate(sorted(store_family_combinations)):
    if i % 100 == 0:
        print(f">>> Processing combination {i+1}/{len(store_family_combinations)}")
    
    # Get data for this store-family combination
    train_mask = (train_fe['store_nbr'] == store_num) & (train_fe['family'] == family)
    test_mask = (test_fe['store_nbr'] == store_num) & (test_fe['family'] == family)
    
    train_subset = train_fe[train_mask].sort_values('date')
    test_subset = test_fe[test_mask].sort_values('date')
    
    if len(train_subset) < 5 or len(test_subset) == 0:
        continue
    
    y_train = train_subset['sales']
    
    # Prepare features
    X_train = train_subset[feature_cols].fillna(0)
    X_test = test_subset[feature_cols].fillna(0)
    
    # Simple approach: use median/mean as baseline with trend adjustment
    try:
        # Calculate recent trend
        recent_sales = y_train.tail(min(14, len(y_train)))
        median_sales = recent_sales.median()
        
        # Simple trend calculation
        if len(recent_sales) > 7:
            early_avg = recent_sales.head(7).mean()
            late_avg = recent_sales.tail(7).mean()
            trend_factor = late_avg / early_avg if early_avg > 0 else 1.0
            trend_factor = np.clip(trend_factor, 0.5, 2.0)  # Limit extreme trends
        else:
            trend_factor = 1.0
        
        # Adjust for promotions if available
        if 'onpromotion' in test_subset.columns:
            promo_boost = test_subset['onpromotion'].mean() * 0.1 + 1.0  # 10% boost for promotions
        else:
            promo_boost = 1.0
        
        # Generate predictions
        base_prediction = median_sales * trend_factor * promo_boost
        
        # Add some seasonality based on day of week
        if 'dayofweek' in test_subset.columns:
            dow_factors = []
            for dow in test_subset['dayofweek']:
                train_dow = train_subset[train_subset['dayofweek'] == dow]['sales']
                if len(train_dow) > 0:
                    dow_factor = train_dow.mean() / y_train.mean() if y_train.mean() > 0 else 1.0
                    dow_factors.append(dow_factor)
                else:
                    dow_factors.append(1.0)
            
            predictions_array = base_prediction * np.array(dow_factors)
        else:
            predictions_array = np.full(len(test_subset), base_prediction)
        
        # Ensure predictions are positive
        predictions_array = np.maximum(predictions_array, 0.001)
        
    except Exception as e:
        # Fallback: use simple median
        predictions_array = np.full(len(test_subset), max(y_train.median(), 0.001))
    
    # Create output
    output_df = test_subset[['id']].copy()
    output_df['sales'] = predictions_array
    predictions.append(output_df)

# ============================
# 7️⃣ Create final submission
# ============================
if predictions:
    final_preds = pd.concat(predictions, ignore_index=True)
    
    # Ensure all test IDs are covered
    missing_ids = set(test_fe['id']) - set(final_preds['id'])
    if missing_ids:
        print(f">>> Found {len(missing_ids)} missing predictions, filling with median...")
        overall_median = train_fe['sales'].median()
        
        missing_df = pd.DataFrame({
            'id': list(missing_ids),
            'sales': [overall_median] * len(missing_ids)
        })
        final_preds = pd.concat([final_preds, missing_df], ignore_index=True)
    
    # Final cleanup
    submission_df = final_preds.copy()
    submission_df['sales'] = submission_df['sales'].fillna(1.0)  # Fill any remaining NaN
    submission_df['sales'] = np.maximum(submission_df['sales'], 0.001)  # Ensure positive
    
    # Sort by ID for submission
    submission_df = submission_df.sort_values('id').reset_index(drop=True)
    
    print(f">>> Final submission shape: {submission_df.shape}")
    print(f">>> Sales statistics - Min: {submission_df['sales'].min():.3f}, Max: {submission_df['sales'].max():.3f}, Median: {submission_df['sales'].median():.3f}")
    
    submission_df.to_csv("improved_submission.csv", index=False)
    print(">>> Improved submission saved to 'improved_submission.csv'")
    
else:
    print(">>> ERROR: No predictions generated!")

# ============================
# 8️⃣ Additional debugging
# ============================
print("\n>>> DEBUGGING INFO:")
print(f">>> Total store-family combinations in train: {train_fe.groupby(['store_nbr', 'family']).ngroups}")
print(f">>> Total store-family combinations in test: {test_fe.groupby(['store_nbr', 'family']).ngroups}")
print(f">>> Common combinations: {len(store_family_combinations)}")

if predictions:
    print(f">>> Predictions generated for: {len(predictions)} combinations")
    print(f">>> Total prediction rows: {len(final_preds)}")
    print(f">>> Expected test rows: {len(test_fe)}")

print(">>> Feature columns used:", feature_cols)

>>> Checking sales data quality:
Train sales - Min: 0.00, Max: 124717.00
Negative sales count: 0
Zero sales count: 939130
>>> Creating time features...
>>> Creating lag features per store-family...


KeyboardInterrupt: 

In [5]:
merged_train_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
sales                       float64
onpromotion                   int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
isHoliday                     int64
earthquake_impact             int64
salary_day_impact             int64
transactions                float64
dtype: object

In [6]:
test_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
onpromotion                   int64
earthquake_impact             int64
salary_day_impact             int64
isHoliday                     int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
transactions                float64
dtype: object

In [10]:
submission = pd.read_csv("my_submission_clean.csv")

# Check for NaN
print("NaN values per column:\n", submission.isna().sum())

# Check for inf
print("Infinite values per column:\n", np.isinf(submission).sum())

NaN values per column:
 id       0
sales    0
dtype: int64
Infinite values per column:
 id       0
sales    0
dtype: int64


In [9]:
import numpy as np
import pandas as pd

submission = pd.read_csv("my_submission_clean.csv")

# Find rows where sales is infinite
mask = np.isinf(submission["sales"])

# Show the offending rows
print(submission.loc[mask])


Empty DataFrame
Columns: [id, sales]
Index: []


In [8]:
import pandas as pd
import numpy as np

submission = pd.read_csv("my_submission.csv")

# Replace inf with NaN first (so ffill works)
submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward fill
submission["sales"].fillna(method="ffill", inplace=True)

# (Optional) if the very first row is NaN/inf, ffill won’t work — so backfill as fallback
submission["sales"].fillna(method="bfill", inplace=True)

# Save cleaned file
submission.to_csv("my_submission_clean.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].fillna(method="ffill", inplace=True)
  submission["sales"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This in

In [6]:
import pandas as pd
import numpy as np

# Load your files
test_df = pd.read_csv("test_df.csv")
submission = pd.read_csv("my_submission.csv")

# Check the range of IDs
print("▶️ Test set ID range:", test_df["id"].min(), "to", test_df["id"].max())
print("▶️ Submission ID range:", submission["id"].min(), "to", submission["id"].max())

# Also check for IDs in submission but not in test
extra_ids = set(submission["id"]) - set(test_df["id"])
if extra_ids:
    print("⚠️ IDs present in submission but not in test:", list(extra_ids)[:10], "...")
else:
    print("✅ All submission IDs are inside test_df.")


▶️ Test set ID range: 3000888 to 3029399
▶️ Submission ID range: 3000888 to 3029399
✅ All submission IDs are inside test_df.
