In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings('ignore')

# ============================
# 0️⃣ Load & prepare data
# ============================
merged_train_df = pd.read_csv('merged_train_df.csv')
test_df = pd.read_csv("test_df.csv")

merged_train_df['date'] = pd.to_datetime(merged_train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

merged_train_df = merged_train_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)
test_df = test_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)

merged_train_df['store_nbr'] = merged_train_df['store_nbr'].astype(int)
test_df['store_nbr'] = test_df['store_nbr'].astype(int)

# Handle sales quality
merged_train_df['sales'] = np.maximum(merged_train_df['sales'], 0.01)
merged_train_df['log_sales'] = np.log1p(merged_train_df['sales'])

print(f">>> Train period: {merged_train_df['date'].min()} to {merged_train_df['date'].max()}")
print(f">>> Test period: {test_df['date'].min()} to {test_df['date'].max()}")

# ============================
# 1️⃣ Smart Feature Engineering (Computationally Efficient)
# ============================
def create_smart_features(df):
    """Create high-impact features with minimal computation"""
    df = df.copy()
    
    # Essential time features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['dayofweek'] = df['date'].dt.dayofweek
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['week'] = df['date'].dt.isocalendar().week
    
    # High-impact binary features
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_payday'] = ((df['day'] == 15) | (df['day'] >= 28)).astype(int)
    
    # Critical cyclical encoding (top performers)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    # Days since start (trend)
    start_date = pd.Timestamp('2013-01-01')
    df['days_since_start'] = (df['date'] - start_date).dt.days
    
    return df

def add_oil_and_promo_features(df):
    """Add oil price and promotion features - key for Ecuador"""
    df = df.copy()
    
    # Oil price handling (critical for Ecuador retail)
    if 'oil_price' in df.columns:
        df['oil_price'] = df['oil_price'].fillna(method='ffill').fillna(method='bfill').fillna(100)
    else:
        df['oil_price'] = 100
    
    # Oil price transformations (most predictive)
    df['oil_price_log'] = np.log1p(df['oil_price'])
    oil_mean = df['oil_price'].mean()
    df['oil_price_normalized'] = df['oil_price'] / oil_mean
    df['oil_price_high'] = (df['oil_price'] > oil_mean * 1.1).astype(int)
    df['oil_price_low'] = (df['oil_price'] < oil_mean * 0.9).astype(int)
    
    # Promotion features
    if 'onpromotion' in df.columns:
        df['onpromotion'] = df['onpromotion'].fillna(0).astype(int)
    else:
        df['onpromotion'] = 0
        
    # Holiday features
    if 'isHoliday' in df.columns:
        df['isHoliday'] = df['isHoliday'].fillna(0).astype(int)
        df['promo_holiday'] = df['onpromotion'] * df['isHoliday']
    else:
        df['isHoliday'] = 0
        df['promo_holiday'] = 0
    
    # Special events
    for col in ['earthquake_impact', 'salary_day_impact']:
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0
    
    return df

def create_efficient_lag_features(df_groups):
    """Create essential lag features efficiently"""
    result_dfs = []
    
    for df_group in df_groups:
        if len(df_group) == 0:
            continue
            
        df_group = df_group.copy().sort_values('date').reset_index(drop=True)
        
        # Only most important lags (based on autocorrelation analysis)
        df_group['sales_lag_1'] = df_group['sales'].shift(1)
        df_group['sales_lag_7'] = df_group['sales'].shift(7)
        df_group['sales_lag_14'] = df_group['sales'].shift(14)
        
        # Oil price lags (crucial for Ecuador)
        df_group['oil_lag_7'] = df_group['oil_price'].shift(7)
        df_group['oil_lag_14'] = df_group['oil_price'].shift(14)
        
        # Essential rolling features
        df_group['sales_ma_7'] = df_group['sales'].rolling(7, min_periods=1).mean()
        df_group['sales_ma_14'] = df_group['sales'].rolling(14, min_periods=1).mean()
        df_group['sales_ma_28'] = df_group['sales'].rolling(28, min_periods=1).mean()
        
        # Volatility (key predictor)
        df_group['sales_std_14'] = df_group['sales'].rolling(14, min_periods=1).std().fillna(0)
        
        # Simple trend
        df_group['trend_7_14'] = (df_group['sales_ma_7'] / (df_group['sales_ma_14'] + 0.01)).fillna(1.0)
        df_group['trend_7_14'] = np.clip(df_group['trend_7_14'], 0.5, 2.0)
        
        # Fill missing lags with moving averages
        df_group['sales_lag_1'] = df_group['sales_lag_1'].fillna(df_group['sales_ma_7'])
        df_group['sales_lag_7'] = df_group['sales_lag_7'].fillna(df_group['sales_ma_14'])
        df_group['sales_lag_14'] = df_group['sales_lag_14'].fillna(df_group['sales_ma_28'])
        df_group['oil_lag_7'] = df_group['oil_lag_7'].fillna(df_group['oil_price'])
        df_group['oil_lag_14'] = df_group['oil_lag_14'].fillna(df_group['oil_price'])
        
        result_dfs.append(df_group)
    
    return result_dfs

# Apply feature engineering
print(">>> Creating smart features...")
train_fe = create_smart_features(merged_train_df)
test_fe = create_smart_features(test_df)

print(">>> Adding oil & promotion features...")
train_fe = add_oil_and_promo_features(train_fe)
test_fe = add_oil_and_promo_features(test_fe)

print(">>> Creating lag features...")
# Process by store-family groups
train_groups = [group for _, group in train_fe.groupby(['store_nbr', 'family'])]
train_groups_with_lags = create_efficient_lag_features(train_groups)
train_fe = pd.concat(train_groups_with_lags, ignore_index=True)
train_fe = train_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# Handle test lag features
test_with_lags = []
for (store, family), test_group in test_fe.groupby(['store_nbr', 'family']):
    test_group = test_group.copy().reset_index(drop=True)
    
    train_group = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    
    if len(train_group) > 0:
        # Use recent training values for initialization
        recent_sales = train_group['sales'].tail(28)
        recent_oil = train_group['oil_price'].tail(28)
        
        # Initialize lags
        test_group['sales_lag_1'] = recent_sales.iloc[-1] if len(recent_sales) >= 1 else recent_sales.mean()
        test_group['sales_lag_7'] = recent_sales.iloc[-7] if len(recent_sales) >= 7 else recent_sales.mean()
        test_group['sales_lag_14'] = recent_sales.iloc[-14] if len(recent_sales) >= 14 else recent_sales.mean()
        test_group['oil_lag_7'] = recent_oil.iloc[-7] if len(recent_oil) >= 7 else recent_oil.mean()
        test_group['oil_lag_14'] = recent_oil.iloc[-14] if len(recent_oil) >= 14 else recent_oil.mean()
        
        # Rolling averages
        test_group['sales_ma_7'] = recent_sales.tail(7).mean()
        test_group['sales_ma_14'] = recent_sales.tail(14).mean()
        test_group['sales_ma_28'] = recent_sales.mean()
        test_group['sales_std_14'] = recent_sales.tail(14).std() if len(recent_sales) >= 14 else 0
        test_group['trend_7_14'] = test_group['sales_ma_7'] / (test_group['sales_ma_14'] + 0.01)
        
    else:
        # Default values
        for col in ['sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_ma_7', 'sales_ma_14', 'sales_ma_28']:
            test_group[col] = 1.0
        test_group['oil_lag_7'] = test_group['oil_lag_14'] = 100.0
        test_group['sales_std_14'] = 0.0
        test_group['trend_7_14'] = 1.0
    
    test_with_lags.append(test_group)

test_fe = pd.concat(test_with_lags, ignore_index=True)
test_fe = test_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# ============================
# 2️⃣ Aggregate Features (Efficient)
# ============================
print(">>> Creating aggregate features...")

# Store-level statistics (key predictors)
store_stats = train_fe.groupby('store_nbr')['sales'].agg(['mean', 'std']).reset_index()
store_stats.columns = ['store_nbr', 'store_sales_mean', 'store_sales_std']
store_stats['store_sales_std'] = store_stats['store_sales_std'].fillna(0)

# Family-level statistics
family_stats = train_fe.groupby('family')['sales'].agg(['mean', 'std']).reset_index()
family_stats.columns = ['family', 'family_sales_mean', 'family_sales_std']
family_stats['family_sales_std'] = family_stats['family_sales_std'].fillna(0)

# Add to datasets
train_fe = train_fe.merge(store_stats, on='store_nbr', how='left')
train_fe = train_fe.merge(family_stats, on='family', how='left')
test_fe = test_fe.merge(store_stats, on='store_nbr', how='left')
test_fe = test_fe.merge(family_stats, on='family', how='left')

# Fill missing aggregates
for col in ['store_sales_mean', 'store_sales_std', 'family_sales_mean', 'family_sales_std']:
    overall_val = train_fe[col].median()
    train_fe[col] = train_fe[col].fillna(overall_val)
    test_fe[col] = test_fe[col].fillna(overall_val)

# ============================
# 3️⃣ Encode Categoricals
# ============================
le_family = LabelEncoder()
combined_families = pd.concat([train_fe['family'], test_fe['family']]).unique()
le_family.fit(combined_families)
train_fe['family_encoded'] = le_family.transform(train_fe['family'])
test_fe['family_encoded'] = le_family.transform(test_fe['family'])

# ============================
# 4️⃣ Random Forest Model
# ============================
print(">>> Training Random Forest...")

# Feature selection (most important based on previous analysis)
rf_features = [
    # Time features
    'year', 'month', 'dayofweek', 'day', 'quarter', 'week',
    'is_weekend', 'is_month_end', 'is_month_start', 'is_payday',
    'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'days_since_start',
    
    # Oil features (crucial)
    'oil_price', 'oil_price_log', 'oil_price_normalized', 'oil_price_high', 'oil_price_low',
    'oil_lag_7', 'oil_lag_14',
    
    # Sales features
    'sales_lag_1', 'sales_lag_7', 'sales_lag_14',
    'sales_ma_7', 'sales_ma_14', 'sales_ma_28', 'sales_std_14', 'trend_7_14',
    
    # Promotion & Events
    'onpromotion', 'isHoliday', 'promo_holiday', 'earthquake_impact', 'salary_day_impact',
    
    # Store & Category
    'store_nbr', 'family_encoded',
    'store_sales_mean', 'store_sales_std', 'family_sales_mean', 'family_sales_std'
]

# Filter available features
available_rf_features = [col for col in rf_features if col in train_fe.columns and col in test_fe.columns]
print(f">>> Using {len(available_rf_features)} features for Random Forest")

# Clean data
train_clean = train_fe.dropna(subset=['sales_lag_1', 'sales_lag_7'])
X_train_rf = train_clean[available_rf_features].fillna(0)
y_train_rf = train_clean['sales']
X_test_rf = test_fe[available_rf_features].fillna(0)

# Train Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_rf, y_train_rf)
rf_predictions = np.maximum(rf_model.predict(X_test_rf), 0.01)

print(">>> Random Forest completed")

# Feature importance
importance_df = pd.DataFrame({
    'feature': available_rf_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(">>> Top 10 Random Forest features:")
print(importance_df.head(10).to_string(index=False))

# ============================
# 5️⃣ ARIMA with Exogenous Variables (Efficient Implementation)
# ============================
print(">>> Running ARIMA with exogenous variables...")

def fit_arima_with_exog(train_data, test_data, store_num, family_name):
    """Fit ARIMA model with exogenous variables efficiently"""
    
    if len(train_data) < 30:  # Need sufficient data
        return None
    
    try:
        # Prepare data
        train_sorted = train_data.sort_values('date').tail(84)  # Last ~3 months
        
        # Endogenous variable (sales) 
        y = train_sorted['sales'].values
        
        # Exogenous variables (most important ones)
        exog_cols = ['oil_price_normalized', 'onpromotion', 'isHoliday', 'is_weekend', 
                    'month_sin', 'month_cos', 'dow_sin', 'dow_cos']
        
        available_exog_cols = [col for col in exog_cols if col in train_sorted.columns]
        
        if len(available_exog_cols) == 0:
            return None
            
        X_train = train_sorted[available_exog_cols].values
        X_test = test_data[available_exog_cols].values
        
        # Simple ARIMA with exogenous variables
        # Use SARIMAX for better handling of exogenous variables
        model = SARIMAX(y, exog=X_train, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0),
                       enforce_stationarity=False, enforce_invertibility=False)
        
        fitted_model = model.fit(disp=False, maxiter=50)
        
        # Forecast
        forecast = fitted_model.forecast(steps=len(test_data), exog=X_test)
        forecast = np.maximum(forecast, 0.01)  # Ensure positive
        
        return forecast
        
    except Exception as e:
        return None

# Run ARIMA for store-family combinations (sample to manage computation)
arima_predictions = {}
store_family_combinations = list(set(zip(train_fe['store_nbr'], train_fe['family'])))

# Process top combinations by volume first
combination_volumes = []
for store, family in store_family_combinations:
    vol = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]['sales'].sum()
    combination_volumes.append((store, family, vol))

# Sort by volume and take top combinations
combination_volumes.sort(key=lambda x: x[2], reverse=True)
top_combinations = [x[:2] for x in combination_volumes[:200]]  # Top 200 for efficiency

print(f">>> Processing ARIMA for {len(top_combinations)} top store-family combinations...")

arima_results = []
for i, (store, family) in enumerate(top_combinations):
    if i % 50 == 0:
        print(f">>> ARIMA progress: {i}/{len(top_combinations)}")
        
    train_subset = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    test_subset = test_fe[(test_fe['store_nbr'] == store) & (test_fe['family'] == family)]
    
    if len(test_subset) == 0:
        continue
        
    arima_pred = fit_arima_with_exog(train_subset, test_subset, store, family)
    
    if arima_pred is not None:
        result_df = test_subset[['id']].copy()
        result_df['sales_arima'] = arima_pred
        arima_results.append(result_df)

# Combine ARIMA results
if arima_results:
    arima_df = pd.concat(arima_results, ignore_index=True)
    print(f">>> ARIMA predictions generated for {len(arima_df)} test samples")
else:
    arima_df = pd.DataFrame(columns=['id', 'sales_arima'])

# ============================
# 6️⃣ Enhanced Store-Family Baseline
# ============================
print(">>> Creating enhanced store-family predictions...")

sf_predictions = []
all_combinations = set(zip(test_fe['store_nbr'], test_fe['family']))

for i, (store, family) in enumerate(sorted(all_combinations)):
    if i % 200 == 0:
        print(f">>> Store-family progress: {i}/{len(all_combinations)}")
        
    train_subset = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    test_subset = test_fe[(test_fe['store_nbr'] == store) & (test_fe['family'] == family)]
    
    if len(train_subset) < 5 or len(test_subset) == 0:
        continue
        
    # Enhanced prediction logic
    recent_data = train_subset.sort_values('date').tail(42)  # Last 6 weeks
    
    predictions = []
    for _, test_row in test_subset.iterrows():
        # Base prediction
        base_pred = recent_data['sales'].median()
        
        # Adjust for day of week seasonality
        dow_sales = recent_data[recent_data['dayofweek'] == test_row['dayofweek']]['sales']
        if len(dow_sales) >= 2:
            dow_factor = dow_sales.mean() / (recent_data['sales'].mean() + 0.01)
            dow_factor = np.clip(dow_factor, 0.7, 1.4)
        else:
            dow_factor = 1.0
            
        # Adjust for promotion
        if test_row.get('onpromotion', 0) == 1:
            promo_sales = recent_data[recent_data['onpromotion'] == 1]['sales']
            normal_sales = recent_data[recent_data['onpromotion'] == 0]['sales']
            if len(promo_sales) > 0 and len(normal_sales) > 0:
                promo_factor = promo_sales.mean() / (normal_sales.mean() + 0.01)
                promo_factor = np.clip(promo_factor, 1.0, 1.6)
            else:
                promo_factor = 1.1
        else:
            promo_factor = 1.0
            
        # Oil price effect
        recent_oil = recent_data['oil_price'].mean()
        current_oil = test_row.get('oil_price', recent_oil)
        oil_factor = 1.0 - 0.1 * (current_oil - recent_oil) / (recent_oil + 0.01)  # Inverse relationship
        oil_factor = np.clip(oil_factor, 0.8, 1.2)
        
        # Combine factors
        final_pred = base_pred * dow_factor * promo_factor * oil_factor
        predictions.append(max(final_pred, 0.01))
    
    # Store results
    result_df = test_subset[['id']].copy()
    result_df['sales_sf'] = predictions
    sf_predictions.append(result_df)

sf_df = pd.concat(sf_predictions, ignore_index=True) if sf_predictions else pd.DataFrame(columns=['id', 'sales_sf'])

# ============================
# 7️⃣ Ensemble Predictions
# ============================
print(">>> Creating ensemble predictions...")

# Start with Random Forest predictions
final_df = test_fe[['id']].copy()
final_df['sales_rf'] = rf_predictions

# Add ARIMA predictions where available
if len(arima_df) > 0:
    final_df = final_df.merge(arima_df[['id', 'sales_arima']], on='id', how='left')
else:
    final_df['sales_arima'] = np.nan

# Add store-family predictions
if len(sf_df) > 0:
    final_df = final_df.merge(sf_df[['id', 'sales_sf']], on='id', how='left')
else:
    final_df['sales_sf'] = np.nan

# Create ensemble with adaptive weights
def create_ensemble(row):
    """Create ensemble prediction with adaptive weighting"""
    rf_pred = row['sales_rf']
    arima_pred = row['sales_arima']
    sf_pred = row['sales_sf']
    
    predictions = []
    weights = []
    
    # Random Forest (always available)
    predictions.append(rf_pred)
    weights.append(0.5)  # Base weight
    
    # ARIMA (when available)
    if not pd.isna(arima_pred):
        predictions.append(arima_pred)
        weights.append(0.3)
        
    # Store-family (when available)  
    if not pd.isna(sf_pred):
        predictions.append(sf_pred)
        weights.append(0.2)
    
    # Weighted average
    total_weight = sum(weights)
    ensemble_pred = sum(p * w for p, w in zip(predictions, weights)) / total_weight
    
    return max(ensemble_pred, 0.01)

final_df['sales'] = final_df.apply(create_ensemble, axis=1)

# Final validation and cleanup
final_df = final_df[['id', 'sales']].sort_values('id').reset_index(drop=True)

# Ensure all test IDs are covered
all_test_ids = set(test_fe['id'])
predicted_ids = set(final_df['id'])
missing_ids = all_test_ids - predicted_ids

if missing_ids:
    print(f">>> Adding {len(missing_ids)} missing predictions...")
    median_sales = train_fe['sales'].median()
    missing_df = pd.DataFrame({'id': list(missing_ids), 'sales': [median_sales] * len(missing_ids)})
    final_df = pd.concat([final_df, missing_df], ignore_index=True)
    final_df = final_df.sort_values('id').reset_index(drop=True)

# Final statistics
final_df['sales'] = final_df['sales'].fillna(train_fe['sales'].median())
final_df['sales'] = np.maximum(final_df['sales'], 0.01)

# Save submission
final_df.to_csv("ensemble_submission_rf_arima.csv", index=False)

print(f"\n>>> ENSEMBLE RESULTS:")
print(f">>> Submission shape: {final_df.shape}")
print(f">>> Sales range: {final_df['sales'].min():.3f} - {final_df['sales'].max():.3f}")
print(f">>> Sales median: {final_df['sales'].median():.3f}")
print(f">>> Random Forest coverage: 100%")
print(f">>> ARIMA coverage: {len(arima_df) / len(final_df) * 100:.1f}%")
print(f">>> Store-family coverage: {len(sf_df) / len(final_df) * 100:.1f}%")
print(">>> Submission saved as 'ensemble_submission_rf_arima.csv'")

>>> Train period: 2013-01-01 00:00:00 to 2017-08-15 00:00:00
>>> Test period: 2017-08-16 00:00:00 to 2017-08-31 00:00:00
>>> Creating smart features...
>>> Adding oil & promotion features...
>>> Creating lag features...
>>> Creating aggregate features...
>>> Training Random Forest...
>>> Using 41 features for Random Forest
>>> Random Forest completed
>>> Top 10 Random Forest features:
          feature  importance
       sales_ma_7    0.225452
      sales_ma_14    0.172364
      sales_lag_7    0.169633
      sales_ma_28    0.093207
      sales_lag_1    0.093170
     sales_lag_14    0.087709
     sales_std_14    0.053643
 family_sales_std    0.018108
family_sales_mean    0.013506
      onpromotion    0.011840
>>> Running ARIMA with exogenous variables...
>>> Processing ARIMA for 200 top store-family combinations...
>>> ARIMA progress: 0/200
>>> ARIMA progress: 50/200
>>> ARIMA progress: 100/200
>>> ARIMA progress: 150/200
>>> ARIMA predictions generated for 3200 test samples
>>> Creatin

In [5]:
merged_train_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
sales                       float64
onpromotion                   int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
isHoliday                     int64
earthquake_impact             int64
salary_day_impact             int64
transactions                float64
dtype: object

In [6]:
test_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
onpromotion                   int64
earthquake_impact             int64
salary_day_impact             int64
isHoliday                     int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
transactions                float64
dtype: object

In [10]:
submission = pd.read_csv("my_submission_clean.csv")

# Check for NaN
print("NaN values per column:\n", submission.isna().sum())

# Check for inf
print("Infinite values per column:\n", np.isinf(submission).sum())

NaN values per column:
 id       0
sales    0
dtype: int64
Infinite values per column:
 id       0
sales    0
dtype: int64


In [9]:
import numpy as np
import pandas as pd

submission = pd.read_csv("my_submission_clean.csv")

# Find rows where sales is infinite
mask = np.isinf(submission["sales"])

# Show the offending rows
print(submission.loc[mask])


Empty DataFrame
Columns: [id, sales]
Index: []


In [8]:
import pandas as pd
import numpy as np

submission = pd.read_csv("my_submission.csv")

# Replace inf with NaN first (so ffill works)
submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward fill
submission["sales"].fillna(method="ffill", inplace=True)

# (Optional) if the very first row is NaN/inf, ffill won’t work — so backfill as fallback
submission["sales"].fillna(method="bfill", inplace=True)

# Save cleaned file
submission.to_csv("my_submission_clean.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].fillna(method="ffill", inplace=True)
  submission["sales"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This in

In [6]:
import pandas as pd
import numpy as np

# Load your files
test_df = pd.read_csv("test_df.csv")
submission = pd.read_csv("my_submission.csv")

# Check the range of IDs
print("▶️ Test set ID range:", test_df["id"].min(), "to", test_df["id"].max())
print("▶️ Submission ID range:", submission["id"].min(), "to", submission["id"].max())

# Also check for IDs in submission but not in test
extra_ids = set(submission["id"]) - set(test_df["id"])
if extra_ids:
    print("⚠️ IDs present in submission but not in test:", list(extra_ids)[:10], "...")
else:
    print("✅ All submission IDs are inside test_df.")


▶️ Test set ID range: 3000888 to 3029399
▶️ Submission ID range: 3000888 to 3029399
✅ All submission IDs are inside test_df.
