In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# ============================
# 0️⃣ Load & prepare data
# ============================
merged_train_df = pd.read_csv('merged_train_df.csv')
test_df = pd.read_csv("test_df.csv")

merged_train_df['date'] = pd.to_datetime(merged_train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

merged_train_df = merged_train_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)
test_df = test_df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)

merged_train_df['store_nbr'] = merged_train_df['store_nbr'].astype(int)
test_df['store_nbr'] = test_df['store_nbr'].astype(int)

# Handle sales
merged_train_df['sales'] = np.maximum(merged_train_df['sales'], 0.01)

print(f">>> Train period: {merged_train_df['date'].min()} to {merged_train_df['date'].max()}")
print(f">>> Test period: {test_df['date'].min()} to {test_df['date'].max()}")

# ============================
# 1️⃣ Focused Feature Engineering
# ============================
def create_core_features(df):
    """Create essential features only"""
    df = df.copy()
    
    # Core time features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['dayofweek'] = df['date'].dt.dayofweek
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    
    # Key binary features
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_payday'] = ((df['day'] == 15) | (df['day'] >= 28)).astype(int)
    
    # Essential cyclical features
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    # Trend
    start_date = pd.Timestamp('2013-01-01')
    df['days_since_start'] = (df['date'] - start_date).dt.days
    
    return df

def add_external_features(df):
    """Add oil, promotion, and holiday features"""
    df = df.copy()
    
    # Oil price
    if 'oil_price' in df.columns:
        df['oil_price'] = df['oil_price'].fillna(method='ffill').fillna(method='bfill').fillna(100)
    else:
        df['oil_price'] = 100
    
    # Oil price features
    df['oil_price_norm'] = df['oil_price'] / df['oil_price'].mean()
    df['oil_price_ma_7'] = df['oil_price'].rolling(7, min_periods=1).mean()
    
    # Promotion
    if 'onpromotion' in df.columns:
        df['onpromotion'] = df['onpromotion'].fillna(0).astype(int)
    else:
        df['onpromotion'] = 0
        
    # Holiday
    if 'isHoliday' in df.columns:
        df['isHoliday'] = df['isHoliday'].fillna(0).astype(int)
    else:
        df['isHoliday'] = 0
    
    # Key interactions
    df['promo_weekend'] = df['onpromotion'] * df['is_weekend']
    df['promo_holiday'] = df['onpromotion'] * df['isHoliday']
    
    # Handle special events if they exist
    for col in ['earthquake_impact', 'salary_day_impact']:
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0
    
    return df

def create_lag_features(df_groups):
    """Create essential lag and rolling features"""
    result_dfs = []
    
    for df_group in df_groups:
        if len(df_group) == 0:
            continue
            
        df_group = df_group.copy().sort_values('date').reset_index(drop=True)
        
        # Key lag features
        df_group['sales_lag_1'] = df_group['sales'].shift(1)
        df_group['sales_lag_7'] = df_group['sales'].shift(7)
        df_group['sales_lag_14'] = df_group['sales'].shift(14)
        df_group['sales_lag_28'] = df_group['sales'].shift(28)
        
        # Rolling averages
        df_group['sales_ma_7'] = df_group['sales'].rolling(7, min_periods=1).mean()
        df_group['sales_ma_14'] = df_group['sales'].rolling(14, min_periods=1).mean()
        df_group['sales_ma_28'] = df_group['sales'].rolling(28, min_periods=1).mean()
        
        # Rolling std
        df_group['sales_std_7'] = df_group['sales'].rolling(7, min_periods=1).std().fillna(0)
        df_group['sales_std_14'] = df_group['sales'].rolling(14, min_periods=1).std().fillna(0)
        
        # Trend indicators
        df_group['sales_trend_7_14'] = (df_group['sales_ma_7'] / (df_group['sales_ma_14'] + 0.01)).fillna(1.0)
        df_group['sales_trend_14_28'] = (df_group['sales_ma_14'] / (df_group['sales_ma_28'] + 0.01)).fillna(1.0)
        
        # Oil lags
        df_group['oil_lag_7'] = df_group['oil_price'].shift(7)
        df_group['oil_lag_14'] = df_group['oil_price'].shift(14)
        
        # Promotion lags
        df_group['promo_lag_7'] = df_group['onpromotion'].shift(7)
        
        # Fill missing lags
        df_group['sales_lag_1'] = df_group['sales_lag_1'].fillna(df_group['sales_ma_7'])
        df_group['sales_lag_7'] = df_group['sales_lag_7'].fillna(df_group['sales_ma_14'])
        df_group['sales_lag_14'] = df_group['sales_lag_14'].fillna(df_group['sales_ma_28'])
        df_group['sales_lag_28'] = df_group['sales_lag_28'].fillna(df_group['sales_ma_28'])
        
        df_group['oil_lag_7'] = df_group['oil_lag_7'].fillna(df_group['oil_price'])
        df_group['oil_lag_14'] = df_group['oil_lag_14'].fillna(df_group['oil_price'])
        df_group['promo_lag_7'] = df_group['promo_lag_7'].fillna(0)
        
        # Clip trend features
        df_group['sales_trend_7_14'] = np.clip(df_group['sales_trend_7_14'], 0.5, 2.0)
        df_group['sales_trend_14_28'] = np.clip(df_group['sales_trend_14_28'], 0.5, 2.0)
        
        result_dfs.append(df_group)
    
    return result_dfs

# Apply feature engineering
print(">>> Creating core features...")
train_fe = create_core_features(merged_train_df)
test_fe = create_core_features(test_df)

train_fe = add_external_features(train_fe)
test_fe = add_external_features(test_fe)

print(">>> Creating lag features...")
train_groups = [group for _, group in train_fe.groupby(['store_nbr', 'family'])]
train_groups_with_lags = create_lag_features(train_groups)
train_fe = pd.concat(train_groups_with_lags, ignore_index=True)
train_fe = train_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# Initialize test lag features
test_with_lags = []
for (store, family), test_group in test_fe.groupby(['store_nbr', 'family']):
    test_group = test_group.copy().reset_index(drop=True)
    
    train_group = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    
    if len(train_group) > 0:
        recent_sales = train_group['sales'].tail(56)  # 8 weeks
        recent_oil = train_group['oil_price'].tail(56)
        recent_promo = train_group['onpromotion'].tail(56)
        
        # Initialize lags
        test_group['sales_lag_1'] = recent_sales.iloc[-1] if len(recent_sales) >= 1 else recent_sales.mean()
        test_group['sales_lag_7'] = recent_sales.iloc[-7] if len(recent_sales) >= 7 else recent_sales.mean()
        test_group['sales_lag_14'] = recent_sales.iloc[-14] if len(recent_sales) >= 14 else recent_sales.mean()
        test_group['sales_lag_28'] = recent_sales.iloc[-28] if len(recent_sales) >= 28 else recent_sales.mean()
        
        # Rolling averages
        test_group['sales_ma_7'] = recent_sales.tail(7).mean()
        test_group['sales_ma_14'] = recent_sales.tail(14).mean()
        test_group['sales_ma_28'] = recent_sales.mean()
        
        # Rolling std
        test_group['sales_std_7'] = recent_sales.tail(7).std() if len(recent_sales) >= 7 else 0
        test_group['sales_std_14'] = recent_sales.tail(14).std() if len(recent_sales) >= 14 else 0
        
        # Trends
        test_group['sales_trend_7_14'] = test_group['sales_ma_7'] / (test_group['sales_ma_14'] + 0.01)
        test_group['sales_trend_14_28'] = test_group['sales_ma_14'] / (test_group['sales_ma_28'] + 0.01)
        
        # Oil lags
        test_group['oil_lag_7'] = recent_oil.iloc[-7] if len(recent_oil) >= 7 else recent_oil.mean()
        test_group['oil_lag_14'] = recent_oil.iloc[-14] if len(recent_oil) >= 14 else recent_oil.mean()
        
        # Promo lag
        test_group['promo_lag_7'] = recent_promo.iloc[-7] if len(recent_promo) >= 7 else recent_promo.mean()
        
    else:
        # Default values
        default_sales = 1.0
        for col in ['sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28', 
                   'sales_ma_7', 'sales_ma_14', 'sales_ma_28']:
            test_group[col] = default_sales
        test_group['sales_std_7'] = test_group['sales_std_14'] = 0.0
        test_group['sales_trend_7_14'] = test_group['sales_trend_14_28'] = 1.0
        test_group['oil_lag_7'] = test_group['oil_lag_14'] = 100.0
        test_group['promo_lag_7'] = 0.0
    
    test_with_lags.append(test_group)

test_fe = pd.concat(test_with_lags, ignore_index=True)
test_fe = test_fe.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# ============================
# 2️⃣ Store and Family Statistics
# ============================
print(">>> Creating aggregate features...")

# Store stats
store_stats = train_fe.groupby('store_nbr')['sales'].agg(['mean', 'std', 'median']).reset_index()
store_stats.columns = ['store_nbr', 'store_sales_mean', 'store_sales_std', 'store_sales_median']
store_stats['store_sales_std'] = store_stats['store_sales_std'].fillna(0)

# Family stats  
family_stats = train_fe.groupby('family')['sales'].agg(['mean', 'std', 'median']).reset_index()
family_stats.columns = ['family', 'family_sales_mean', 'family_sales_std', 'family_sales_median']
family_stats['family_sales_std'] = family_stats['family_sales_std'].fillna(0)

# Add to datasets
train_fe = train_fe.merge(store_stats, on='store_nbr', how='left')
train_fe = train_fe.merge(family_stats, on='family', how='left')
test_fe = test_fe.merge(store_stats, on='store_nbr', how='left')
test_fe = test_fe.merge(family_stats, on='family', how='left')

# Fill missing
for col in ['store_sales_mean', 'store_sales_std', 'store_sales_median', 
           'family_sales_mean', 'family_sales_std', 'family_sales_median']:
    overall_val = train_fe[col].median()
    train_fe[col] = train_fe[col].fillna(overall_val)
    test_fe[col] = test_fe[col].fillna(overall_val)

# ============================
# 3️⃣ Encoding
# ============================
le_family = LabelEncoder()
combined_families = pd.concat([train_fe['family'], test_fe['family']]).unique()
le_family.fit(combined_families)
train_fe['family_encoded'] = le_family.transform(train_fe['family'])
test_fe['family_encoded'] = le_family.transform(test_fe['family'])

# ============================
# 4️⃣ Random Forest Model
# ============================
print(">>> Training Random Forest...")

# Core feature set
rf_features = [
    # Time features
    'year', 'month', 'dayofweek', 'day', 'quarter',
    'is_weekend', 'is_month_end', 'is_payday',
    'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'days_since_start',
    
    # External features
    'oil_price', 'oil_price_norm', 'oil_price_ma_7', 'onpromotion', 'isHoliday',
    'promo_weekend', 'promo_holiday', 'earthquake_impact', 'salary_day_impact',
    
    # Lag features
    'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28',
    'sales_ma_7', 'sales_ma_14', 'sales_ma_28',
    'sales_std_7', 'sales_std_14',
    'sales_trend_7_14', 'sales_trend_14_28',
    'oil_lag_7', 'oil_lag_14', 'promo_lag_7',
    
    # Store & Category
    'store_nbr', 'family_encoded',
    'store_sales_mean', 'store_sales_std', 'store_sales_median',
    'family_sales_mean', 'family_sales_std', 'family_sales_median'
]

# Filter available features
available_rf_features = [col for col in rf_features if col in train_fe.columns and col in test_fe.columns]
print(f">>> Using {len(available_rf_features)} features for Random Forest")

# Prepare data
train_clean = train_fe.dropna(subset=['sales_lag_1', 'sales_lag_7'])
X_train_rf = train_clean[available_rf_features].fillna(0)
y_train_rf = train_clean['sales']
X_test_rf = test_fe[available_rf_features].fillna(0)

# Train Random Forest with optimized parameters
rf_model = RandomForestRegressor(
    n_estimators=120,
    max_depth=12,
    min_samples_split=6,
    min_samples_leaf=3,
    max_features=0.8,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_rf, y_train_rf)
rf_predictions = np.maximum(rf_model.predict(X_test_rf), 0.01)

print(">>> Random Forest completed")

# Feature importance
importance_df = pd.DataFrame({
    'feature': available_rf_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(">>> Top 10 Random Forest features:")
print(importance_df.head(10).to_string(index=False))

# ============================
# 5️⃣ Simple ARIMA Model
# ============================
print(">>> Running Simple ARIMA for key store-family combinations...")

def fit_simple_arima(train_data, forecast_periods):
    """Simple ARIMA model focused on core patterns"""
    
    if len(train_data) < 28:
        return None
    
    try:
        # Use recent data for better forecasting
        recent_data = train_data.sort_values('date').tail(90)  # ~3 months
        sales_series = recent_data['sales'].values
        
        # Simple differencing check
        if len(sales_series) > 7:
            # Check if series needs differencing
            diff_series = np.diff(sales_series)
            if np.std(diff_series) < np.std(sales_series):
                d = 1
            else:
                d = 0
        else:
            d = 1
        
        # Try different ARIMA configurations
        arima_configs = [
            (1, d, 1),
            (2, d, 1), 
            (1, d, 2),
            (2, d, 2),
            (0, d, 1),
            (1, d, 0)
        ]
        
        best_model = None
        best_aic = float('inf')
        
        for p, d, q in arima_configs:
            try:
                model = ARIMA(sales_series, order=(p, d, q))
                fitted_model = model.fit()
                
                if fitted_model.aic < best_aic:
                    best_aic = fitted_model.aic
                    best_model = fitted_model
                    
            except:
                continue
        
        if best_model is not None:
            # Forecast
            forecast = best_model.forecast(steps=forecast_periods)
            forecast = np.maximum(forecast, 0.01)  # Ensure positive
            return forecast
        else:
            return None
            
    except Exception as e:
        return None

# Select top store-family combinations by volume and consistency
print(">>> Selecting combinations for ARIMA...")
combination_scores = []

for (store, family), group in train_fe.groupby(['store_nbr', 'family']):
    if len(group) >= 28:  # Minimum data requirement
        volume = group['sales'].sum()
        consistency = 1.0 / (group['sales'].std() / (group['sales'].mean() + 0.01) + 0.01)  # Inverse CV
        data_points = len(group)
        
        # Composite score favoring high volume, consistent, well-sampled series
        score = volume * consistency * np.log(data_points)
        combination_scores.append((store, family, score))

# Sort by score and take top combinations
combination_scores.sort(key=lambda x: x[2], reverse=True)
top_combinations = [x[:2] for x in combination_scores[:150]]  # Top 150 for efficiency

print(f">>> Processing ARIMA for {len(top_combinations)} top combinations...")

arima_results = []
for i, (store, family) in enumerate(top_combinations):
    if i % 50 == 0:
        print(f">>> ARIMA progress: {i}/{len(top_combinations)}")
        
    train_subset = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    test_subset = test_fe[(test_fe['store_nbr'] == store) & (test_fe['family'] == family)]
    
    if len(test_subset) == 0:
        continue
        
    arima_pred = fit_simple_arima(train_subset, len(test_subset))
    
    if arima_pred is not None:
        result_df = test_subset[['id']].copy()
        result_df['sales_arima'] = arima_pred
        arima_results.append(result_df)

# Combine ARIMA results
if arima_results:
    arima_df = pd.concat(arima_results, ignore_index=True)
    print(f">>> ARIMA predictions generated for {len(arima_df)} test samples")
else:
    arima_df = pd.DataFrame(columns=['id', 'sales_arima'])

# ============================
# 6️⃣ Simple Store-Family Baseline
# ============================
print(">>> Creating store-family baseline...")

def simple_sf_prediction(train_subset, test_row):
    """Simple but effective store-family prediction"""
    
    if len(train_subset) < 3:
        return 1.0
    
    recent_data = train_subset.sort_values('date').tail(42)  # 6 weeks
    
    # Base prediction
    base_pred = recent_data['sales'].median()
    
    # Day of week adjustment
    dow_factor = 1.0
    dow_sales = recent_data[recent_data['dayofweek'] == test_row['dayofweek']]['sales']
    if len(dow_sales) >= 2:
        dow_factor = dow_sales.median() / (recent_data['sales'].median() + 0.01)
        dow_factor = np.clip(dow_factor, 0.7, 1.4)
    
    # Promotion adjustment
    promo_factor = 1.0
    if test_row.get('onpromotion', 0) == 1:
        promo_sales = recent_data[recent_data['onpromotion'] == 1]['sales']
        normal_sales = recent_data[recent_data['onpromotion'] == 0]['sales']
        if len(promo_sales) >= 1 and len(normal_sales) >= 1:
            promo_factor = promo_sales.median() / (normal_sales.median() + 0.01)
            promo_factor = np.clip(promo_factor, 1.0, 1.6)
        else:
            promo_factor = 1.15
    
    final_pred = base_pred * dow_factor * promo_factor
    return max(final_pred, 0.01)

# Create store-family predictions
sf_predictions = []
for (store, family), test_group in test_fe.groupby(['store_nbr', 'family']):
    train_subset = train_fe[(train_fe['store_nbr'] == store) & (train_fe['family'] == family)]
    
    if len(train_subset) < 2:
        continue
        
    predictions = []
    for _, test_row in test_group.iterrows():
        pred = simple_sf_prediction(train_subset, test_row)
        predictions.append(pred)
    
    if predictions:
        result_df = test_group[['id']].copy()
        result_df['sales_sf'] = predictions
        sf_predictions.append(result_df)

sf_df = pd.concat(sf_predictions, ignore_index=True) if sf_predictions else pd.DataFrame(columns=['id', 'sales_sf'])

# ============================
# 7️⃣ Three-Model Ensemble
# ============================
print(">>> Creating three-model ensemble...")

# Combine all predictions
final_df = test_fe[['id']].copy()
final_df['sales_rf'] = rf_predictions

# Add ARIMA predictions
if len(arima_df) > 0:
    final_df = final_df.merge(arima_df[['id', 'sales_arima']], on='id', how='left')
else:
    final_df['sales_arima'] = np.nan

# Add store-family predictions  
if len(sf_df) > 0:
    final_df = final_df.merge(sf_df[['id', 'sales_sf']], on='id', how='left')
else:
    final_df['sales_sf'] = np.nan

def three_model_ensemble(row):
    """Ensemble of Random Forest, ARIMA, and Store-Family predictions"""
    
    rf_pred = row['sales_rf']
    arima_pred = row['sales_arima']
    sf_pred = row['sales_sf']
    
    predictions = [rf_pred]  # RF always available
    weights = [0.6]  # RF gets highest weight
    
    # Add ARIMA if available (good for trend/seasonality)
    if not pd.isna(arima_pred):
        predictions.append(arima_pred)
        weights.append(0.25)
    
    # Add store-family if available (good for local patterns)
    if not pd.isna(sf_pred):
        predictions.append(sf_pred)
        weights.append(0.15)
    
    # Weighted average
    total_weight = sum(weights)
    ensemble_pred = sum(p * w for p, w in zip(predictions, weights)) / total_weight
    
    return max(ensemble_pred, 0.01)

final_df['sales'] = final_df.apply(three_model_ensemble, axis=1)

# ============================
# 8️⃣ Final Processing
# ============================
final_df = final_df[['id', 'sales']].sort_values('id').reset_index(drop=True)

# Handle missing IDs
all_test_ids = set(test_fe['id'])
predicted_ids = set(final_df['id'])
missing_ids = all_test_ids - predicted_ids

if missing_ids:
    print(f">>> Adding {len(missing_ids)} missing predictions...")
    median_sales = train_fe['sales'].median()
    missing_df = pd.DataFrame({'id': list(missing_ids), 'sales': [median_sales] * len(missing_ids)})
    final_df = pd.concat([final_df, missing_df], ignore_index=True)
    final_df = final_df.sort_values('id').reset_index(drop=True)

# Final validation
final_df['sales'] = final_df['sales'].fillna(train_fe['sales'].median())
final_df['sales'] = np.maximum(final_df['sales'], 0.01)

# Conservative outlier capping
q95 = train_fe['sales'].quantile(0.95)
final_df['sales'] = np.minimum(final_df['sales'], q95 * 3)

# Save submission
final_df.to_csv("rf_arima_ensemble_submission.csv", index=False)

print(f"\n>>> RF + ARIMA ENSEMBLE RESULTS:")
print(f">>> Submission shape: {final_df.shape}")
print(f">>> Sales range: {final_df['sales'].min():.3f} - {final_df['sales'].max():.3f}")
print(f">>> Sales median: {final_df['sales'].median():.3f}")
print(f">>> Sales mean: {final_df['sales'].mean():.3f}")
print(f">>> Random Forest coverage: 100%")
print(f">>> ARIMA coverage: {len(arima_df) / len(final_df) * 100:.1f}%")
print(f">>> Store-family coverage: {len(sf_df) / len(final_df) * 100:.1f}%")

print(f"\n>>> ENSEMBLE WEIGHTS:")
print(">>> Random Forest: 60% (main predictor)")
print(">>> ARIMA: 25% (time series patterns)")
print(">>> Store-Family: 15% (local knowledge)")

print(f"\n>>> KEY FEATURES:")
print(">>> ✓ Focused feature set (~40 features)")
print(">>> ✓ Simple ARIMA with automatic order selection") 
print(">>> ✓ Median-based store-family predictions")
print(">>> ✓ Three-model ensemble with fixed weights")
print(">>> ✓ Conservative outlier handling")
print(">>> ✓ Top 150 combinations for ARIMA efficiency")

print(">>> Submission saved as 'rf_arima_ensemble_submission.csv'")
print(">>> Target: < 0.40 RMSLE")

>>> Train period: 2013-01-01 00:00:00 to 2017-08-15 00:00:00
>>> Test period: 2017-08-16 00:00:00 to 2017-08-31 00:00:00
>>> Creating core features...
>>> Creating lag features...
>>> Creating aggregate features...
>>> Training Random Forest...
>>> Using 44 features for Random Forest
>>> Random Forest completed
>>> Top 10 Random Forest features:
         feature  importance
      sales_ma_7    0.706201
     sales_ma_14    0.126933
     sales_lag_7    0.079838
    sales_lag_14    0.033890
    sales_lag_28    0.010916
     sales_lag_1    0.008019
     sales_std_7    0.007630
sales_trend_7_14    0.007249
             day    0.002947
    sales_std_14    0.002350
>>> Running Simple ARIMA for key store-family combinations...
>>> Selecting combinations for ARIMA...
>>> Processing ARIMA for 150 top combinations...
>>> ARIMA progress: 0/150
>>> ARIMA progress: 50/150
>>> ARIMA progress: 100/150
>>> ARIMA predictions generated for 2400 test samples
>>> Creating store-family baseline...
>>> Creat

In [5]:
merged_train_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
sales                       float64
onpromotion                   int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
isHoliday                     int64
earthquake_impact             int64
salary_day_impact             int64
transactions                float64
dtype: object

In [6]:
test_df.dtypes

id                            int64
date                 datetime64[ns]
store_nbr                     int64
family                       object
onpromotion                   int64
earthquake_impact             int64
salary_day_impact             int64
isHoliday                     int64
oil_price                   float64
city                         object
state                        object
type                         object
cluster                       int64
transactions                float64
dtype: object

In [10]:
submission = pd.read_csv("my_submission_clean.csv")

# Check for NaN
print("NaN values per column:\n", submission.isna().sum())

# Check for inf
print("Infinite values per column:\n", np.isinf(submission).sum())

NaN values per column:
 id       0
sales    0
dtype: int64
Infinite values per column:
 id       0
sales    0
dtype: int64


In [9]:
import numpy as np
import pandas as pd

submission = pd.read_csv("my_submission_clean.csv")

# Find rows where sales is infinite
mask = np.isinf(submission["sales"])

# Show the offending rows
print(submission.loc[mask])


Empty DataFrame
Columns: [id, sales]
Index: []


In [8]:
import pandas as pd
import numpy as np

submission = pd.read_csv("my_submission.csv")

# Replace inf with NaN first (so ffill works)
submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward fill
submission["sales"].fillna(method="ffill", inplace=True)

# (Optional) if the very first row is NaN/inf, ffill won’t work — so backfill as fallback
submission["sales"].fillna(method="bfill", inplace=True)

# Save cleaned file
submission.to_csv("my_submission_clean.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission["sales"].fillna(method="ffill", inplace=True)
  submission["sales"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This in

In [6]:
import pandas as pd
import numpy as np

# Load your files
test_df = pd.read_csv("test_df.csv")
submission = pd.read_csv("my_submission.csv")

# Check the range of IDs
print("▶️ Test set ID range:", test_df["id"].min(), "to", test_df["id"].max())
print("▶️ Submission ID range:", submission["id"].min(), "to", submission["id"].max())

# Also check for IDs in submission but not in test
extra_ids = set(submission["id"]) - set(test_df["id"])
if extra_ids:
    print("⚠️ IDs present in submission but not in test:", list(extra_ids)[:10], "...")
else:
    print("✅ All submission IDs are inside test_df.")


▶️ Test set ID range: 3000888 to 3029399
▶️ Submission ID range: 3000888 to 3029399
✅ All submission IDs are inside test_df.
