In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, power_transform
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, Ridge
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load the Data

In [28]:

    train = pd.read_csv('/kaggle/input/time-series-store-sales/train.csv')
    test = pd.read_csv('/kaggle/input/time-series-store-sales/test.csv')
    holiday = pd.read_csv('/kaggle/input/time-series-store-sales/holidays_events.csv')
    oil = pd.read_csv('/kaggle/input/time-series-store-sales/oil.csv')
    store = pd.read_csv('/kaggle/input/time-series-store-sales/stores.csv')
    transaction = pd.read_csv('/kaggle/input/time-series-store-sales/transactions.csv')
    sample = pd.read_csv('/kaggle/input/time-series-store-sales/sample_submission.csv')
    


In [29]:
# ===== DATE PROCESSING FUNCTIONS =====
def modify_date(df):
    """Enhanced date feature engineering"""
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    
    # Basic date features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekofyear'] = df['date'].dt.isocalendar().week
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    
    # Additional time features for better time series modeling
    df['quarter'] = df['date'].dt.quarter
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['days_from_start'] = (df['date'] - df['date'].min()).dt.days
    
    # Cyclical encoding for better ML performance
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    return df

In [30]:
# ===== HOLIDAY PROCESSING =====
def process_holidays(df, holiday):
    """Improved holiday processing"""
    # Filter out transferred holidays
    holidays_clean = holiday[holiday["transferred"] == False].copy()
    
    # Process different holiday types
    nat = holidays_clean[holidays_clean["locale"] == "National"][["date", "description"]].rename(
        columns={"description": "holiday_national"})
    reg = holidays_clean[holidays_clean["locale"] == "Regional"][["date", "locale_name", "description"]].rename(
        columns={"locale_name": "state", "description": "holiday_regional"})
    loc = holidays_clean[holidays_clean["locale"] == "Local"][["date", "locale_name", "description"]].rename(
        columns={"locale_name": "city", "description": "holiday_local"})
    
    # Merge holidays step by step
    df_merged = df.merge(nat, on="date", how="left")
    df_merged = df_merged.merge(reg, on=["date", "state"], how="left")
    df_merged = df_merged.merge(loc, on=["date", "city"], how="left")
    
    # Create holiday indicators
    df_merged['is_national_holiday'] = df_merged['holiday_national'].notna().astype(int)
    df_merged['is_regional_holiday'] = df_merged['holiday_regional'].notna().astype(int)
    df_merged['is_local_holiday'] = df_merged['holiday_local'].notna().astype(int)
    df_merged['is_any_holiday'] = (df_merged['is_national_holiday'] | 
                                   df_merged['is_regional_holiday'] | 
                                   df_merged['is_local_holiday']).astype(int)
    
    return df_merged

In [31]:
# ===== EXTERNAL DATA PROCESSING =====
def process_oil_data(df, oil):
    """Process and merge oil price data"""
    oil_clean = oil.copy()
    oil_clean['date'] = pd.to_datetime(oil_clean['date'])
    
    # Forward fill missing oil prices
    oil_clean['dcoilwtico'] = oil_clean['dcoilwtico'].fillna(method='ffill')
    oil_clean['dcoilwtico'] = oil_clean['dcoilwtico'].fillna(method='bfill')
    
    # Create oil price features
    oil_clean['oil_price_ma7'] = oil_clean['dcoilwtico'].rolling(window=7).mean()
    oil_clean['oil_price_change'] = oil_clean['dcoilwtico'].pct_change()
    df['date'] = pd.to_datetime(df['date'])
    return df.merge(oil_clean, on='date', how='left')

In [32]:
def process_transactions(df, transaction):
    """Process and merge transaction data"""
    trans_clean = transaction.copy()
    trans_clean['date'] = pd.to_datetime(trans_clean['date'])
    
    # Create transaction features
    trans_clean['transactions_ma7'] = trans_clean.groupby('store_nbr')['transactions'].transform(
        lambda x: x.rolling(window=7, min_periods=1).mean())
    
    return df.merge(trans_clean, on=['date', 'store_nbr'], how='left')

In [33]:
# ===== ENCODING FUNCTIONS =====
existing_cat_cols = []
def onehotencoding(df, columns):
    for col in columns:
        existing_cat_cols.append(col)
    
    if not existing_cat_cols:
        print("not working")
        return df
    
    trf1 = ColumnTransformer(
        transformers=[
            ('OneHotEncode', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), existing_cat_cols)
        ],
        remainder='passthrough'
    )
    
    transformed = trf1.fit_transform(df)
    
    # Get feature names
    ohe_features = trf1.named_transformers_['OneHotEncode'].get_feature_names_out(existing_cat_cols)
    remain_columns = [col for col in df.columns if col not in existing_cat_cols]
    
    combine_features = list(ohe_features) + remain_columns
    
    return pd.DataFrame(transformed, columns=combine_features)

In [34]:
# ===== DATA TRANSFORMATION =====
def data_distribution(df, choice, target_col='sales'):
    """Improved data transformation with target preservation"""
    df_copy = df.copy()
    
    # Separate target if it exists
    target_data = None
    if target_col in df_copy.columns:
        target_data = df_copy[target_col].copy()
        df_copy = df_copy.drop(columns=[target_col])
    
    # Only transform numeric columns
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns
    df_numeric = df_copy[numeric_cols].copy()
    df_non_numeric = df_copy.drop(columns=numeric_cols)
    
    # Handle negative values and zeros for certain transformations
    if choice in ["log", "sqrt"]:
        df_numeric = df_numeric.clip(lower=0.001)  # Avoid log(0) and sqrt of negative
    
    try:
        match choice:
            case "log":
                log_transform = FunctionTransformer(np.log1p, validate=True)
                transformed_numeric = log_transform.fit_transform(df_numeric)
            
            case "sqrt":
                sqrt_transformer = FunctionTransformer(np.sqrt, validate=True)
                transformed_numeric = sqrt_transformer.fit_transform(df_numeric)
            
            case "reciprocal":
                reciprocal_transformer = FunctionTransformer(lambda x: 1 / (x + 0.0001), validate=True)
                transformed_numeric = reciprocal_transformer.fit_transform(df_numeric)
            
            case "yeo_johnson":
                transformed_numeric = power_transform(df_numeric, method='yeo-johnson')
            
            case _:
                print(f"Unknown choice: {choice}. Returning original data.")
                return df
    
    except Exception as e:
        print(f"Error in transformation {choice}: {e}")
        return df
    
    # Reconstruct dataframe
    result = pd.DataFrame(transformed_numeric, columns=numeric_cols, index=df_copy.index)
    
    # Add back non-numeric columns
    if not df_non_numeric.empty:
        result = pd.concat([result, df_non_numeric.reset_index(drop=True)], axis=1)
    
    # Add back target if it existed
    if target_data is not None:
        result[target_col] = target_data.reset_index(drop=True)
    
    return result

In [35]:
# ===== SCALING FUNCTIONS =====
def scale_data(df, choice, target_col='sales'):
    """Improved scaling with target preservation"""
    df_copy = df.copy()
    
    # Separate target if it exists
    target_data = None
    if target_col in df_copy.columns:
        target_data = df_copy[target_col].copy()
        df_copy = df_copy.drop(columns=[target_col])
    
    # Only scale numeric columns
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns
    df_numeric = df_copy[numeric_cols]
    df_non_numeric = df_copy.drop(columns=numeric_cols)
    
    match choice:
        case "minmax":
            scaler = MinMaxScaler()
            scaled_numeric = scaler.fit_transform(df_numeric)
            print("Applied MinMaxScaler")
        
        case "standard":
            scaler = StandardScaler()
            scaled_numeric = scaler.fit_transform(df_numeric)
            print("Applied StandardScaler")
        
        case _:
            print("Invalid scaling type selected!")
            return df
    
    # Reconstruct dataframe
    result = pd.DataFrame(scaled_numeric, columns=numeric_cols, index=df_copy.index)
    
    # Add back non-numeric columns
    if not df_non_numeric.empty:
        result = pd.concat([result, df_non_numeric.reset_index(drop=True)], axis=1)
    
    # Add back target
    if target_data is not None:
        result[target_col] = target_data.reset_index(drop=True)
    
    return result

# ===== MAIN PREPROCESSING PIPELINE =====

In [36]:
    anurag = None
    """Complete preprocessing pipeline"""
    # Load data
    # train, test, holiday, oil, store, transaction, sample = load_data()
    
    print("Original shapes:")
    print(f"Train: {train.shape}, Test: {test.shape}")
    
    # Merge with store information
    train_merged = train.merge(store, on="store_nbr", how="left")
    test_merged = test.merge(store, on="store_nbr", how="left")
    
    print(f"After store merge - Train: {train_merged.shape}, Test: {test_merged.shape}")
    
    # Process holidays
    train_merged = process_holidays(train_merged, holiday)
    test_merged = process_holidays(test_merged, holiday)
    
    # Process oil data
    train_merged = process_oil_data(train_merged, oil)
    test_merged = process_oil_data(test_merged, oil)
    
    # Process transactions
    train_merged = process_transactions(train_merged, transaction)
    test_merged = process_transactions(test_merged, transaction)
    
    # Process dates
    train_processed = modify_date(train_merged)
    test_processed = modify_date(test_merged)
    
    # Drop original date column and id from train
    train_processed = train_processed.drop(columns=['date'])
    test_processed = test_processed.drop(columns=['date'])
    
    if 'id' in train_processed.columns:
        train_processed = train_processed.drop(columns=['id'])
    print(f"After feature engineering - Train: {train_processed.shape}, Test: {test_processed.shape}")

    
    # Handle missing values
    train_processed = train_processed.fillna(0)  # or use more sophisticated imputation
    test_processed = test_processed.fillna(0)
    

Original shapes:
Train: (3000888, 6), Test: (28512, 5)
After store merge - Train: (3000888, 10), Test: (28512, 9)
After feature engineering - Train: (3008280, 34), Test: (28512, 34)


In [37]:
cate_col = [col for col in train_processed.select_dtypes(include= ['object']).columns]
train_processed= train_processed.drop(columns = cate_col)

In [38]:
cate_col = [col for col in test_processed.select_dtypes(include= ['object']).columns]
test_processed= test_processed.drop(columns = cate_col)

In [39]:
train_processed.columns

Index(['store_nbr', 'sales', 'onpromotion', 'cluster', 'is_national_holiday',
       'is_regional_holiday', 'is_local_holiday', 'is_any_holiday',
       'dcoilwtico', 'oil_price_ma7', 'oil_price_change', 'transactions',
       'transactions_ma7', 'year', 'month', 'day', 'dayofweek', 'weekofyear',
       'is_weekend', 'quarter', 'is_month_start', 'is_month_end',
       'days_from_start', 'month_sin', 'month_cos', 'dayofweek_sin',
       'dayofweek_cos'],
      dtype='object')

In [40]:
test_processed = test_processed.drop(columns = ['holiday_national','holiday_regional'])

In [41]:
test_processed.shape

(28512, 27)

In [42]:
    # One-hot encode categorical variables
    # cat_cols = train_processed.select_dtypes(include=['object']).columns
    # train_processed = onehotencoding(train_processed, cat_cols)
    # test_processed = onehotencoding(test_processed, cat_cols)
    # print(f"After OneHotEncoding - Train: {train_processed.shape}, Test: {test_processed.shape}")
    # Align columns between train and test
    common_cols = list(set(train_processed.columns) & set(test_processed.columns))
    train_cols_only = [col for col in train_processed.columns if col not in common_cols and col != 'sales']
    test_cols_only = [col for col in test_processed.columns if col not in common_cols]
    
    # Add missing columns with zeros
    for col in train_cols_only:
        test_processed[col] = 0
    for col in test_cols_only:
        train_processed[col] = 0
    
    # Apply transformations (optional - test different options)
    # train_processed = data_distribution(train_processed, "log")  # Uncomment if needed
    # train_processed = scale_data(train_processed, "standard")    # Uncomment if needed
    
    print(f"Final shapes - Train: {train_processed.shape}, Test: {test_processed.shape}")
    
    # return train_processed, test_processed

Final shapes - Train: (3008280, 28), Test: (28512, 27)


In [43]:
train_final = train_processed
test_final = test_processed

In [44]:
# ===== MODEL TRAINING AND EVALUATION =====
def evaluate_models(X_train, X_test, y_train, y_test, use_time_series_cv=True):
    """
    Train and evaluate multiple regression models
    """
    # Initialize models
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=2000),
        # 'SGDRegressor': SGDRegressor(random_state=42, max_iter=2000),
        # 'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        # 'ExtraTrees': ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        # 'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        # 'Bagging': BaggingRegressor(random_state=42, n_jobs=-1)
    }
    
    results = {}
    trained_models = {}
    
    print("Training and evaluating models...")
    print("=" * 60)
    
    for name, model in models.items():
        print(f"Training {name}...")
        
        try:
            # Train the model
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)
            
            # Calculate metrics
            train_mse = mean_squared_error(y_train, y_pred_train)
            test_mse = mean_squared_error(y_test, y_pred_test)
            train_mae = mean_absolute_error(y_train, y_pred_train)
            test_mae = mean_absolute_error(y_test, y_pred_test)
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)
            
            # Time series cross-validation (optional)
            cv_scores = None
            if use_time_series_cv:
                try:
                    tscv = TimeSeriesSplit(n_splits=5)
                    cv_scores = cross_val_score(model, X_train, y_train, 
                                              cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
                    cv_rmse = np.sqrt(-cv_scores.mean())
                except:
                    cv_rmse = "N/A"
            
            results[name] = {
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_rmse': np.sqrt(train_mse),
                'test_rmse': np.sqrt(test_mse),
                'train_mae': train_mae,
                'test_mae': test_mae,
                'train_r2': train_r2,
                'test_r2': test_r2,
                'cv_rmse': cv_rmse if use_time_series_cv else "N/A"
            }
            
            trained_models[name] = model
            
            print(f"✓ {name} completed")
            print(f"  Test RMSE: {np.sqrt(test_mse):.4f}")
            print(f"  Test R²: {test_r2:.4f}")
            print()
            
        except Exception as e:
            print(f"✗ Error training {name}: {str(e)}")
            results[name] = "Error"
            continue
    
    return results, trained_models

In [45]:
def create_ensemble_models(trained_models):
    """
    Create ensemble models using the best performing individual models
    """
    # Filter out models that had errors
    valid_models = {name: model for name, model in trained_models.items() 
                   if model is not None}
    
    if len(valid_models) < 2:
        print("Not enough valid models for ensemble creation")
        return {}
    
    ensemble_models = {}
    
    # Voting Regressor (average predictions)
    try:
        voting_models = [(name, model) for name, model in valid_models.items() 
                        if name in ['RandomForest', 'ExtraTrees', 'GradientBoosting']]
        
        if len(voting_models) >= 2:
            ensemble_models['VotingRegressor'] = VotingRegressor(
                estimators=voting_models, n_jobs=-1
            )
    except Exception as e:
        print(f"Error creating VotingRegressor: {e}")
    
    return ensemble_models

In [46]:
def display_results(results):
    """
    Display model evaluation results in a formatted table
    """
    import pandas as pd
    
    # Convert results to DataFrame for better display
    results_df = pd.DataFrame(results).T
    
    # Filter out error results
    results_df = results_df[results_df.apply(lambda x: x != "Error").all(axis=1)]
    
    if results_df.empty:
        print("No valid results to display")
        return
    
    # Sort by test RMSE
    try:
        results_df = results_df.sort_values('test_rmse', ascending=True)
    except:
        pass
    
    print("Model Performance Summary:")
    print("=" * 80)
    print(f"{'Model':<20} {'Test RMSE':<12} {'Test R²':<10} {'Test MAE':<12} {'CV RMSE':<12}")
    print("-" * 80)
    
    for model_name, metrics in results_df.iterrows():
        try:
            test_rmse = f"{metrics['test_rmse']:.4f}"
            test_r2 = f"{metrics['test_r2']:.4f}"
            test_mae = f"{metrics['test_mae']:.4f}"
            cv_rmse = f"{metrics['cv_rmse']:.4f}" if metrics['cv_rmse'] != "N/A" else "N/A"
            
            print(f"{model_name:<20} {test_rmse:<12} {test_r2:<10} {test_mae:<12} {cv_rmse:<12}")
        except:
            print(f"{model_name:<20} Error displaying metrics")
    
    print("-" * 80)
    
    # Best model
    try:
        best_model = results_df.iloc[0].name
        best_rmse = results_df.iloc[0]['test_rmse']
        print(f"\nBest Model: {best_model} (Test RMSE: {best_rmse:.4f})")
    except:
        print("\nCould not determine best model")

In [47]:
def train_and_evaluate_pipeline(train_data, test_size=0.2, random_state=42):
    """
    Complete pipeline for training and evaluating models
    """
    if 'sales' not in train_data.columns:
        print("Error: 'sales' column not found in training data")
        return None, None, None
    
    # Prepare features and target
    X = train_data.drop(columns=['sales'])
    y = train_data['sales']
    
    print(f"Dataset shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Target statistics:")
    print(y.describe())
    print()
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False  # No shuffle for time series
    )
    
    print(f"Train set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    print()
    
    # Train and evaluate models
    results, trained_models = evaluate_models(X_train, X_test, y_train, y_test)
    
    # Create and train ensemble models
    ensemble_models = create_ensemble_models(trained_models)
    
    if ensemble_models:
        print("Training ensemble models...")
        for name, model in ensemble_models.items():
            try:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                test_mse = mean_squared_error(y_test, y_pred)
                test_r2 = r2_score(y_test, y_pred)
                test_mae = mean_absolute_error(y_test, y_pred)
                
                results[name] = {
                    'train_mse': 0,  # Not calculated for ensemble
                    'test_mse': test_mse,
                    'train_rmse': 0,
                    'test_rmse': np.sqrt(test_mse),
                    'train_mae': 0,
                    'test_mae': test_mae,
                    'train_r2': 0,
                    'test_r2': test_r2,
                    'cv_rmse': "N/A"
                }
                
                trained_models[name] = model
                print(f"✓ {name} completed - Test RMSE: {np.sqrt(test_mse):.4f}")
                
            except Exception as e:
                print(f"✗ Error training {name}: {str(e)}")
    
    # Display results
    print("\n")
    display_results(results)
    
    return results, trained_models, (X_train, X_test, y_train, y_test)

In [48]:
def make_predictions(model, test_data, feature_columns=None):
    """
    Make predictions on test data using trained model
    """
    try:
        if feature_columns is not None:
            # Ensure test data has the same columns as training data
            missing_cols = set(feature_columns) - set(test_data.columns)
            if missing_cols:
                print(f"Warning: Missing columns in test data: {missing_cols}")
                for col in missing_cols:
                    test_data[col] = 0
            
            # Select only the required columns in the same order
            test_data = test_data[feature_columns]
        
        predictions = model.predict(test_data)
        return predictions
    
    except Exception as e:
        print(f"Error making predictions: {str(e)}")
        return None

# ===== USAGE EXAMPLE =====

In [49]:

    # Run the preprocessing pipeline
    # train_final, test_final = preprocess_data()
    
    print("\nPreprocessing completed successfully!")
    print(f"Train shape: {train_final.shape}")
    print(f"Test shape: {test_final.shape}")
    
    # Check for missing values
    print(f"\nMissing values in train: {train_final.isnull().sum().sum()}")
    print(f"Missing values in test: {test_final.isnull().sum().sum()}")
    
    # Display basic info
    print(f"\nTrain columns: {train_final.columns.tolist()[:10]}...")  # First 10 columns
    print(f"Target variable stats:")
    if 'sales' in train_final.columns:
        print(train_final['sales'].describe())
    


Preprocessing completed successfully!
Train shape: (3008280, 28)
Test shape: (28512, 27)

Missing values in train: 0
Missing values in test: 0

Train columns: ['store_nbr', 'sales', 'onpromotion', 'cluster', 'is_national_holiday', 'is_regional_holiday', 'is_local_holiday', 'is_any_holiday', 'dcoilwtico', 'oil_price_ma7']...
Target variable stats:
count    3.008280e+06
mean     3.582643e+02
std      1.103486e+03
min      0.000000e+00
25%      0.000000e+00
50%      1.100000e+01
75%      1.960000e+02
max      1.247170e+05
Name: sales, dtype: float64


In [50]:
    # ===== MODEL TRAINING AND EVALUATION =====
    print("\n" + "="*60)
    print("STARTING MODEL TRAINING AND EVALUATION")
    print("="*60)
    
    # Train and evaluate models
    results, trained_models, data_splits = train_and_evaluate_pipeline(train_final)
    
    if results is not None:
        # Get the best model
        results_df = pd.DataFrame(results).T
        results_df = results_df[results_df.apply(lambda x: x != "Error").all(axis=1)]
        
        if not results_df.empty:
            best_model_name = results_df.sort_values('test_rmse').index[0]
            best_model = trained_models[best_model_name]
            
            print(f"\nBest model selected: {best_model_name}")
            
            # Make predictions on the actual test set
            print("\nMaking predictions on test set...")
            X_train, X_test, y_train, y_test = data_splits
            feature_columns = X_train.columns.tolist()
            
            test_predictions = make_predictions(best_model, test_final, feature_columns)
            
            if test_predictions is not None:
                print(f"Predictions shape: {test_predictions.shape}")
                print(f"Prediction statistics:")
                print(f"Min: {test_predictions.min():.4f}")
                print(f"Max: {test_predictions.max():.4f}")
                print(f"Mean: {test_predictions.mean():.4f}")
                
                # Create submission file if test data has 'id' column
                if 'id' in test_final.columns:
                    submission = pd.DataFrame({
                        'id': test_final['id'],
                        'sales': test_predictions
                    })
                    
                    # Save submission file
                    submission.to_csv('submission.csv', index=False)
                    print(f"\nSubmission file saved as 'submission.csv'")
                    print(f"Submission shape: {submission.shape}")
                    print(submission.head())
    
    print("\n" + "="*60)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("="*60)


STARTING MODEL TRAINING AND EVALUATION
Dataset shape: (3008280, 27)
Target shape: (3008280,)
Target statistics:
count    3.008280e+06
mean     3.582643e+02
std      1.103486e+03
min      0.000000e+00
25%      0.000000e+00
50%      1.100000e+01
75%      1.960000e+02
max      1.247170e+05
Name: sales, dtype: float64

Train set size: 2406624
Test set size: 601656

Training and evaluating models...
Training LinearRegression...
✓ LinearRegression completed
  Test RMSE: 1067.7300
  Test R²: 0.3858

Training Ridge...
✓ Ridge completed
  Test RMSE: 1068.1036
  Test R²: 0.3854

Training ElasticNet...
✓ ElasticNet completed
  Test RMSE: 1068.0884
  Test R²: 0.3854



Model Performance Summary:
Model                Test RMSE    Test R²    Test MAE     CV RMSE     
--------------------------------------------------------------------------------
LinearRegression     1067.7300    0.3858     522.1726     1006.4971   
ElasticNet           1068.0884    0.3854     510.1100     997.3973    
Ridge       

In [56]:
submission  = pd.read_csv('/kaggle/working/submission.csv')
submission.shape

(28512, 2)

In [52]:
train.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')