In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import pickle
from datetime import datetime

In [12]:
# Location of the raw sales export.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to point DATA_PATH at their own copy of the CSV.
DATA_PATH = r"C:\Users\Bharath Kumar\Downloads\Forecasting_sales_data.csv"

# Load the raw data and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,date,store_id,region,product_id,sales,price,promo_flag,promo_depth,holiday_flag,holiday_name,temp_c,precipitation_mm,stock_available,competitor_price
0,16-10-2024,S02,East,P001,11,103.52,0,0.0,0,,29.3,0.0,74,105.17
1,08-09-2023,S04,North,P005,75,108.37,0,0.0,0,,22.6,1.3,87,105.83
2,23-11-2024,S09,West,P003,18,104.53,0,0.0,0,,22.7,0.0,48,109.5
3,02-02-2023,S01,South,P003,42,107.92,1,0.39,0,,28.7,0.3,189,108.28
4,27-08-2023,S09,North,P020,9,98.37,0,0.0,0,,25.6,0.0,52,103.03


In [13]:
def preprocess_data(df, date_format='%d-%m-%Y'):
    """Turn the raw sales frame into a model-ready frame.

    The input is copied first, so the caller's frame is never modified.

    Steps performed on the copy:
      * parse ``date`` and derive ``month``, ``day_of_week`` (Monday=0) and
        ``is_weekend`` (1 for Saturday/Sunday, else 0);
      * label-encode ``store_id``, ``region``, ``product_id`` and
        ``holiday_name`` (whichever of them are present), replacing each
        column with its integer codes;
      * add ``price_ratio`` (``price / competitor_price``) and
        ``promo_effectiveness`` (``promo_flag * promo_depth``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain ``date``, ``price``, ``competitor_price``,
        ``promo_flag`` and ``promo_depth``.
    date_format : str, optional
        ``strftime`` pattern of the ``date`` strings. Defaults to the
        day-first ``DD-MM-YYYY`` layout used by the sales export.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The transformed copy, and a dict mapping each encoded column name to
        the ``LabelEncoder`` fitted on it (needed to encode new data the same
        way at prediction time).

    Raises
    ------
    ValueError
        If a ``date`` string does not match ``date_format``.
    """
    df = df.copy()

    # Dates are stored day-first (e.g. "08-09-2023" is 8 September). Without an
    # explicit format pandas has to guess, and ambiguous rows can be read
    # month-first, silently corrupting month / day_of_week. Being explicit
    # also makes malformed rows fail loudly instead of being misparsed.
    df['date'] = pd.to_datetime(df['date'], format=date_format)
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Encode categorical variables. Values are stringified first so that
    # missing entries (e.g. no holiday) become an ordinary category with its
    # own code rather than breaking the encoder.
    label_encoders = {}
    categorical_cols = ['store_id', 'region', 'product_id', 'holiday_name']

    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # Feature engineering
    df['price_ratio'] = df['price'] / df['competitor_price']
    df['promo_effectiveness'] = df['promo_flag'] * df['promo_depth']

    return df, label_encoders

In [14]:
def prepare_features(df):
    """Split a preprocessed frame into model inputs and the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains the encoded, calendar and engineered
        columns listed below, plus the ``sales`` column.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.Series]
        ``(X, y)``: the selected feature columns, in the order listed here,
        and the ``sales`` column.
    """
    # Columns taken as-is from the (encoded) source data.
    source_inputs = (
        'store_id', 'region', 'product_id', 'price', 'promo_flag',
        'promo_depth', 'holiday_flag', 'temp_c', 'precipitation_mm',
        'stock_available', 'competitor_price',
    )
    # Columns derived from the date.
    calendar_inputs = ('month', 'day_of_week', 'is_weekend')
    # Columns computed from other columns.
    engineered_inputs = ('price_ratio', 'promo_effectiveness')

    selected = [*source_inputs, *calendar_inputs, *engineered_inputs]

    predictors = df[selected]
    target = df['sales']
    return predictors, target

# Run the pipeline on the raw frame: preprocess_data returns the transformed
# copy plus the fitted label encoders (kept so they can be saved alongside the
# model), and prepare_features then splits that copy into the feature matrix X
# and the 'sales' target y.
df_processed, label_encoders = preprocess_data(df)
X, y = prepare_features(df_processed)

  df['date'] = pd.to_datetime(df['date'])


In [15]:
# Split data: hold out 20% of the rows for evaluation; random_state=42 pins the
# split so the reported metrics are reproducible.
# NOTE(review): the rows are dated sales records and this split is random, so
# train and test both span the whole date range. If the model is meant to
# forecast future periods, a chronological split may give a more honest error
# estimate; confirm the intended use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Hyperparameters collected in one place so they are easy to read and tune.
# random_state is fixed so that the row/column subsampling is repeatable.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)

# Only announces the step; the model is not fitted in this cell.
print("Training XGBoost model...")

Training XGBoost model...


In [17]:
# Fit on the training split only; X_test / y_test stay unseen until evaluation.
model.fit(X_train, y_train)

In [18]:
# Score the held-out split. MAE and RMSE are both in units of 'sales';
# RMSE penalises large misses more heavily.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# One line per metric, rounded to two decimals.
report_lines = [
    "Model Performance:",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
]
print("\n".join(report_lines))

Model Performance:
MAE: 9.82
RMSE: 14.58


In [19]:
# Bundle everything needed to reuse the model later: the fitted regressor, the
# encoders used on the categorical columns, the exact feature order the model
# was trained on, and the hold-out metrics for reference.
model_data = dict(
    model=model,
    label_encoders=label_encoders,
    feature_columns=X.columns.tolist(),
    performance=dict(mae=mae, rmse=rmse),
)

# Written to the current working directory.
# NOTE: pickle files execute code when loaded; only load this file from a
# trusted source.
with open('sales_model.pkl', 'wb') as pkl_file:
    pickle.dump(model_data, pkl_file)

print("Model saved as 'sales_model.pkl'")

Model saved as 'sales_model.pkl'
