# Sales Forecasting Analysis
## Using Superstore Sales Data

## 1. Data Loading and Initial Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever name this
# installation actually provides instead of hard-coding one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [None]:
# Read the raw order data, decoding the file as latin-1
df = pd.read_csv('../train.csv', encoding='latin-1')

# Parse both date columns from day/month/year text into datetimes
date_columns = ['Order Date', 'Ship Date']
df[date_columns] = df[date_columns].apply(pd.to_datetime, format='%d/%m/%Y')

# Size of the table and a peek at the first records
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head(5))

# Descriptive statistics for every column, numeric or not
print("\nSummary statistics:")
display(df.describe(include='all'))

# Number of null entries in each column
print("\nMissing values per column:")
print(df.isna().sum())

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Extract date components of the order date
df['Year'] = df['Order Date'].dt.year
df['Month'] = df['Order Date'].dt.month
df['Day'] = df['Order Date'].dt.day
df['DayOfWeek'] = df['Order Date'].dt.dayofweek
df['Quarter'] = df['Order Date'].dt.quarter

# Calculate days between order and ship date
df['Days_to_Ship'] = (df['Ship Date'] - df['Order Date']).dt.days

# Create a daily sales dataframe: one row per order date holding that day's
# total sales, indexed by 'Order Date' with a single 'Sales' column
daily_sales = df.groupby('Order Date')['Sales'].sum().to_frame()

# Resample to month-end totals. The MonthEnd offset object is equivalent to
# the 'M' string alias, which pandas 2.2 deprecated in favour of 'ME'; passing
# the object keeps this line working regardless of which alias is accepted.
monthly_sales = daily_sales.resample(pd.offsets.MonthEnd()).sum()

# Plot monthly sales. DataFrame.plot() opens a figure of its own unless it is
# handed an Axes, which would leave the 14x6 figure below blank and draw the
# chart at the default size, so draw explicitly into the prepared figure.
plt.figure(figsize=(14, 6))
monthly_sales.plot(ax=plt.gca())
plt.title('Monthly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Total Sales ($)')
plt.grid(True)
plt.savefig('../monthly_sales_trend.png')
plt.show()

## 3. Time Series Analysis

In [None]:
# Split the monthly series into trend, seasonal and residual parts using an
# additive model with a period of 12 observations
decomposition = seasonal_decompose(monthly_sales, model='additive', period=12)

# One stacked panel per series: the original first, then each component
panels = (
    (monthly_sales, 'Original'),
    (decomposition.trend, 'Trend'),
    (decomposition.seasonal, 'Seasonal'),
    (decomposition.resid, 'Residual'),
)

plt.figure(figsize=(14, 10))
for position, (series, label) in enumerate(panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(series, label=label)
    plt.legend()
plt.tight_layout()
plt.savefig('../time_series_decomposition.png')
plt.show()

## 4. Feature Engineering for Forecasting

In [None]:
# Create features for the model
def create_features(df, lags=(1, 2, 3, 12), rolling_window=3):
    """Return a copy of *df* with calendar, lag and rolling features added.

    The input frame is not modified.

    Args:
        df: DataFrame with a datetime-like index (it must expose ``.month``,
            ``.year`` and ``.quarter``) and a ``'Sales'`` column.
        lags: Row offsets for the lag features; each value ``i`` adds a
            ``sales_lag_{i}`` column holding ``Sales`` shifted by ``i`` rows.
        rolling_window: Number of rows in the rolling window; adds
            ``rolling_mean_{rolling_window}`` and
            ``rolling_std_{rolling_window}`` columns.

    Returns:
        A new DataFrame containing the original columns followed by
        ``month``, ``year``, ``quarter``, the lag columns and the two rolling
        columns. Rows without enough history contain NaN in the affected
        columns.
    """
    df = df.copy()
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['quarter'] = df.index.quarter

    # Lag features
    for lag in lags:
        df[f'sales_lag_{lag}'] = df['Sales'].shift(lag)

    # Rolling features are computed on the series shifted by one row so that a
    # row's own Sales value never contributes to its own features.
    previous_sales = df['Sales'].shift(1).rolling(window=rolling_window)
    df[f'rolling_mean_{rolling_window}'] = previous_sales.mean()
    df[f'rolling_std_{rolling_window}'] = previous_sales.std()

    return df

# Build the modelling table and discard every row that contains a missing value
df_forecast = create_features(monthly_sales).dropna()

# Target is the 'Sales' column; all remaining columns are predictors
y = df_forecast['Sales']
X = df_forecast.drop(columns='Sales')

# Split without shuffling so the rows keep their original order and the
# last 20% of them form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

## 5. Model Training and Evaluation

In [None]:
# Fit a 100-tree random forest with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error: ${mae:,.2f}")
print(f"Root Mean Squared Error: ${rmse:,.2f}")

# Rank the predictors by the importance the fitted forest assigns them
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart, most important feature at the top
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('../feature_importance.png')
plt.show()

## 6. Future Sales Forecasting

In [None]:
def forecast_future(months_to_forecast=12):
    """Forecast monthly sales recursively, one month at a time.

    Every lag and rolling feature produced by ``create_features`` is derived
    from earlier values of ``Sales``. Future months have no ``Sales`` yet, so
    building all future features in a single pass leaves the lag and rolling
    inputs of every month after the first as NaN. Each prediction is
    therefore written back into the series before the next month's features
    are built, so later months are predicted from earlier forecasts.

    Relies on the notebook-level names ``monthly_sales``, ``model`` and
    ``create_features``.

    Args:
        months_to_forecast: Number of months to predict after the last month
            in ``monthly_sales``. Must be at least 1.

    Returns:
        DataFrame with one row per forecast month and the columns ``Date``
        and ``Forecasted_Sales``.

    Raises:
        ValueError: If ``months_to_forecast`` is less than 1.
    """
    if months_to_forecast < 1:
        raise ValueError("months_to_forecast must be at least 1")

    # Month-end dates following the last observed month. The MonthEnd offset
    # object is equivalent to the 'M' alias, which pandas 2.2 deprecated.
    last_date = monthly_sales.index[-1]
    forecast_dates = pd.date_range(
        start=last_date + pd.DateOffset(months=1),
        periods=months_to_forecast,
        freq=pd.offsets.MonthEnd(),
    )

    # History followed by placeholder rows that are filled in as we go
    future = pd.DataFrame({'Sales': np.nan}, index=forecast_dates)
    full_data = pd.concat([monthly_sales, future])

    predictions = []
    for forecast_date in forecast_dates:
        # Rebuild the features so they include every forecast made so far
        features = create_features(full_data).loc[[forecast_date]]
        predicted_sales = model.predict(features.drop('Sales', axis=1))[0]

        # Feed the prediction back in for the following months' features
        full_data.loc[forecast_date, 'Sales'] = predicted_sales
        predictions.append(predicted_sales)

    return pd.DataFrame({
        'Date': forecast_dates,
        'Forecasted_Sales': predictions,
    })

# Forecast the next 12 months
forecast = forecast_future(months_to_forecast=12)

# Draw history and forecast on one chart, with a marker at the last observed month
plt.figure(figsize=(14, 7))
plt.plot(monthly_sales.index, monthly_sales['Sales'], label='Historical Sales')
plt.plot(
    forecast['Date'],
    forecast['Forecasted_Sales'],
    color='r',
    linestyle='--',
    label='Forecasted Sales',
)
plt.axvline(
    x=monthly_sales.index[-1],
    color='g',
    linestyle='--',
    label='Forecast Start',
)
plt.title('Historical and Forecasted Monthly Sales')
plt.xlabel('Date')
plt.ylabel('Sales ($)')
plt.legend()
plt.grid(True)
plt.savefig('../sales_forecast.png')
plt.show()

# Show the forecast table
print("\n12-Month Sales Forecast:")
display(forecast)

## 7. Business Insights and Recommendations

### Key Findings:
1. **Sales Trends**: The analysis reveals [describe trend based on the plot]
2. **Seasonality**: Strong seasonal patterns are observed with [describe seasonality]
3. **Forecast Accuracy**: The model achieved [MAE/RMSE], which indicates [good/fair/poor] performance

### Business Recommendations:
1. **Inventory Management**:
   - Increase stock before [high-sales months]
   - Reduce inventory before [low-sales months]
   
2. **Staffing**:
   - Schedule additional staff during [busy periods]
   - Plan maintenance/training during [slow periods]
   
3. **Marketing Strategy**:
   - Launch promotions before [low-sales periods] to boost demand
   - Consider loyalty programs during [high-sales periods] to maintain customer engagement

4. **Financial Planning**:
   - Allocate budget for [upcoming high-sales periods]
   - Consider cash flow needs during [seasonal lows]

## 8. Next Steps

1. **Model Improvement**:
   - Try more advanced time series models (ARIMA, Prophet, LSTM)
   - Incorporate external factors (holidays, promotions, economic indicators)
   
2. **Feature Engineering**:
   - Add more lag features
   - Include product category-level forecasts
   
3. **Deployment**:
   - Create a dashboard for real-time monitoring
   - Set up automated monthly forecasts
   - Implement alerting for significant forecast deviations