In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [4]:
# Load the raw inventory data.
# NOTE: this is an absolute, machine-specific location; point DATA_PATH at
# your local copy of the CSV before running.
DATA_PATH = "D:/retail_store_inventory.csv"
raw_df = pd.read_csv(DATA_PATH)

# Display basic info.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
raw_df.info()
print(raw_df.describe())

# Check for missing values
print(raw_df.isnull().sum())

# Column roles used by the preprocessing below.
CATEGORICAL_COLS = ['Category', 'Region', 'Weather Condition', 'Seasonality']
# The raw date (once decomposed into Day/Month/Year), the identifier columns
# and 'Holiday/Promotion' are excluded from the feature set.
DROPPED_COLS = ['Date', 'Store ID', 'Product ID', 'Holiday/Promotion']
TARGET_COL = 'Demand Forecast'

# Build the model-ready frame in one chain so that raw_df stays untouched:
#   1. parse the date and derive calendar features,
#   2. one-hot encode the categorical columns (first level dropped),
#   3. drop the excluded columns.
df = (
    raw_df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Day=lambda frame: frame['Date'].dt.day,
        Month=lambda frame: frame['Date'].dt.month,
        Year=lambda frame: frame['Date'].dt.year,
    )
    .pipe(pd.get_dummies, columns=CATEGORICAL_COLS, drop_first=True)
    .drop(columns=DROPPED_COLS)
)

# Separate features and target
X = df.drop(columns=TARGET_COL)
y = df[TARGET_COL]

# Split data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize every feature column (the one-hot dummies included). The scaler
# is fitted on the training split only, then applied unchanged to the test
# split. These arrays are consumed by the later plotting cells, which fit the
# estimators directly rather than through a Pipeline.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                73100 non-null  object 
 1   Store ID            73100 non-null  object 
 2   Product ID          73100 non-null  object 
 3   Category            73100 non-null  object 
 4   Region              73100 non-null  object 
 5   Inventory Level     73100 non-null  int64  
 6   Units Sold          73100 non-null  int64  
 7   Units Ordered       73100 non-null  int64  
 8   Demand Forecast     73100 non-null  float64
 9   Price               73100 non-null  float64
 10  Discount            73100 non-null  int64  
 11  Weather Condition   73100 non-null  object 
 12  Holiday/Promotion   73100 non-null  int64  
 13  Competitor Pricing  73100 non-null  float64
 14  Seasonality         73100 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 8.4+

In [5]:
# Candidate regressors, keyed by the name used in reports and plots
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}


def _fit_and_score(estimator):
    """Fit `estimator` behind a StandardScaler and score it.

    The scaler + estimator pipeline is fitted on X_train / y_train, scored on
    X_test / y_test, and then cross-validated (5 folds, R2) on the training
    split.

    Returns a dict with the keys 'RMSE', 'MAE', 'R2' and 'CV R2 Mean'.
    """
    scaled_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator)
    ])
    scaled_estimator.fit(X_train, y_train)
    predictions = scaled_estimator.predict(X_test)

    # Hold-out metrics on the test split
    squared_error = mean_squared_error(y_test, predictions)
    scores = {
        'RMSE': np.sqrt(squared_error),
        'MAE': mean_absolute_error(y_test, predictions),
        'R2': r2_score(y_test, predictions),
    }

    # 5-fold cross-validated R2 on the training split
    fold_r2 = cross_val_score(scaled_estimator, X_train, y_train, cv=5, scoring='r2')
    scores['CV R2 Mean'] = np.mean(fold_r2)
    return scores


# (printed caption, key in the per-model score dict)
_REPORT_LINES = (
    ("RMSE", 'RMSE'),
    ("MAE", 'MAE'),
    ("R2 Score", 'R2'),
    ("Cross-validated R2", 'CV R2 Mean'),
)

# Evaluation metrics per model name
results = {}

for label, estimator in models.items():
    results[label] = _fit_and_score(estimator)

    print(f"\n{label} Performance:")
    for caption, key in _REPORT_LINES:
        print(f"{caption}: {results[label][key]:.2f}")

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nSummary of Model Performance:")
print(results_df)


Linear Regression Performance:
RMSE: 8.65
MAE: 7.47
R2 Score: 0.99
Cross-validated R2: 0.99

Ridge Regression Performance:
RMSE: 8.65
MAE: 7.47
R2 Score: 0.99
Cross-validated R2: 0.99

Lasso Regression Performance:
RMSE: 8.65
MAE: 7.47
R2 Score: 0.99
Cross-validated R2: 0.99

Random Forest Performance:
RMSE: 8.88
MAE: 7.61
R2 Score: 0.99
Cross-validated R2: 0.99

Gradient Boosting Performance:
RMSE: 8.70
MAE: 7.49
R2 Score: 0.99
Cross-validated R2: 0.99

Summary of Model Performance:
                       RMSE       MAE        R2  CV R2 Mean
Linear Regression  8.648626  7.471905  0.993725    0.993709
Ridge Regression   8.648661  7.471935  0.993725    0.993709
Lasso Regression   8.646598  7.470728  0.993728    0.993711
Random Forest      8.882036  7.605630  0.993382    0.993336
Gradient Boosting  8.704699  7.494951  0.993644    0.993608


In [None]:
# Plot model performance metrics: one bar chart per metric on a 2x2 grid.
# Each entry is (column in results_df, panel title, y-axis label).
metric_panels = [
    ('RMSE', 'RMSE Comparison', 'Root Mean Squared Error'),
    ('MAE', 'MAE Comparison', 'Mean Absolute Error'),
    ('R2', 'R2 Score Comparison', 'R-squared'),
    ('CV R2 Mean', 'Cross-validated R2 Comparison', 'CV R2 Mean'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, (column, title, ylabel) in zip(axes.flat, metric_panels):
    sns.barplot(x=results_df.index, y=results_df[column], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

# Plot actual vs predicted for the best performing model.
# NOTE(review): "best" is chosen by R2 on the test split; selecting on
# 'CV R2 Mean' instead would keep the test split out of model selection.
best_model_name = results_df['R2'].idxmax()
best_model = models[best_model_name]
best_model.fit(X_train_scaled, y_train)
y_pred_best = best_model.predict(X_test_scaled)

# The identity line spans the plotted points (test actuals and predictions)
# so it always covers the scatter, rather than the range of the full target.
line_min = min(y_test.min(), y_pred_best.min())
line_max = max(y_test.max(), y_pred_best.max())

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_best, alpha=0.5)
ax.plot([line_min, line_max], [line_min, line_max], 'k--', lw=2)
ax.set_xlabel('Actual Demand Forecast')
ax.set_ylabel('Predicted Demand Forecast')
ax.set_title(f'Actual vs Predicted Demand Forecast ({best_model_name})')
plt.show()

# Feature ranking for the best model.
# Tree ensembles expose feature_importances_. Linear models expose coef_
# instead; because the model was fitted on standardized features, the
# absolute coefficients are on a comparable scale and serve as the ranking.
# Without this fallback nothing is plotted when a linear model wins.
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
    importance_title = 'Top 15 Important Features'
elif hasattr(best_model, 'coef_'):
    importance_values = np.abs(np.ravel(best_model.coef_))
    importance_title = 'Top 15 Features by Absolute Coefficient'
else:
    importance_values = None

if importance_values is not None:
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance_values
    }).sort_values('Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15), ax=ax)
    ax.set_title(importance_title)
    plt.show()

In [None]:
# Overlay each model's predictions on the test split against the actual values
fig, ax = plt.subplots(figsize=(12, 8))

# Order the test targets ascending so every curve is drawn left to right
actual_values = y_test.values
ascending_order = np.argsort(actual_values)
actual_sorted = actual_values[ascending_order]

# Reference line: a perfect model would land exactly here
ax.plot(actual_sorted, actual_sorted, 'k-', label='Perfect Prediction', linewidth=2)

# One dashed curve per model, refitted on the pre-scaled training data
for label, estimator in models.items():
    estimator.fit(X_train_scaled, y_train)
    predicted = estimator.predict(X_test_scaled)
    ax.plot(actual_sorted, predicted[ascending_order], '--', label=label, alpha=0.7)

ax.set_xlabel('Actual Demand Forecast')
ax.set_ylabel('Predicted Demand Forecast')
ax.set_title('Comparison of Regression Models')
ax.legend()
ax.grid(True)
plt.show()