# Advanced Big Mart Sales Analysis

This notebook explores deeper insights and trends in the Big Mart Sales dataset, going beyond standard preprocessing to look at feature interactions, distributions, and fine-grained performance metrics.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: white grid background and a 12x6 inch figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading & Feature Engineering

In [None]:
# Load the dataset analysed in Part 1 of the notebook.
df = pd.read_csv('updated_dataset.csv')

# Derive the outlet age only when the file does not already carry it.
# 2013 is assumed to be the reference year of the data.
if 'Outlet_Age' not in df:
    df['Outlet_Age'] = df['Outlet_Establishment_Year'].rsub(2013)

df.head()

## 2. Multivariate Analysis: MRP vs Sales vs Outlet Type
**Insight Goal:** Do higher MRP items sell better in specific types of outlets (e.g., Supermarkets vs Grocery Stores)?

In [None]:
# Item MRP against outlet sales, one colour per outlet type.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    data=df,
    x='Item_MRP',
    y='Item_Outlet_Sales',
    hue='Outlet_Type',
    alpha=0.6,
    palette='viridis',
    ax=ax,
)
ax.set_title('Item MRP vs Outlet Sales colored by Outlet Type')
ax.set_xlabel('Item MRP')
ax.set_ylabel('Sales')
# Anchor the legend just outside the right edge so it does not cover points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

> **Observation:** Note the distinct clusters. Grocery Stores (encoded as Type 0 or 1 — confirm against the label mapping) typically stay at the bottom (low sales), while Supermarket Type 3 often dominates the upper sales spectrum for the same MRP.

## 3. Outlet Location vs Sales Distribution
**Insight Goal:** Is the spread of sales different across Tiers?

In [None]:
# Sales distribution per location tier.
# `hue` mirrors `x` so the palette is applied through a hue mapping: seaborn
# 0.13 deprecates passing `palette` without `hue`. `dodge=False` keeps each
# violin centred on its tick, as before.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=df,
    x='Outlet_Location_Type',
    y='Item_Outlet_Sales',
    hue='Outlet_Location_Type',
    palette='muted',
    dodge=False,
    ax=ax,
)
# A hue legend would only repeat the x tick labels; drop it if one was drawn.
tier_legend = ax.get_legend()
if tier_legend is not None:
    tier_legend.remove()
ax.set_title('Distribution of Sales across Location Tier')
plt.show()

## 4. Item Type Performance Heatmap
**Insight Goal:** Which Item Categories perform best in which Outlet Size?

In [None]:
# Mean sales for every (item type, outlet size) combination, as a matrix
pivot = pd.pivot_table(
    df,
    values='Item_Outlet_Sales',
    index='Item_Type',
    columns='Outlet_Size',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(14, 10))
# Annotate each cell with its mean, rounded to a whole number
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlGnBu', ax=ax)
ax.set_title('Average Sales Heatmap: Item Type vs Outlet Size')
plt.show()

## 5. Visibility Impact Analysis
**Insight Goal:** Does higher visibility actually correlate with higher sales?
We create bins for visibility to aggregate trends.

In [None]:
# Bucket visibility into five ordered levels.
# include_lowest=True keeps rows whose visibility is exactly 0 in 'Very Low'.
# pd.cut builds right-closed intervals by default, so the first bin would be
# (0, 0.05] and those rows would become NaN and vanish from the chart.
# Values above the last edge (0.35) still map to NaN.
visibility_edges = [0, 0.05, 0.1, 0.15, 0.2, 0.35]
visibility_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df['Visibility_Bin'] = pd.cut(
    df['Item_Visibility'],
    bins=visibility_edges,
    labels=visibility_labels,
    include_lowest=True,
)

# Mean sales per visibility level (barplot aggregates with the mean by default).
# `hue` mirrors `x` because seaborn 0.13 deprecates `palette` without `hue`;
# `dodge=False` keeps one full-width bar per level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df,
    x='Visibility_Bin',
    y='Item_Outlet_Sales',
    hue='Visibility_Bin',
    palette='rocket',
    dodge=False,
    ax=ax,
)
# A hue legend would only repeat the x tick labels; drop it if one was drawn.
bin_legend = ax.get_legend()
if bin_legend is not None:
    bin_legend.remove()
ax.set_title('Average Sales by Visibility Level')
plt.show()

## 6. Fat Content vs Item Type Interaction
**Insight Goal:** How does Fat Content affect sales within different categories (e.g., Dairy vs Meat)?

In [None]:
# Sales spread per item type, split by fat content
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(
    data=df,
    x='Item_Type',
    y='Item_Outlet_Sales',
    hue='Item_Fat_Content',
    ax=ax,
)
# Tilt the category labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Sales Distribution by Item Type & Fat Content')
ax.legend(loc='upper right')
fig.tight_layout()
plt.show()

# Part 2: Business Owner Insights (Readable Trends)
We load the original dataset to use the actual Product Names and Outlet Types for easier understanding.

In [None]:
# Original dataset; 'Total_Sales' mirrors 'Item_Outlet_Sales' under a clearer name
df_raw = pd.read_csv('bigmart.csv')
df_raw = df_raw.assign(Total_Sales=df_raw['Item_Outlet_Sales'])

## 7. What sells best where? (Year x Outlet x City)
We group the data to find the **top-selling product categories** for each store, identified by its establishment year, outlet identifier, and location tier.

In [None]:
# Keys that identify a store, and the finer keys used for the sales totals
store_keys = ['Outlet_Establishment_Year', 'Outlet_Identifier']
category_keys = store_keys + ['Outlet_Location_Type', 'Item_Type']

# Total sales per (store, location, item type), ordered so that each store's
# best-selling categories come first
grouped = (
    df_raw.groupby(category_keys)['Total_Sales']
    .sum()
    .reset_index()
    .sort_values(store_keys + ['Total_Sales'], ascending=[True, True, False])
)

# Thanks to the ordering above, the first three rows per store are its top 3
top_products = grouped.groupby(store_keys).head(3)

print("Top 3 Selling Categories per Store over the Years:")
display(
    top_products[category_keys + ['Total_Sales']]
    .style.background_gradient(cmap='Greens')
)

## 8. Product Performance: Tier 1 vs Tier 3
Do people in Tier 1 cities buy different things than people in Tier 3 cities?

In [None]:
# Number of rows per item type, split by city tier
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(
    data=df_raw,
    x='Item_Type',
    hue='Outlet_Location_Type',
    palette='Set2',
    ax=ax,
)
# Rotate and right-align the labels so each one ends under its own tick
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Count of Products Sold by City Tier')
ax.legend(title='City Tier')
plt.show()

# Part 3: API Creation for Frontend Integration
We will create a FastAPI backend that:
1. Loads the trained XGBoost model (`best_xgb_regressor_random_model.pkl`).
2. Reconstructs the label encoders (they were not saved with the model) by pairing each raw value in `bigmart.csv` with the encoded value in the same row of `updated_dataset.csv`.
3. Exposes a `/predict` endpoint.

In [None]:
%%writefile backend_api.py
import pandas as pd
import joblib
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
import os

# --- Application object ---
app = FastAPI(title="Big Mart Sales Predictor")

# --- Resource locations (relative to the working directory) ---
MODEL_PATH = "best_xgb_regressor_random_model.pkl"
RAW_CSV_PATH = "bigmart.csv"
ENCODED_CSV_PATH = "updated_dataset.csv"

# --- Load the trained model once, at import time ---
# `model` stays None when loading fails; the /predict endpoint checks for that.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
model = None
try:
    model = joblib.load(MODEL_PATH)
    print("Model loaded successfully.")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    model = None

# --- Rebuild Encoders ---
# The fitted encoders were not persisted, so the raw -> encoded mapping is
# recovered by pairing each raw value with the encoded value in the same row
# of the two datasets.
encoders = {}
cat_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


def build_category_mapping(raw_values, encoded_values):
    """Pair raw category values with their encoded counterparts, row by row.

    Args:
        raw_values: iterable of raw category values (one per row).
        encoded_values: iterable of encoded values, aligned with ``raw_values``.

    Returns:
        A ``(mapping, conflicts)`` tuple. ``mapping`` is ``{raw: encoded}``;
        when a raw value is paired with several codes, the last row wins.
        ``conflicts`` is the set of raw values seen with more than one code,
        which usually means the two inputs are not row-aligned.

    Raises:
        ValueError: if the two inputs differ in length. ``zip`` would
            otherwise truncate silently and hide the misalignment.
    """
    raw_list = list(raw_values)
    encoded_list = list(encoded_values)
    if len(raw_list) != len(encoded_list):
        raise ValueError(
            f"Row count mismatch: {len(raw_list)} raw vs {len(encoded_list)} encoded values"
        )

    mapping = {}
    conflicts = set()
    for raw, code in zip(raw_list, encoded_list):
        # Missing raw values can never match a request string; skip them
        # instead of storing NaN/None as a dictionary key.
        if pd.isna(raw):
            continue
        if raw in mapping and mapping[raw] != code:
            conflicts.add(raw)
        mapping[raw] = code
    return mapping, conflicts


try:
    df_raw = pd.read_csv(RAW_CSV_PATH)
    df_enc = pd.read_csv(ENCODED_CSV_PATH)

    # Assumption: rows of the two files correspond 1-to-1.
    rebuilt = {}
    for col in cat_cols:
        col_mapping, col_conflicts = build_category_mapping(df_raw[col], df_enc[col])
        if col_conflicts:
            print(f"Warning: {len(col_conflicts)} value(s) of '{col}' map to more than one code; "
                  "the datasets may not be row-aligned.")
        rebuilt[col] = col_mapping

    # Publish only once every column succeeded, so a failure halfway through
    # never leaves `encoders` partially populated.
    encoders.update(rebuilt)
    print("Encoders rebuilt successfully.")

except Exception as e:
    print(f"Error rebuilding encoders: {e}")

# Request body accepted by the /predict endpoint.
# String fields carry raw category values; predict_sales maps them to encoded
# values through the rebuilt `encoders` before calling the model.
# Numeric fields are passed to the model unchanged.
class ItemPredictionRequest(BaseModel):
    Item_Identifier: str  # categorical, encoded before prediction
    Item_Weight: float
    Item_Fat_Content: str  # categorical, encoded before prediction
    Item_Visibility: float
    Item_Type: str  # categorical, encoded before prediction
    Item_MRP: float
    Outlet_Identifier: str  # categorical, encoded before prediction
    Outlet_Establishment_Year: int
    Outlet_Size: str  # categorical, encoded before prediction
    Outlet_Location_Type: str  # categorical, encoded before prediction
    Outlet_Type: str  # categorical, encoded before prediction

@app.get("/")
def read_root():
    """Return a static message indicating that the API is running."""
    return {"message": "Big Mart Sales Prediction API is running"}

@app.post("/predict")
def predict_sales(item: ItemPredictionRequest):
    """Predict outlet sales for a single item.

    Categorical fields are translated to their encoded values with the
    rebuilt ``encoders``. A value that was never seen in the datasets falls
    back to code 0 (best-effort), and the fallback is logged so that suspicious
    predictions can be traced.

    Args:
        item: request payload with raw (un-encoded) item and outlet attributes.

    Returns:
        ``{"predicted_sales": <float>}``.

    Raises:
        HTTPException: 500 when the model or the encoders were not loaded at
            startup; 400 when building the input or running the model fails.
    """
    # Compare against None explicitly instead of relying on the model's truthiness.
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")

    # Missing encoders are a server-side startup problem. Without this check
    # they would surface as a KeyError reported to the client as a 400.
    missing_encoders = [col for col in cat_cols if col not in encoders]
    if missing_encoders:
        raise HTTPException(
            status_code=500,
            detail=f"Encoders not loaded for: {', '.join(missing_encoders)}",
        )

    try:
        # Prepare input vector
        data = item.dict()

        # Encode categorical fields using the rebuilt maps
        for col in cat_cols:
            if col not in data:
                continue
            raw_val = data[col]
            mapping = encoders[col]
            if raw_val not in mapping:
                # Best-effort fallback kept from the original design: code 0.
                print(f"Warning: unknown {col} value {raw_val!r}; falling back to code 0")
            data[col] = mapping.get(raw_val, 0)

        # Column order must match the order used when the model was trained
        required_cols = ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
                         'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year',
                         'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
        input_df = pd.DataFrame([data])[required_cols]

        prediction = model.predict(input_df)
        return {"predicted_sales": float(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Prediction error: {str(e)}")

# Start the server when this file is executed directly as a script.
# NOTE(review): host "0.0.0.0" listens on all network interfaces; restrict it
# if the API should only be reachable locally.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8001)
