In [6]:
!pip install pandas numpy matplotlib seaborn plotly gradio scikit-learn



In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

class SupermartAnalytics:
    """Sales analytics and sales-prediction helper for a grocery sales CSV.

    The instance keeps the loaded DataFrame (``df``) together with the fitted
    model artefacts (``model``, ``scaler``, ``label_encoders`` and
    ``feature_columns``). Every ``create_*`` method returns a Plotly figure
    followed by one or more summary DataFrames. When no data is loaded, each
    method returns a tuple of ``None`` values with the same length as its
    successful return, so callers can always unpack the result.
    """

    # Columns fed to the model; categorical ones are label-encoded first.
    _CATEGORICAL_FEATURES = ['Category', 'Sub Category', 'City', 'Region', 'State']
    _NUMERICAL_FEATURES = ['Month_Num', 'Year', 'Discount', 'Weekday']

    def __init__(self):
        self.df = None
        self.model = None
        self.scaler = None
        self.label_encoders = {}
        self.feature_columns = []

    @staticmethod
    def _summarize(frame, keys, named_aggs, sort=False):
        """Group ``frame`` by ``keys`` and apply pandas named aggregations.

        ``named_aggs`` maps output column name -> ``(source column, function)``.
        With ``sort=False`` the groups keep their order of first appearance.
        """
        return frame.groupby(keys, sort=sort).agg(**named_aggs).reset_index()

    def load_data(self, file_path):
        """Load the CSV at ``file_path`` and derive date-part columns.

        Rows with any missing value (including unparseable order dates) and
        duplicate rows are dropped. The instance is only updated when the
        whole load succeeds; a failed load leaves previously loaded data
        untouched. A successful load discards any previously trained model
        because it was fitted on different data.

        Returns:
            A status message string (success with the final shape, or the error).
        """
        try:
            df = pd.read_csv(file_path)

            # Data preprocessing
            order_date = pd.to_datetime(df['Order Date'], errors='coerce')
            df['Order Date'] = order_date
            df['Day'] = order_date.dt.day
            df['Month_Num'] = order_date.dt.month
            df['Year'] = order_date.dt.year
            df['Month_Name'] = order_date.dt.strftime('%B')
            df['Weekday'] = order_date.dt.dayofweek

            # Remove rows with missing values, then duplicates
            df = df.dropna().drop_duplicates()

            # Unparseable dates turn the date parts into floats (NaN needs a
            # float dtype); once those rows are gone, restore integers so
            # labels read "2017-01" rather than "2017.0-01".
            df = df.astype({col: int for col in ('Day', 'Month_Num', 'Year', 'Weekday')})
        except Exception as e:
            return f"❌ Error loading data: {str(e)}"

        self.df = df
        # Model artefacts fitted on earlier data no longer apply.
        self.model = None
        self.scaler = None
        self.label_encoders = {}
        self.feature_columns = []

        return f"✅ Data loaded successfully! Shape: {self.df.shape}"

    def get_data_overview(self):
        """Build a textual summary plus column-info, statistics and sample tables.

        Returns:
            ``(summary_text, dtypes_df, numeric_stats_df, sample_df)``; when no
            data is loaded, a warning string followed by three ``None`` values.
        """
        if self.df is None:
            return "⚠️ Please load data first", None, None, None

        # An empty frame has no min/max date (NaT cannot be formatted).
        order_dates = self.df['Order Date']
        if order_dates.notna().any():
            date_range = f"{order_dates.min().strftime('%Y-%m-%d')} to {order_dates.max().strftime('%Y-%m-%d')}"
        else:
            date_range = "N/A"

        # Basic info
        basic_info = f"""
        ## 📊 Dataset Overview

        **Shape:** {self.df.shape[0]} rows × {self.df.shape[1]} columns
        **Date Range:** {date_range}
        """

        # Data types and info table
        dtypes_df = pd.DataFrame([
            {
                'Column': col,
                'Data Type': str(self.df[col].dtype),
                'Non-Null Count': self.df[col].count(),
                'Null Count': self.df[col].isnull().sum(),
                'Unique Values': self.df[col].nunique(),
            }
            for col in self.df.columns
        ])

        # Basic statistics for numerical columns
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            numeric_stats = (
                self.df[numeric_cols].describe().round(2).T
                .reset_index()
                .rename(columns={'index': 'Column'})
            )
        else:
            numeric_stats = pd.DataFrame({'Message': ['No numeric columns found']})

        # Sample data
        sample_data = self.df.head(10)

        return basic_info, dtypes_df, numeric_stats, sample_data

    def create_category_analysis(self):
        """Summarise sales, profit and discount per category.

        Returns:
            ``(figure, category_df)`` sorted by total sales (descending), or
            ``(None, None)`` when no data is loaded.
        """
        if self.df is None:
            return None, None

        category_df = self._summarize(self.df, 'Category', {
            'Total Sales': ('Sales', 'sum'),
            'Average Sales': ('Sales', 'mean'),
            'Order Count': ('Sales', 'size'),
            'Total Profit': ('Profit', 'sum'),
            'Average Discount': ('Discount', 'mean'),
        })
        category_df = category_df.sort_values('Total Sales', ascending=False)

        # Share of total sales, computed before the totals are rounded
        category_df['Sales Percentage'] = (
            category_df['Total Sales'] / category_df['Total Sales'].sum() * 100
        ).round(2)

        # Discount is stored as a fraction; present it as a percentage
        category_df['Average Discount'] = category_df['Average Discount'] * 100
        category_df = category_df.round({
            'Total Sales': 2,
            'Average Sales': 2,
            'Total Profit': 2,
            'Average Discount': 2,
        })

        # Create visualization
        fig = px.bar(
            category_df,
            x='Category',
            y='Total Sales',
            title='Sales Performance by Category',
            labels={'Total Sales': 'Total Sales (₹)', 'Category': 'Product Category'},
            color='Total Sales',
            color_continuous_scale='viridis'
        )
        fig.update_layout(xaxis_tickangle=-45, height=500)

        return fig, category_df

    def create_temporal_analysis(self):
        """Plot monthly sales over time and summarise performance per year.

        Returns:
            ``(figure, yearly_df)``, or ``(None, None)`` when no data is loaded.
        """
        if self.df is None:
            return None, None

        # Monthly sales trend (only months that have orders), chronological
        monthly_df = self._summarize(self.df, ['Year', 'Month_Num'], {
            'Sales': ('Sales', 'sum'),
            'Orders': ('Sales', 'size'),
            'Profit': ('Profit', 'sum'),
        }, sort=True)
        # int() keeps the label well-formed even if the parts are floats
        monthly_df['Period'] = [
            f"{int(year)}-{int(month):02d}"
            for year, month in zip(monthly_df['Year'], monthly_df['Month_Num'])
        ]

        # Create time series plot
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=monthly_df['Period'],
            y=monthly_df['Sales'],
            mode='lines+markers',
            name='Monthly Sales',
            line=dict(color='blue', width=2)
        ))

        fig.update_layout(
            title='Sales Trend Over Time',
            xaxis_title='Time Period',
            yaxis_title='Sales Amount (₹)',
            xaxis_tickangle=-45,
            height=500
        )

        # Year-wise summary
        yearly_df = self._summarize(self.df, 'Year', {
            'Total Sales': ('Sales', 'sum'),
            'Total Orders': ('Sales', 'size'),
            'Average Order Value': ('Sales', 'mean'),
            'Total Profit': ('Profit', 'sum'),
        }, sort=True)
        yearly_df = yearly_df.round({
            'Total Sales': 2,
            'Average Order Value': 2,
            'Total Profit': 2,
        })

        return fig, yearly_df

    def create_regional_analysis(self):
        """Summarise sales per region and for the 15 best-selling cities.

        Returns:
            ``(figure, regional_df, city_df)``, or ``(None, None, None)`` when
            no data is loaded.
        """
        if self.df is None:
            # Three values, matching the successful return.
            return None, None, None

        # Regional analysis
        regional_df = self._summarize(self.df, 'Region', {
            'Total Sales': ('Sales', 'sum'),
            'Average Sales': ('Sales', 'mean'),
            'Order Count': ('Sales', 'size'),
            'Total Profit': ('Profit', 'sum'),
            'Unique Cities': ('City', 'nunique'),
        })
        regional_df = regional_df.sort_values('Total Sales', ascending=False)
        regional_df = regional_df.round({
            'Total Sales': 2,
            'Average Sales': 2,
            'Total Profit': 2,
        })

        # Top cities analysis; Region/State are taken from each city's first row
        city_df = self._summarize(self.df, 'City', {
            'Region': ('Region', 'first'),
            'State': ('State', 'first'),
            'Total Sales': ('Sales', 'sum'),
            'Order Count': ('Sales', 'size'),
            'Average Sales': ('Sales', 'mean'),
        })
        city_df = city_df.sort_values('Total Sales', ascending=False).head(15)
        city_df = city_df.round({'Total Sales': 2, 'Average Sales': 2})

        # Create visualization
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=('Sales by Region', 'Top 15 Cities by Sales'),
            specs=[[{"type": "pie"}, {"type": "bar"}]]
        )

        # Pie chart for regions
        fig.add_trace(
            go.Pie(labels=regional_df['Region'], values=regional_df['Total Sales'], name="Regional Sales"),
            row=1, col=1
        )

        # Bar chart for top cities
        fig.add_trace(
            go.Bar(x=city_df['City'], y=city_df['Total Sales'], name="City Sales"),
            row=1, col=2
        )

        fig.update_layout(title_text="Regional and City-wise Sales Analysis", height=500)
        fig.update_xaxes(tickangle=-45, row=1, col=2)

        return fig, regional_df, city_df

    def create_profit_analysis(self):
        """Analyse profit margins per category and against discount.

        The profit margin is computed on a working copy, so the loaded data is
        not modified. Rows with zero sales get an undefined (NaN) margin
        instead of an infinite one and are skipped by the averages.

        Returns:
            ``(figure, profit_df, stats_df)``, or ``(None, None, None)`` when
            no data is loaded.
        """
        if self.df is None:
            # Three values, matching the successful return.
            return None, None, None

        # Calculate profit margin (percentage of sales)
        sales = self.df['Sales']
        df = self.df.assign(
            Profit_Margin=self.df['Profit'] / sales.where(sales != 0) * 100
        )

        # Category-wise profit analysis
        profit_df = self._summarize(df, 'Category', {
            'Total Profit': ('Profit', 'sum'),
            'Average Profit Margin': ('Profit_Margin', 'mean'),
            'Average Discount': ('Discount', 'mean'),
            'Total Sales': ('Sales', 'sum'),
            'Orders': ('Sales', 'size'),
        })
        profit_df = profit_df.sort_values('Total Profit', ascending=False)

        # Discount is stored as a fraction; present it as a percentage
        profit_df['Average Discount'] = profit_df['Average Discount'] * 100
        profit_df = profit_df.round({
            'Total Profit': 2,
            'Average Profit Margin': 2,
            'Average Discount': 2,
            'Total Sales': 2,
        })

        # Create scatter plot
        fig = px.scatter(
            df,
            x='Discount',
            y='Profit_Margin',
            size='Sales',
            color='Category',
            title='Discount vs Profit Margin Analysis',
            labels={'Discount': 'Discount Rate', 'Profit_Margin': 'Profit Margin (%)'},
            height=500
        )

        # Overall statistics
        stats_df = pd.DataFrame({
            'Metric': ['Total Profit', 'Average Profit Margin', 'Average Discount', 'Total Sales', 'Total Orders'],
            'Value': [
                f"₹{df['Profit'].sum():.2f}",
                f"{df['Profit_Margin'].mean():.2f}%",
                f"{df['Discount'].mean():.2%}",
                f"₹{df['Sales'].sum():.2f}",
                f"{len(df):,}"
            ]
        })

        return fig, profit_df, stats_df

    def build_prediction_model(self):
        """Train a random-forest regressor that predicts ``Sales``.

        Categorical features are label-encoded and all features are
        standardised before training on an 80/20 train/test split. The fitted
        artefacts are stored on the instance only after training and
        evaluation succeed, so a failure never leaves a half-updated model.

        Returns:
            ``(status_message, performance_df, feature_importance_df)``; the
            two DataFrames are ``None`` when no data is loaded or training fails.
        """
        if self.df is None:
            # Three values, matching the successful return.
            return "⚠️ Please load data first", None, None

        try:
            # Prepare features for modeling
            df_model = self.df.copy()

            # Encode categorical variables
            encoders = {}
            for feature in self._CATEGORICAL_FEATURES:
                encoder = LabelEncoder()
                df_model[feature + '_encoded'] = encoder.fit_transform(df_model[feature])
                encoders[feature] = encoder

            # Prepare feature matrix
            feature_cols = (
                [f + '_encoded' for f in self._CATEGORICAL_FEATURES]
                + list(self._NUMERICAL_FEATURES)
            )
            X = df_model[feature_cols]
            y = df_model['Sales']

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale features (fit on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train Random Forest model
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)

            # Evaluate on the held-out split
            y_pred = model.predict(X_test_scaled)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Model performance summary
            performance_df = pd.DataFrame({
                'Metric': ['R² Score', 'Root Mean Squared Error', 'Mean Absolute Error', 'Training Samples', 'Test Samples'],
                'Value': [f"{r2:.4f}", f"{rmse:.2f}", f"{mae:.2f}", f"{len(X_train):,}", f"{len(X_test):,}"]
            })

            # Feature importance
            feature_importance_df = pd.DataFrame([
                {
                    'Feature': feature,
                    'Importance': importance,
                    'Importance_Percentage': f"{importance*100:.2f}%",
                }
                for feature, importance in zip(feature_cols, model.feature_importances_)
            ]).sort_values('Importance', ascending=False)

        except Exception as e:
            return f"❌ Error building model: {str(e)}", None, None

        # Commit the fitted artefacts together.
        self.label_encoders = encoders
        self.feature_columns = feature_cols
        self.scaler = scaler
        self.model = model

        return "✅ Model Training Complete!", performance_df, feature_importance_df

    def predict_sales(self, category, sub_category, city, region, state, month, year, discount):
        """Predict the sales amount for a single hypothetical order.

        A categorical value that the encoders did not see during training is
        encoded as 0, and the result message lists those values so the caller
        knows the prediction rests on a fallback. The weekday is not an input
        and defaults to 0.

        Returns:
            A message string with the prediction, a warning when no model has
            been built, or the error text if prediction fails.
        """
        if self.model is None:
            return "⚠️ Please build the model first"

        try:
            # Create input dataframe
            input_data = pd.DataFrame({
                'Category': [category],
                'Sub Category': [sub_category],
                'City': [city],
                'Region': [region],
                'State': [state],
                'Month_Num': [month],
                'Year': [year],
                'Discount': [discount],
                'Weekday': [0]  # Default weekday
            })

            # Encode categorical variables
            unseen = []
            for feature in self._CATEGORICAL_FEATURES:
                encoder = self.label_encoders.get(feature)
                if encoder is None:
                    continue
                try:
                    input_data[feature + '_encoded'] = encoder.transform(input_data[feature])
                except ValueError:
                    # Raised for labels absent from the training data.
                    input_data[feature + '_encoded'] = 0
                    unseen.append(f"{feature}='{input_data[feature].iloc[0]}'")

            # Prepare features
            X_input = input_data[self.feature_columns]
            X_input_scaled = self.scaler.transform(X_input)

            # Make prediction
            prediction = self.model.predict(X_input_scaled)[0]

            message = f"🔮 **Predicted Sales: ₹{prediction:.2f}**"
            if unseen:
                message += (
                    "\n⚠️ Not seen during training (default encoding used): "
                    + ", ".join(unseen)
                )
            return message

        except Exception as e:
            return f"❌ Error making prediction: {str(e)}"

# Module-level SupermartAnalytics instance used by the Gradio interface
# functions defined below.
analytics = SupermartAnalytics()

# Gradio interface functions
def load_data_interface(file):
    """Load the uploaded CSV into ``analytics`` and return its status message.

    Returns a warning string when no file was uploaded.
    """
    if file is not None:
        return analytics.load_data(file.name)
    return "⚠️ Please upload a CSV file"

def get_overview():
    """Return the summary text and the three overview tables from ``analytics``."""
    overview = analytics.get_data_overview()
    summary, column_info, statistics, sample = overview
    return summary, column_info, statistics, sample

def category_analysis():
    """Return the category sales figure and its summary table from ``analytics``."""
    result = analytics.create_category_analysis()
    figure, table = result
    return figure, table

def temporal_analysis():
    """Return the sales-trend figure and the yearly summary table from ``analytics``."""
    result = analytics.create_temporal_analysis()
    figure, yearly_table = result
    return figure, yearly_table

def regional_analysis():
    """Return the regional figure plus the region and city tables from ``analytics``."""
    result = analytics.create_regional_analysis()
    figure, regions, cities = result
    return figure, regions, cities

def profit_analysis():
    """Return the profit figure, the per-category table and the overall statistics.

    The order of the returned values is: figure, category profit table,
    overall statistics table.
    """
    result = analytics.create_profit_analysis()
    figure, by_category, overall = result
    return figure, by_category, overall

def build_model():
    """Train the model via ``analytics`` and return status, metrics and importances."""
    outcome = analytics.build_prediction_model()
    status, metrics, importances = outcome
    return status, metrics, importances

def make_prediction(category, sub_category, city, region, state, month, year, discount):
    """Forward the prediction form values to ``analytics`` and return its message."""
    message = analytics.predict_sales(
        category,
        sub_category,
        city,
        region,
        state,
        month,
        year,
        discount,
    )
    return message

# Create Gradio interface
# Dashboard layout: one tab per analysis. Each tab's button calls the matching
# interface function; the function's return values are assigned to the
# ``outputs`` components positionally, so the list order must mirror the
# function's return order.
with gr.Blocks(title="Supermart Grocery Sales Analytics", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🛒 Supermart Grocery Sales Analytics Dashboard

    **Data Science Internship Project**

    This interactive dashboard provides comprehensive analysis of grocery sales data including:
    - Data exploration and visualization
    - Sales performance analysis
    - Predictive modeling
    - Interactive predictions

    **Dataset:** [Download from Kaggle](https://www.kaggle.com/datasets/mohanavamsi/supermart-grocery-sales-retail-analytics-dataset)
    """)

    with gr.Tab("📁 Data Loading"):
        gr.Markdown("### Upload your CSV file to get started")
        file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
        load_btn = gr.Button("Load Data", variant="primary")
        load_output = gr.Textbox(label="Status", lines=3)

        load_btn.click(load_data_interface, inputs=[file_input], outputs=[load_output])

    with gr.Tab("📊 Data Overview"):
        overview_btn = gr.Button("Generate Overview", variant="primary")

        with gr.Row():
            overview_text = gr.Textbox(label="Dataset Summary", lines=10)

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Column Information")
                dtypes_table = gr.Dataframe(label="Data Types & Info")
            with gr.Column():
                gr.Markdown("### Numerical Statistics")
                stats_table = gr.Dataframe(label="Statistical Summary")

        gr.Markdown("### Sample Data (First 10 rows)")
        sample_table = gr.Dataframe(label="Sample Data")

        overview_btn.click(get_overview, outputs=[overview_text, dtypes_table, stats_table, sample_table])

    with gr.Tab("📈 Category Analysis"):
        category_btn = gr.Button("Analyze Categories", variant="primary")

        with gr.Row():
            category_plot = gr.Plot(label="Category Sales Visualization")

        with gr.Row():
            gr.Markdown("### Category Performance Summary")
            category_table = gr.Dataframe(label="Category Analysis")

        category_btn.click(category_analysis, outputs=[category_plot, category_table])

    with gr.Tab("⏱️ Time Series Analysis"):
        temporal_btn = gr.Button("Analyze Temporal Trends", variant="primary")

        with gr.Row():
            temporal_plot = gr.Plot(label="Temporal Analysis")

        with gr.Row():
            gr.Markdown("### Yearly Performance Summary")
            yearly_table = gr.Dataframe(label="Year-wise Analysis")

        temporal_btn.click(temporal_analysis, outputs=[temporal_plot, yearly_table])

    with gr.Tab("🗺️ Regional Analysis"):
        regional_btn = gr.Button("Analyze Regional Performance", variant="primary")

        with gr.Row():
            regional_plot = gr.Plot(label="Regional Analysis")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Regional Summary")
                regional_table = gr.Dataframe(label="Regional Performance")
            with gr.Column():
                gr.Markdown("### Top Cities")
                city_table = gr.Dataframe(label="City Performance")

        regional_btn.click(regional_analysis, outputs=[regional_plot, regional_table, city_table])

    with gr.Tab("💰 Profit Analysis"):
        profit_btn = gr.Button("Analyze Profit Margins", variant="primary")

        with gr.Row():
            profit_plot = gr.Plot(label="Profit Analysis")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Overall Statistics")
                # Distinct name: the overview tab already owns ``stats_table``.
                profit_stats_table = gr.Dataframe(label="Summary Statistics")
            with gr.Column():
                gr.Markdown("### Category-wise Profit")
                profit_table = gr.Dataframe(label="Profit by Category")

        # profit_analysis returns (figure, category table, overall statistics),
        # so the outputs are listed in that order rather than in layout order.
        profit_btn.click(profit_analysis, outputs=[profit_plot, profit_table, profit_stats_table])

    with gr.Tab("🤖 ML Model"):
        model_btn = gr.Button("Build Prediction Model", variant="primary")

        with gr.Row():
            model_status = gr.Textbox(label="Model Status", lines=3)

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Model Performance")
                performance_table = gr.Dataframe(label="Performance Metrics")
            with gr.Column():
                gr.Markdown("### Feature Importance")
                feature_table = gr.Dataframe(label="Feature Importance")

        model_btn.click(build_model, outputs=[model_status, performance_table, feature_table])

    with gr.Tab("🔮 Sales Prediction"):
        gr.Markdown("### Make Sales Predictions")

        with gr.Row():
            with gr.Column():
                pred_category = gr.Dropdown(
                    choices=["Food Grains", "Beverages", "Snacks & Branded Foods", "Fruits & Veggies",
                            "Egg, Meat & Fish", "Oil & Masala", "Cleaning & Household", "Dairy", "Gourmet & World Food"],
                    label="Category", value="Food Grains"
                )
                pred_sub_category = gr.Textbox(label="Sub Category", value="Atta & Flour")
                pred_city = gr.Textbox(label="City", value="Chennai")
                pred_region = gr.Dropdown(choices=["North", "South", "East", "West"], label="Region", value="South")
                pred_state = gr.Textbox(label="State", value="Tamil Nadu")

            with gr.Column():
                pred_month = gr.Slider(minimum=1, maximum=12, step=1, label="Month", value=6)
                pred_year = gr.Slider(minimum=2016, maximum=2025, step=1, label="Year", value=2024)
                pred_discount = gr.Slider(minimum=0, maximum=0.5, step=0.01, label="Discount Rate", value=0.1)

        predict_btn = gr.Button("Predict Sales", variant="primary")
        prediction_output = gr.Textbox(label="Prediction Result", lines=3)

        # Input order matches make_prediction's parameter order.
        predict_btn.click(
            make_prediction,
            inputs=[pred_category, pred_sub_category, pred_city, pred_region, pred_state,
                   pred_month, pred_year, pred_discount],
            outputs=[prediction_output]
        )

    gr.Markdown("""
    ---
    **Instructions:**
    1. Upload your CSV file in the "Data Loading" tab
    2. Explore data insights through various analysis tabs
    3. Build the ML model in the "ML Model" tab
    4. Make predictions in the "Sales Prediction" tab

    **Note:** This is a comprehensive data science project for internship purposes.
    """)

# Launch the app when this file is run as the main module.
# share=True makes the app reachable through a public URL, and debug=True
# keeps the call running so errors and logs stay visible (per the launch
# output captured below); set debug=False to turn that off.
if __name__ == "__main__":
    demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2955df7d1ef9826355.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2955df7d1ef9826355.gradio.live
