# 📊 Exploratory Data Analysis (EDA) on Global Superstore Dataset
**Objective**: Perform EDA on the Global Superstore data to identify trends, patterns, anomalies, and the factors that influence sales and profit performance.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display defaults: place no limit on the number of columns
# pandas renders, and use seaborn's 'whitegrid' style for every plot.
pd.options.display.max_columns = None
sns.set(style='whitegrid')

## Step 1: Load Dataset

In [None]:
# The workbook is opened through a relative path, so Global_Superstore.xlsx
# has to be in the same folder the notebook runs from.
file_path = 'Global_Superstore.xlsx'
df = pd.read_excel(io=file_path)
# Preview the first five rows as a quick check of the load.
df.head(n=5)

## Step 2: Data Overview

In [None]:
# Report the (rows, columns) size of the raw table before any cleaning step,
# then print the per-column summary produced by DataFrame.info().
initial_shape = df.shape
print("Initial shape:", initial_shape)
df.info()

## Step 3: Handle Missing Values

In [None]:
# Fill missing postal codes with the column median. The filled column is
# assigned back to df instead of calling fillna(inplace=True) on the
# df['Postal Code'] selection: that chained in-place call operates on an
# intermediate object, emits a FutureWarning on recent pandas, and leaves df
# unchanged once Copy-on-Write is active.
# NOTE(review): a median of postal codes is only a placeholder value --
# confirm that this is the intended imputation for the column.
df['Postal Code'] = df['Postal Code'].fillna(df['Postal Code'].median())

# Rows lacking either Sales or Profit are removed from df in place.
df.dropna(subset=['Sales', 'Profit'], inplace=True)

## Step 4: Remove Duplicates

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence of each; df is modified in place.
df.drop_duplicates(keep='first', inplace=True)
print("Duplicates removed.")

## Step 5: Handle Outliers using IQR

In [None]:
def remove_outliers_iqr(data, column, *, factor=1.5):
    """Return the rows of *data* whose *column* value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Name of the numeric column used to compute the fences.
        factor: Multiplier applied to the IQR. The default of 1.5 reproduces
            the previously hard-coded behaviour; larger values keep more rows.

    Returns:
        A DataFrame containing only the rows inside the fences, with the
        original index labels preserved. Rows whose *column* value is NaN are
        excluded, because NaN never compares as being inside the interval.
    """
    q1, q3 = data[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    # between() is inclusive on both ends by default, matching `>=` / `<=`.
    within_fences = data[column].between(lower, upper)
    return data[within_fences]

# Filter sequentially: the Profit fences are computed on the rows that
# already survived the Sales filter, so the column order matters.
for outlier_column in ('Sales', 'Profit'):
    df = remove_outliers_iqr(df, outlier_column)

## Step 6: Descriptive Statistics

In [None]:
# describe() summary for the four measure columns of the cleaned data.
summary_columns = ['Sales', 'Profit', 'Discount', 'Quantity']
df[summary_columns].describe()

## Step 7: Correlation Matrix

In [None]:
# Pairwise correlation (pandas' default method, Pearson) between the four
# measure columns; `correlation` is reused later by the heatmap cell.
measure_columns = ['Sales', 'Profit', 'Discount', 'Quantity']
correlation = df[measure_columns].corr(method='pearson')
correlation

## Step 8: Visualizations
### Histogram - Sales

In [None]:
# Histogram of the Sales column in 50 bins, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
df['Sales'].hist(bins=50, ax=ax)
ax.set_title("Sales Distribution")
ax.set_xlabel("Sales")
ax.set_ylabel("Frequency")
plt.show()

### Boxplot - Profit

In [None]:
# Vertical boxplot of the Profit column, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, y='Profit', ax=ax)
ax.set_title("Boxplot of Profit")
plt.show()

### Heatmap - Correlation Matrix

In [None]:
# Annotated heatmap of the `correlation` matrix computed in Step 7.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

### Sales by Region

In [None]:
# Bar chart of total Sales per Region (estimator=sum adds up the rows of each
# region). Error bars are meaningless for a sum, so they are switched off.
# `errorbar=None` replaces the deprecated `ci=None` spelling; it requires
# seaborn >= 0.12.
plt.figure(figsize=(8,5))
sns.barplot(data=df, x='Region', y='Sales', estimator=sum, errorbar=None)
plt.title("Total Sales by Region")
# Rotate the region names so the tick labels do not overlap.
plt.xticks(rotation=45)
plt.ylabel("Total Sales")
plt.show()

### Sales by Category

In [None]:
# Bar chart of total Sales per Category (estimator=sum adds up the rows of
# each category). Error bars are meaningless for a sum, so they are switched
# off. `errorbar=None` replaces the deprecated `ci=None` spelling; it requires
# seaborn >= 0.12.
plt.figure(figsize=(8,5))
sns.barplot(data=df, x='Category', y='Sales', estimator=sum, errorbar=None)
plt.title("Total Sales by Category")
plt.ylabel("Total Sales")
plt.show()

## Step 9: Save Cleaned Dataset

In [None]:
# Write the cleaned table next to the notebook; index=False leaves the
# DataFrame index out of the CSV.
cleaned_csv_path = 'Cleaned_Global_Superstore.csv'
df.to_csv(path_or_buf=cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'Cleaned_Global_Superstore.csv'")