In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Global plotting defaults: seaborn's white-grid theme and a wide default canvas.
sns.set_theme(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

In [2]:
# Load the cleaned sales data from the processed folder.
DATA_PATH = '../data/processed/sales_data.csv'  # adjust path if needed
df = pd.read_csv(DATA_PATH)

# Parse Order_Date (when the column is present) so that the monthly trend
# further down can resample on a real datetime index.
if 'Order_Date' in df.columns:
    df['Order_Date'] = pd.to_datetime(df['Order_Date'])

# Profit margin = Profit / Sales.
# A row with Sales == 0 would otherwise produce +/-inf, which distorts the
# summary statistics and correlations computed later, so zero sales are masked
# to NaN before dividing and the margin for those rows is left missing.
nonzero_sales = df['Sales'].where(df['Sales'] != 0)
df['Profit_Margin'] = df['Profit'] / nonzero_sales

# Preview
print(df.head())
df.info()  # info() prints its own report and returns None, so no print() wrapper
print(df.describe())


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/sales_data.csv'

In [None]:
# Dataset Overview

# Shape
print("Dataset shape:", df.shape)

# Column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own line; passing it to print() would emit the report first
# and then a misleading "Info: None".
print("\nInfo:")
df.info()

# Missing values per column
print("\nMissing values:\n", df.isnull().sum())

# Summary statistics (left as the last expression so the notebook renders it)
df.describe()

In [None]:
# Numerical Features Analysis

# Histogram of Profit with a KDE curve overlaid
profit_values = df['Profit']
sns.histplot(profit_values, kde=True, bins=30, color='green')
plt.title('Profit Distribution')
plt.show()

# Scatter of Profit against Sales
sns.scatterplot(data=df, x='Sales', y='Profit')
plt.title('Profit vs Sales')
plt.show()

# Pairwise correlations between all numeric columns, drawn as an annotated heatmap
numeric_columns = df.select_dtypes(include='number').columns
correlations = df[numeric_columns].corr()
sns.heatmap(correlations, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 5. Categorical Analysis

# One bar chart of total Profit per group, largest first, for each categorical
# dimension. Each pair is (grouping column, bar colour for that chart).
for group_column, bar_color in (
    ('Category', 'skyblue'),
    ('Sub-Category', 'orange'),
    ('Region', 'purple'),
):
    total_profit = df.groupby(group_column)['Profit'].sum()
    ranked_profit = total_profit.sort_values(ascending=False)
    ranked_profit.plot(kind='bar', color=bar_color)
    plt.title(f'Total Profit by {group_column}')
    plt.show()

In [None]:
# Interactive Plots

# Profit distribution (interactive histogram with a marginal box plot)
fig = px.histogram(df, x='Profit', nbins=50, title='Profit Distribution', marginal='box', color_discrete_sequence=['green'])
fig.show()

# Profit vs Sales scatter, coloured by Region
fig = px.scatter(df, x='Sales', y='Profit', color='Region', hover_data=['Product', 'Category'], title='Profit vs Sales')
fig.show()

# Monthly profit trend.
# The loading cell only parses Order_Date when the column exists, so the same
# guard is applied here; without it a dataset lacking the column would raise a
# KeyError in set_index instead of simply skipping this chart.
if 'Order_Date' in df.columns:
    # NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME';
    # 'M' is kept for compatibility with older pandas — switch once the
    # minimum supported pandas version allows it.
    monthly_profit = (
        df.set_index('Order_Date')
        .resample('M')['Profit']
        .sum()
        .reset_index()
    )
    fig = px.line(monthly_profit, x='Order_Date', y='Profit', title='Monthly Profit Trend')
    fig.show()
else:
    print("Skipping monthly profit trend: 'Order_Date' column not found.")