In [None]:
# --- Cell 1 ---
# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pathlib import Path

# Configure visual style.
# set_theme() is seaborn's preferred entry point; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)  # default size for matplotlib figures

# Paths (all relative to the notebook's working directory)
DATA_PATH = Path("data/financial_sample.xlsx")  # source Excel workbook
CSV_PATH = Path("data/financial_sample.csv")  # CSV copy of the workbook
VIS_PATH = Path("visualizations")  # folder for exported interactive charts
# parents=True keeps this working if VIS_PATH is ever pointed at a nested folder;
# exist_ok=True makes re-running the cell harmless.
VIS_PATH.mkdir(parents=True, exist_ok=True)


In [None]:
# --- Cell 2 ---
# Read the Excel workbook and write it back out as CSV (row index omitted)
# so the rest of the notebook can work from the CSV file.
df_excel = pd.read_excel(io=DATA_PATH)
df_excel.to_csv(path_or_buf=CSV_PATH, index=False)
print(f"✅ File converted and saved as {CSV_PATH}")


In [None]:
# --- Cell 3 ---
# Load the data from the CSV file at CSV_PATH
df = pd.read_csv(CSV_PATH)

# Preview the first rows using the notebook's rich table rendering
display(df.head())

# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() would append a stray "None".
print("\n--- Dataset Info ---")
df.info()

# Number of missing values in each column
print("\n--- Missing Values ---")
print(df.isnull().sum())


In [None]:
# --- Cell 4 ---
# Summary statistics table produced by describe()
display(df.describe())

# Distinct-value count for every object-dtype column, computed in one pass
print("\n--- Unique values per categorical column ---")
object_cols = df.select_dtypes(include="object")
for name, n_unique in object_cols.nunique().items():
    print(f"{name}: {n_unique} unique values")


In [None]:
# --- Cell 5 ---
# Pairwise correlations, restricted to the numeric columns
corr = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix, drawn on an explicit Axes
heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# --- Cell 6 ---
# Static grouped bar chart: Profit summed per Country, one bar per Segment
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=df,
    x="Country",
    y="Profit",
    hue="Segment",
    estimator=sum,
    ax=bar_ax,
)
bar_ax.set_title("Total Profit by Segment and Country")
# Tilt the country labels so they do not collide
plt.setp(bar_ax.get_xticklabels(), rotation=45)
bar_fig.tight_layout()
plt.show()


In [None]:
# --- Cell 7 ---
# Interactive version of the same visualization.
#
# Aggregate before plotting: px.bar draws one bar per input row, and with
# barmode="group" rows that share a Country/Segment are not summed into a
# single bar. Feeding the raw rows would therefore not show the
# "Total Profit" that the title promises.
profit_by_country_segment = (
    df.groupby(["Country", "Segment"], as_index=False)["Profit"].sum()
)

# Per-row fields (Product, Discount Band, Month Name) are not passed as
# hover_data because an aggregated bar has no single value for them.
fig = px.bar(
    profit_by_country_segment,
    x="Country",
    y="Profit",
    color="Segment",
    title="Total Profit by Segment and Country (Interactive)",
)
fig.update_layout(barmode="group", template="plotly_white")

# Export a standalone HTML copy under VIS_PATH, then render inline
html_path = VIS_PATH / "profit_by_segment_country.html"
fig.write_html(html_path)
print(f"✅ Interactive chart saved to: {html_path}")

fig.show()


In [None]:
# --- Cell 8 ---
# Discount vs. Profit: points coloured by Segment, with one overall
# regression line (black) drawn on the same Axes
reg_fig, reg_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="Discount", y="Profit", hue="Segment", ax=reg_ax)
sns.regplot(
    data=df,
    x="Discount",
    y="Profit",
    scatter=False,  # only the fitted line; the points come from scatterplot above
    color="black",
    ax=reg_ax,
)
reg_ax.set_title("Relationship Between Discount and Profit")
reg_fig.tight_layout()
plt.show()


In [None]:
# --- Cell 9 ---
# Interactive Discount-vs-Profit scatter: colour by Segment, marker size
# taken from the Sales column, Product and Country shown on hover
scatter_spec = dict(
    x="Discount",
    y="Profit",
    color="Segment",
    size="Sales",
    hover_data=["Product", "Country"],
    title="Profit vs. Discount (Interactive)",
)
fig2 = px.scatter(df, **scatter_spec)
fig2.update_layout(template="plotly_white")

# Export a standalone HTML copy under VIS_PATH, then render inline
html_path2 = VIS_PATH / "profit_vs_discount.html"
fig2.write_html(html_path2)
print(f"✅ Interactive chart saved to: {html_path2}")

fig2.show()


In [None]:
# --- Cell 10 ---
# Narrative takeaways, printed one per line.
# NOTE(review): these statements are hard-coded text, not computed from df —
# confirm they still match the data whenever the dataset changes.
insight_lines = (
    "📊 Key Insights:",
    "- Profit is heavily influenced by discount rate — higher discounts lower profit.",
    "- The 'Consumer' segment generally yields higher sales but lower margins.",
    "- Certain countries (like the USA and Germany) dominate in total profit.",
    "- Product categories vary strongly by segment, suggesting targeted pricing strategy could help.",
)
print(*insight_lines, sep="\n")


In [None]:
# --- Cell 11 ---
# Optional: write the current DataFrame to a separate CSV (row index omitted).
# NOTE(review): no cleaning step on df is visible before this save — confirm
# the "cleaned" label is intended.
CLEAN_PATH = Path("data") / "financial_sample_cleaned.csv"
df.to_csv(path_or_buf=CLEAN_PATH, index=False)
print(f"✅ Cleaned dataset saved to: {CLEAN_PATH}")
