In [0]:
# ============================================================
# ⚙️ Step 3: Feature Engineering — MarketMind Analytics
# ============================================================

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

print("🚀 Starting Step 3: Feature Engineering...")

# ---------- Paths ----------
# CSV read by this step, and CSV written once the features have been added.
input_path = "/Workspace/Users/abhishekgantana1@gmail.com/marketmind/outputs/sentiment_data.csv"
output_path = "/Workspace/Users/abhishekgantana1@gmail.com/marketmind/outputs/feature_engineered_data.csv"

# ---------- File Check ----------
# Stop early with an explicit message instead of letting the CSV read fail.
if os.path.exists(input_path):
    print(f"✅ Input file found: {input_path}")
else:
    raise FileNotFoundError(f"❌ Missing input file: {input_path}")

# ---------- Load Data ----------
# The whole file is loaded into a single DataFrame that every later section mutates.
df = pd.read_csv(input_path)
print(f"✅ Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")

# ---------- Handle Missing Values ----------
# Forward-fill first so each gap takes the value of the row above it, then
# back-fill to cover leading gaps that have no earlier row to copy from.
# DataFrame.ffill()/bfill() are used because fillna(method=...) is deprecated
# in pandas; the filling behavior is the same.
df.ffill(inplace=True)
df.bfill(inplace=True)
print("🧹 Missing values handled with forward/backward fill.")

# ---------- Feature Creation ----------

# 🧾 Guarantee a sales column to build on
# NOTE: the synthetic fallback draws unseeded random integers in [100, 500),
# so the generated values differ from run to run.
has_sales_column = "sales_volume" in df.columns
if not has_sales_column:
    print("⚠️ 'sales_volume' not found — generating synthetic sales data...")
    df["sales_volume"] = np.random.randint(100, 500, size=len(df))

# 🧮 Moving average of sales over a trailing window of 7 rows
# (min_periods=1 lets the first rows average over however many rows exist so far;
# the window is a 7-day one only if rows are daily and in date order — TODO confirm)
sales_window = df["sales_volume"].rolling(window=7, min_periods=1)
df["rolling_sales_avg"] = sales_window.mean()

# 😊 Share of "Positive" labels within a trailing window of 7 rows
if "sentiment_label" not in df.columns:
    print("⚠️ 'sentiment_label' not found — defaulting positive_sentiment_ratio to 0.")
    df["positive_sentiment_ratio"] = 0
else:
    # 1 where the label is exactly "Positive", 0 otherwise; the rolling mean of
    # that indicator is the fraction of positive rows in the window.
    is_positive = (df["sentiment_label"] == "Positive").astype(int)
    df["positive_sentiment_ratio"] = is_positive.rolling(window=7, min_periods=1).mean()

# 💸 Marketing efficiency: spend per unit of sales
if "marketing_spend" not in df.columns:
    print("⚠️ 'marketing_spend' not found — creating default ratio.")
    # NOTE: unseeded uniform draw in [0.5, 1.5); values differ between runs.
    df["spend_to_sales_ratio"] = np.random.uniform(0.5, 1.5, size=len(df))
else:
    # The +1 keeps the denominator non-zero when sales_volume is 0.
    sales_plus_one = df["sales_volume"] + 1
    df["spend_to_sales_ratio"] = df["marketing_spend"] / sales_plus_one

# 🧠 Interaction Feature (Sentiment × Sales)
# 'sentiment_score' is guarded like the other optional columns so a missing
# column produces a warning and a default instead of an unhandled KeyError.
if "sentiment_score" in df.columns:
    df["sentiment_sales_interaction"] = df["sentiment_score"] * df["sales_volume"]
else:
    print("⚠️ 'sentiment_score' not found — defaulting sentiment_sales_interaction to 0.")
    df["sentiment_sales_interaction"] = 0.0

print("✨ Feature engineering transformations completed successfully.")

# ---------- Correlation Heatmap ----------
# Best-effort visualization: any failure is reported and the script carries on
# to the save step. Only numeric columns take part in the correlation matrix.
try:
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    corr_matrix = df[numeric_cols].corr()

    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
        plt.title("📊 Feature Correlation with Sales Volume", fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        # `display` is not imported in this file; presumably the notebook
        # runtime provides it — confirm before running elsewhere.
        display(fig)
    finally:
        # Close the figure even when drawing or display fails, so a swallowed
        # error does not leave an open figure behind.
        plt.close(fig)
    print("✅ Feature correlation heatmap generated successfully.")
except Exception as e:
    print(f"⚠️ Could not generate heatmap: {e}")

# ---------- Save Output ----------
# Create the destination folder if needed, then write the table without the index.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"💾 Engineered data saved at: {output_path}")

# ---------- Exit Gracefully ----------
# `dbutils` is not imported in this file; presumably the Databricks notebook
# runtime injects it — confirm before running outside that environment.
dbutils.notebook.exit("✅ Step 3 completed successfully — Features Created & Correlation Heatmap Generated.")
