In [None]:
# 🟢 STEP 1: Upload your CSV files manually
from google.colab import files
import pandas as pd
import io

def _first_uploaded_name(uploaded, expected):
    """Return the name of the first file in a Colab upload result.

    Args:
        uploaded: Mapping returned by ``files.upload()``; indexed below by
            filename to obtain the uploaded bytes.
        expected: Human-readable name of the file the user was asked for,
            used only in the error message.

    Raises:
        ValueError: If nothing was uploaded (e.g. the dialog was cancelled),
            instead of the opaque ``IndexError`` the raw indexing would give.
    """
    if not uploaded:
        raise ValueError(f"No file was uploaded; expected {expected}.")
    return next(iter(uploaded))


print("Upload cluster_insights_summary.csv")
uploaded1 = files.upload()
# The user may have renamed the file, so use whatever name was actually uploaded.
uploaded1_filename = _first_uploaded_name(uploaded1, "cluster_insights_summary.csv")
traits_df = pd.read_csv(io.BytesIO(uploaded1[uploaded1_filename]))

print("Upload uplift_predictions.csv")
uploaded2 = files.upload()
# Same lookup for the second upload.
uploaded2_filename = _first_uploaded_name(uploaded2, "uplift_predictions.csv")
uplift_df = pd.read_csv(io.BytesIO(uploaded2[uploaded2_filename]))

In [None]:
%pip install -q transformers accelerate bitsandbytes
%pip install tqdm
%pip install -q transformers huggingface_hub

from huggingface_hub import login

# Never store the access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [None]:
# STEP 3: Load the Mistral model from Hugging Face directly
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Hugging Face Hub identifier of the checkpoint used for text generation.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the weights in half precision and let `device_map="auto"` decide
# where to place them.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text-generation pipeline built from the model/tokenizer pair loaded above.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
# 🟢 STEP 5: Merge datasets
# Inner join on "cluster": only clusters present in both tables are kept.
merged_df = traits_df.merge(uplift_df, on="cluster", how="inner")


In [None]:
# Step 6: Load existing output
import json

print("\n📤 (Optional) Upload previous output JSON if resuming:")
uploaded_json = files.upload()

# Defaults used when nothing is uploaded or the upload cannot be used.
existing_results = []
completed_clusters = set()

if not uploaded_json:
    print("⚠️ No existing JSON found. Starting fresh.")
else:
    try:
        existing_json_path = list(uploaded_json.keys())[0]  # expects one JSON
        with open(existing_json_path, "r", encoding="utf-8") as f:
            loaded_results = json.load(f)
        if not isinstance(loaded_results, list):
            raise TypeError("expected a JSON list of result records")

        # Records whose brief is an "Error: ..." placeholder are dropped so
        # that they are not counted as completed and can be retried.
        kept_results = [
            r for r in loaded_results
            if not (
                isinstance(r, dict)
                and str(r.get("campaign_brief", "")).startswith("Error:")
            )
        ]
        kept_keys = {(r["cluster"], r["segment"]) for r in kept_results}
    except (OSError, ValueError, KeyError, TypeError) as err:
        # Catch only the failures a bad or missing file can cause (I/O,
        # invalid JSON/encoding, unexpected record shape) and report the
        # reason; interrupts and unrelated bugs still propagate.
        print(
            f"⚠️ Could not use the uploaded JSON ({type(err).__name__}: {err}). "
            "Starting fresh."
        )
    else:
        existing_results = kept_results
        completed_clusters = kept_keys
        print(f"✅ Loaded {len(existing_results)} existing summaries.")
        retry_count = len(loaded_results) - len(kept_results)
        if retry_count:
            print(f"↻ {retry_count} previously failed summaries will be retried.")


In [None]:
# Step 7: Prompt builder
def build_prompt(row):
    """Render the campaign-brief prompt for one cluster/segment row.

    Args:
        row: Mapping indexed by column name (a pandas row or a plain dict).
            It must provide ``cluster``, ``segment``, ``age``,
            ``recency_mean``, ``frequency_mean``, ``monetary_mean``,
            ``num_customers``, ``treatment`` and ``uplift_score``. Most
            numeric fields are rendered with float format specs, and
            ``num_customers`` is passed through ``int()``.

    Returns:
        The prompt text with surrounding whitespace removed.
    """
    prompt_text = f"""
You are a marketing strategist working with a leading hypermarket chain in the MENA region.
Your task is to generate a short, persuasive marketing campaign brief for a customer cluster,
based on the data provided for **Cluster {row['cluster']}**.

Customer Traits:
- Customer Cluster: {row['cluster']}
- Segment: {row['segment']}
- Average Age: {row['age']:.1f}
- Recency (avg days since last purchase): {row['recency_mean']:.1f}
- Frequency (avg monthly purchases): {row['frequency_mean']:.1f}
- Monetary (avg monthly spend): ${row['monetary_mean']:.2f}
- Number of customers: {int(row['num_customers'])}

Promotion Impact:
- Treatment strategy applied: {row['treatment']}
- Observed uplift score: {row['uplift_score']:.3f}

Write a 3-4 sentence campaign proposal including: suggested tone, promotional offer
type (discount, bundle, loyalty points, etc.), product categories to target, and
expected customer response behavior. Avoid repeating the input values verbatim and
do not mention cluster numbers.
Focus on creating a compelling narrative that aligns with the customer traits and
the uplift strategy.
Make sure to highlight how the campaign will resonate with the target audience
and drive engagement.
Do not include any introductory phrases like "Based on the data" or "The analysis shows".
Your response should be concise, engaging, and actionable.
"""
    # The template starts and ends with a newline purely for readability.
    return prompt_text.strip()


In [None]:
# 🟢 STEP 7: Generate new summaries only
from tqdm import tqdm

# Generate a brief for every (cluster, segment) pair that is not already in
# `completed_clusters`; failures are recorded as "Error: ..." briefs so one
# bad row does not abort the whole run.
new_results = []
for _, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    key = (int(row["cluster"]), row["segment"])
    if key in completed_clusters:
        continue

    # Fields shared by successful and failed records, built once.
    record = {
        "cluster": key[0],
        "segment": key[1],
        "uplift_score": row["uplift_score"],
    }
    try:
        prompt = build_prompt(row)
        outputs = pipe(
            prompt,
            max_new_tokens=256,
            # `temperature` only takes effect when sampling is enabled.
            do_sample=True,
            temperature=0.7,
            # Without this the pipeline returns the prompt followed by the
            # completion, so the whole prompt would be stored in the brief.
            return_full_text=False,
        )
        record["campaign_brief"] = outputs[0]["generated_text"].strip()
    except Exception as e:
        print(f"Error on cluster {key}: {e}")
        record["campaign_brief"] = f"Error: {e}"
    new_results.append(record)


In [None]:
# 🟢 STEP 8: Save updated results
output_filename = "campaign_summaries_gpu.json"

# Previously loaded records come first, then the ones generated in this run.
all_results = [*existing_results, *new_results]

with open(output_filename, mode="w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(all_results, out_file, ensure_ascii=False, indent=2)

new_count = len(new_results)
total_count = len(all_results)
print(f"\n✅ Generated {new_count} new summaries. Total: {total_count} saved.")
print(f"📄 Saved to: {output_filename}")


In [None]:
from google.colab import files
# Download the file written by the save cell; reusing `output_filename`
# keeps the two cells in agreement if the name is ever changed.
files.download(output_filename)
