<a href="https://colab.research.google.com/github/DATAGEEKN/Customer-Segmentation-in-Python-and-AI/blob/main/Customer_Segmentation_in_Python_and_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Google AI library
!pip install -q google-generativeai

# Import necessary libraries
import pandas as pd
import numpy as np
import google.generativeai as genai
import json
import time
from tqdm.auto import tqdm

# For Colab-specific features
from google.colab import userdata
from google.colab import files
import io

# For ML/Clustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To display dataframes nicely side-by-side
from IPython.display import display, HTML

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the Gemini key from Colab's secret store and hand it to the SDK.
# Any failure (missing secret, no notebook access, ...) is reported, not raised.
try:
    api_key = userdata.get('GEMINI_API_KEY')
    genai.configure(api_key=api_key)
except Exception as config_error:
    print(f"❌ Could not configure Gemini API. Please check your secret settings. Error: {config_error}")
else:
    print("✅ Gemini API configured successfully!")

In [None]:
# --- DIAGNOSTIC CELL: Check for available columns ---

import pandas as pd

# Location of the enriched retail CSV whose header we want to inspect
url = 'https://raw.githubusercontent.com/mdoganozgun/product-enrichment-agent/refs/heads/main/data/enriched_retail.csv'

try:
    df_check = pd.read_csv(url)
except Exception as load_error:
    print(f"❌ Could not load file for diagnosis. Error: {load_error}")
else:
    # Show the column names as a plain Python list
    print("✅ File loaded for inspection. The available columns are:")
    print(df_check.columns.tolist())

In [None]:
# CELL 3 - UPDATED to use all available rich columns

import pandas as pd

# URL for the single, pre-enriched retail dataset
url = 'https://raw.githubusercontent.com/mdoganozgun/product-enrichment-agent/refs/heads/main/data/enriched_retail.csv'

print(f"⬇️ Loading the fully enriched dataset from:\n{url}")

try:
    # Work on local names and bind `df_enriched` only once every cleaning step
    # has succeeded, so a failure midway never leaves a half-cleaned frame
    # behind for later cells to pick up.
    retail_raw = pd.read_csv(url)

    # --- Data Cleaning ---
    # Every column the downstream profile-building cell relies on.
    required_columns = [
        'Description', 'CustomerID', 'category', 'usage_context',
        'price_segment', 'material_type', 'target_gender', 'target_age_group', 'tags'
    ]

    # Report a schema mismatch explicitly instead of a bare KeyError from dropna.
    missing_columns = [col for col in required_columns if col not in retail_raw.columns]
    if missing_columns:
        raise ValueError(f"Dataset is missing required columns: {missing_columns}")

    # Drop rows where any of the key enrichment columns is missing
    retail_clean = retail_raw.dropna(subset=required_columns)

    # Exclude rows whose invoice number starts with 'C' (when the column exists)
    if 'InvoiceNo' in retail_clean.columns:
        is_c_invoice = retail_clean['InvoiceNo'].astype(str).str.startswith('C')
        retail_clean = retail_clean[~is_c_invoice]

    # `astype` returns a new frame, so no column is assigned on a filtered slice.
    df_enriched = retail_clean.astype({'CustomerID': int})

    print("\n✅ Fully enriched data loaded and cleaned successfully!")
    print(f"   Data has {df_enriched.shape[0]} rows and {df_enriched.shape[1]} columns.")
    print("\nSample of the loaded data:")
    display(df_enriched.head())

except Exception as e:
    print("❌ A critical error occurred while loading the data.")
    print(f"   Error details: {e}")

In [None]:
# CELL 4 - UPDATED to build richer profiles

print("🛠️ Building richer customer profiles using all available features...")

# Enrichment attributes that together describe one purchased product
feature_columns = [
    'category',
    'usage_context',
    'price_segment',
    'material_type',
    'target_gender',
    'target_age_group',
    'tags',
]

# One space-separated tag string per purchase row
df_enriched['all_tags'] = df_enriched[feature_columns].astype(str).apply(' '.join, axis=1)

# One "document" per customer: the tag strings of all their purchases, concatenated
customer_profiles = (
    df_enriched
    .groupby('CustomerID')['all_tags']
    .apply(' '.join)
    .reset_index()
)

# Bag-of-words token counts per customer, filtered by the min_df / max_df thresholds
vectorizer = CountVectorizer(min_df=0.05, max_df=0.9)
customer_vectors = vectorizer.fit_transform(customer_profiles['all_tags'])

# Dense frame: one row per CustomerID, one column per retained token
vector_df = pd.DataFrame(
    customer_vectors.toarray(),
    index=customer_profiles['CustomerID'],
    columns=vectorizer.get_feature_names_out(),
)

print("✅ Richer customer profile vectors created.")
print("\nSample of Numerical Customer Profile Vectors:")
display(vector_df.head())


In [None]:
# CELL 5 - Cluster Customers into Segments

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# --- Clustering with K-Means ---
N_CLUSTERS = 4 # You can change this number to find more or fewer segments

print(f"⚙️  Grouping customers into {N_CLUSTERS} segments using K-Means clustering...")

# This cell stores its labels in vector_df['cluster']. Exclude that column from
# the features so that re-running the cell does not feed the previous run's
# labels back into the clustering. On a first run the column does not exist
# yet, hence errors='ignore'.
feature_matrix = vector_df.drop(columns='cluster', errors='ignore')

# It's good practice to scale the data before clustering
scaler = StandardScaler()
scaled_vectors = scaler.fit_transform(feature_matrix)

# Fixed random_state keeps the segment assignment reproducible across runs
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
vector_df['cluster'] = kmeans.fit_predict(scaled_vectors)

print(f"\n✅ Customers clustered successfully.")
print("\nDistribution of customers across segments:")

# Display the number of customers in each segment
display(vector_df['cluster'].value_counts().sort_index().to_frame())

In [None]:
# CELL 6 - AI-Assisted Persona Generation

import pandas as pd
from IPython.display import display, HTML
import json

def create_cluster_persona(cluster_id, top_tags, model):
    """
    The 'Persona Generation Agent'. It takes a cluster's top purchase
    tags and creates a descriptive persona using the Gemini AI.

    Args:
        cluster_id: Identifier of the cluster; used in error messages and as
            the default persona name when the model omits one.
        top_tags: Iterable of keyword strings describing the cluster.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute containing the model's reply.

    Returns:
        dict with at least the keys ``persona_name`` (str), ``description``
        (str) and ``marketing_strategies`` (list of str). If the call fails or
        the reply contains no parseable JSON object, a placeholder persona
        named "Error" is returned instead of raising.
    """
    prompt = f"""
    As an expert marketing strategist for an online retail store, create a detailed customer persona for a segment that primarily buys products related to these keywords: {', '.join(top_tags)}.

    Please provide a response as a valid JSON object with the following keys:
    - "persona_name": A catchy, descriptive name for this segment (e.g., "The Thoughtful Gifter", "The Home Comfort Creator").
    - "description": A 1-2 sentence summary of who this person is, their motivations, and what they likely value.
    - "marketing_strategies": A bulleted list of 3 actionable marketing strategies to effectively engage this specific segment.

    JSON Response:
    """
    try:
        response = model.generate_content(prompt)
        raw_text = response.text

        # Replies may wrap the JSON in code fences and/or surround it with
        # prose. Parse only the outermost {...} span instead of assuming the
        # reply is nothing but a fenced block.
        start = raw_text.find("{")
        end = raw_text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in the model response")
        persona = json.loads(raw_text[start:end + 1])
    except Exception as e:
        print(f"An error occurred while generating a persona for cluster {cluster_id}: {e}")
        return {"persona_name": "Error", "description": "Failed to generate persona.", "marketing_strategies": []}

    # Guarantee the keys callers select on, even if the model left some out.
    persona.setdefault("persona_name", f"Cluster {cluster_id}")
    persona.setdefault("description", "")

    # The prompt asks for a "bulleted list", so the model may answer with a
    # JSON array or with one bulleted string; normalise both to list[str].
    strategies = persona.get("marketing_strategies")
    if isinstance(strategies, str):
        cleaned = []
        for line in strategies.splitlines():
            line = line.strip()
            if line[:2] in ("- ", "* ", "• "):
                line = line[2:].strip()
            if line:
                cleaned.append(line)
        strategies = cleaned
    elif isinstance(strategies, (list, tuple)):
        strategies = [str(item) for item in strategies]
    else:
        strategies = []
    persona["marketing_strategies"] = strategies

    return persona

# --- Analyze and Interpret Each Cluster ---
all_personas = []
# Ensure the Gemini model is initialized
gemini_model = genai.GenerativeModel('gemini-1.5-flash')

print("🤖 Deploying AI Persona Generation Agent...")

for cluster_id in range(N_CLUSTERS):
    print(f"\nAnalyzing Cluster {cluster_id}...")

    # Isolate the customers belonging to this cluster
    cluster_customers = vector_df[vector_df['cluster'] == cluster_id].index

    # Find the top 10 most common features for this cluster
    # We drop the 'cluster' column before summing up the features
    top_tags = (
        vector_df.loc[cluster_customers]
        .drop('cluster', axis=1)
        .sum()
        .sort_values(ascending=False)
        .head(10)
        .index.tolist()
    )
    print(f"  > Top keywords for this group: {top_tags}")

    # Call the AI agent to create a persona
    print(f"  > Sending data to Gemini to generate persona...")
    persona_data = create_cluster_persona(cluster_id, top_tags, gemini_model)
    persona_data['cluster_id'] = cluster_id
    persona_data['top_keywords'] = ', '.join(top_tags)
    all_personas.append(persona_data)

print("\n\n--- ⭐️ FINAL CUSTOMER SEGMENT PERSONAS ⭐️ ---")
persona_columns = ['cluster_id', 'persona_name', 'description', 'top_keywords', 'marketing_strategies']
# reindex (rather than [[...]] selection) so a persona missing a key yields
# NaN in that cell instead of a KeyError for the whole table.
personas_df = pd.DataFrame(all_personas).reindex(columns=persona_columns)

# Prettify the display for easier reading: one bullet per strategy.
# Items are coerced to str so non-string entries cannot break the join.
personas_df['marketing_strategies'] = personas_df['marketing_strategies'].apply(
    lambda x: "• " + "\n• ".join(map(str, x)) if isinstance(x, list) and x else "N/A"
)

# to_html obeys display.max_colwidth (50 chars by default), which would cut
# descriptions and strategies short with "..."; lift the limit for this render.
with pd.option_context('display.max_colwidth', None):
    personas_html = personas_df.to_html(index=False, justify='left')

# A newline inside a table cell collapses to a space in HTML, so turn the
# bullet separators into explicit line breaks. to_html has already escaped
# the cell text, and its own layout newlines are never followed by "• ".
display(HTML(personas_html.replace("\n• ", "<br>• ")))