In [1]:
# Customer Segmentation Analysis
# Using Mall Customers Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set the visual style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

"""Load and perform initial exploration of the dataset"""
def load_and_explore_data(filepath):

    print("Step 1: Loading and exploring data...")

    # Load the dataset
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found.")

    # Display basic information
    print(f"Dataset shape: {df.shape}")

    print("\nData sample:")
    print(df.head())

    print("\nData info:")
    print(df.info())

    print("\nSummary statistics:")
    print(df.describe())

    print("\nMissing values:")
    print(df.isnull().sum())

    print("\nDuplicate rows:")
    print(df.duplicated().sum())

    return df

"""Clean, transform and prepare data for analysis"""
def preprocess_data(df):

    print("\nStep 2: Preprocessing data...")

    # Create a copy to avoid modifying the original
    processed_df = df.copy()

    # Drop CustomerID as it's not useful for clustering
    if 'CustomerID' in processed_df.columns:
        processed_df.drop('CustomerID', axis=1, inplace=True)
        print("Dropped CustomerID column")

    # Handle missing values
    # For numerical columns, fill with median
    num_cols = processed_df.select_dtypes(include=np.number).columns
    for col in num_cols:
        if processed_df[col].isnull().sum() > 0:
            median_val = processed_df[col].median()
            processed_df[col].fillna(median_val, inplace=True)
            print(f"Filled missing values in {col} with median: {median_val}")

    # For categorical columns, fill with mode
    cat_cols = processed_df.select_dtypes(exclude=np.number).columns
    for col in cat_cols:
        if processed_df[col].isnull().sum() > 0:
            mode_val = processed_df[col].mode()[0]
            processed_df[col].fillna(mode_val, inplace=True)
            print(f"Filled missing values in {col} with mode: {mode_val}")

    # Remove duplicates
    before_dedup = len(processed_df)
    processed_df.drop_duplicates(inplace=True)
    after_dedup = len(processed_df)
    print(f"Removed {before_dedup - after_dedup} duplicate rows")

    # Detect and remove outliers using Z-score
    for col in num_cols:
        z_scores = stats.zscore(processed_df[col])
        abs_z_scores = np.abs(z_scores)
        filtered_entries = (abs_z_scores < 3)  # Keep only entries with z-score < 3
        outliers = processed_df[~filtered_entries]
        if len(outliers) > 0:
            print(f"Removed {len(outliers)} outliers from {col}")
            processed_df = processed_df[filtered_entries]

    # print(f"processed df columns: {processed_df.columns}")
    return processed_df

"""Determine if Gender should be included in clustering and prepare encoded version"""
def analyze_gender_relevance(df):

    print("\nStep 3: Analyzing gender relevance...")

    if 'Gender' not in df.columns:
        print("Gender column not found in dataset")
        return False, df

    include_gender = False

    # Analyze spending patterns by gender
    if 'Spending Score (1-100)' in df.columns:
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        sns.boxplot(x='Gender', y='Spending Score (1-100)', data=df)
        plt.title('Spending Score by Gender')

        plt.subplot(1, 2, 2)
        sns.boxplot(x='Gender', y='Annual Income (k$)', data=df)
        plt.title('Annual Income by Gender')

        # Perform t-test to check if the difference is significant
        male_spending = df[df['Gender'] == 'Male']['Spending Score (1-100)']
        female_spending = df[df['Gender'] == 'Female']['Spending Score (1-100)']

        t_stat, p_value = stats.ttest_ind(male_spending, female_spending, equal_var=False)
        print(f"T-test for Spending Score by Gender: t={t_stat:.2f}, p={p_value:.4f}")

        if p_value < 0.05:
            print("Gender shows significant difference in spending patterns (p < 0.05)")
            include_gender = True
        else:
            print("Gender does not show significant difference in spending patterns (p >= 0.05)")

        # Also test income differences
        male_income = df[df['Gender'] == 'Male']['Annual Income (k$)']
        female_income = df[df['Gender'] == 'Female']['Annual Income (k$)']

        t_stat, p_value = stats.ttest_ind(male_income, female_income, equal_var=False)
        print(f"T-test for Annual Income by Gender: t={t_stat:.2f}, p={p_value:.4f}")

        if p_value < 0.05:
            print("Gender shows significant difference in income (p < 0.05)")
            include_gender = True

    # Create copies of the dataset
    df_with_gender = df.copy()
    df_without_gender = df.drop('Gender', axis=1)

    # Encode Gender in the df_with_gender dataset
    if include_gender:
        le = LabelEncoder()
        df_with_gender['Gender_Encoded'] = le.fit_transform(df_with_gender['Gender'])
        print(f"Gender encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

    # Return the appropriate dataset based on analysis
    if include_gender:
        print("Decision: Include Gender in the clustering analysis")
        print(include_gender, df_with_gender.columns)
        return True, df_with_gender
    else:
        print("Decision: Exclude Gender from the clustering analysis")
        print(include_gender, df_without_gender.columns)
        return False, df_without_gender

"""Standardize numerical features to ensure equal contribution"""
def standardize_features(df):

    print("\nStep 4: Standardizing features...")

    # Identify numerical columns (excluding any encoded categories we want to keep separate)
    num_cols = [col for col in df.select_dtypes(include=np.number).columns
                if not col.endswith('_Encoded')]

    print(f"Standardizing columns: {num_cols}")

    # Create a copy of the dataframe
    scaled_df = df.copy()

    # Apply standardization
    scaler = StandardScaler()
    scaled_df[num_cols] = scaler.fit_transform(df[num_cols])

    print("Features standardized successfully")
    return scaled_df, scaler

"""Perform EDA to visualize data distributions and relationships"""
def exploratory_data_analysis(df, original_df):

    print("\nStep 5: Exploratory Data Analysis...")

    # 1. Distribution of numerical features
    num_cols = df.select_dtypes(include=np.number).columns

    plt.figure(figsize=(15, 5))
    for i, col in enumerate(num_cols):
        if '_Encoded' not in col:  # Skip encoded categorical values
            plt.subplot(1, len(num_cols), i+1)
            sns.histplot(original_df[col], kde=True)
            plt.title(f'Distribution of {col}')
    plt.tight_layout()

    # 2. Correlation matrix
    plt.figure(figsize=(10, 8))
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Matrix')

    # 3. Pairplot for numerical features
    if len(num_cols) >= 2:
        plt.figure(figsize=(12, 10))
        sns.pairplot(original_df, vars=[col for col in num_cols if '_Encoded' not in col],
                     hue='Gender' if 'Gender' in original_df.columns else None)
        plt.suptitle('Pairplot of Features', y=1.02)

    return correlation_matrix

"""Apply dimensionality reduction techniques"""
def reduce_dimensions(df):

    print("\nStep 6: Dimensionality Reduction...")

    # Extract features for clustering (all numerical columns)
    features = df.select_dtypes(include=np.number).values

    # 1. Principal Component Analysis (PCA)
    pca = PCA()
    pca_result = pca.fit_transform(features)

    # Analyze explained variance
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(explained_variance) + 1), cumulative_variance, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance Threshold')
    plt.title('PCA Explained Variance')
    plt.legend()

    # Find number of components for 95% variance
    n_components_95 = np.where(cumulative_variance >= 0.95)[0][0] + 1
    print(f"Number of components needed for 95% variance: {n_components_95}")

    # Keep components for 95% variance
    pca = PCA(n_components=n_components_95)
    pca_result = pca.fit_transform(features)

    # 2. t-SNE for visualization (always produces 2 components for visualization)
    if features.shape[0] > 10:  # t-SNE requires more than a few samples
        tsne = TSNE(n_components=2, random_state=42)
        tsne_result = tsne.fit_transform(features)

        plt.figure(figsize=(10, 8))
        plt.scatter(tsne_result[:, 0], tsne_result[:, 1], alpha=0.7)
        plt.title('t-SNE Visualization of Customer Data')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
    else:
        tsne_result = None
        print("Not enough samples for t-SNE visualization")

    return pca_result, tsne_result, pca

"""Determine the optimal number of clusters using various methods"""
def determine_optimal_clusters(df, pca_result):

    print("\nStep 7: Determining optimal number of clusters...")

    # Elbow Method for K-Means
    inertia = []
    silhouette_scores = []
    k_range = range(2, 11)

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(pca_result)
        inertia.append(kmeans.inertia_)

        # Calculate silhouette score
        if k > 1:  # Silhouette score requires at least 2 clusters
            labels = kmeans.labels_
            silhouette_avg = silhouette_score(pca_result, labels)
            silhouette_scores.append(silhouette_avg)
            print(f"For k={k}, silhouette score: {silhouette_avg:.3f}")

    # Plot Elbow Method
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(k_range, inertia, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.title('K-Means Elbow Method')

    plt.subplot(1, 2, 2)
    plt.plot(k_range[0:], silhouette_scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Scores')

    plt.tight_layout()

    # Find optimal k from silhouette score
    optimal_k_silhouette = k_range[0:][np.argmax(silhouette_scores)]
    print(f"Optimal K based on silhouette score: {optimal_k_silhouette}")

    # Hierarchical Clustering - Dendrogram
    plt.figure(figsize=(12, 8))
    linked = linkage(pca_result, 'ward')
    dendrogram(linked, truncate_mode='lastp', p=15)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Sample Index')
    plt.ylabel('Distance')
    optimal_k_dendrogram = 6 #hardcoded after analyzing diagram
    print(f"Optimal K based on dendogram (hardcoded after analyzing graph): {optimal_k_dendrogram}")

    # GMM - AIC and BIC
    bic = []
    aic = []

    for k in k_range:
        gmm = GaussianMixture(n_components=k, random_state=42)
        gmm.fit(pca_result)
        bic.append(gmm.bic(pca_result))
        aic.append(gmm.aic(pca_result))

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(k_range, bic, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC')
    plt.title('BIC for GMM')

    plt.subplot(1, 2, 2)
    plt.plot(k_range, aic, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('AIC')
    plt.title('AIC for GMM')

    plt.tight_layout()

    # Find optimal k from BIC
    optimal_k_bic = k_range[np.argmin(bic)]
    print(f"Optimal K based on BIC: {optimal_k_bic}")

    # Return different optimal k values from different methods
    return {
        'kmeans': optimal_k_silhouette,
        'gmm': optimal_k_bic,
        'dendogram': optimal_k_dendrogram
    }

"""Perform clustering using various algorithms"""
def perform_clustering(pca_result, optimal_k, original_df, include_gender):

    print("\nStep 8: Performing clustering with optimal K =", optimal_k)

    # K-Means Clustering
    kmeans = KMeans(n_clusters=optimal_k['kmeans'], random_state=42, n_init=10)
    kmeans_labels = kmeans.fit_predict(pca_result)

    # Hierarchical Clustering
    hierarchical = AgglomerativeClustering(n_clusters=optimal_k['kmeans'])
    hierarchical_labels = hierarchical.fit_predict(pca_result)

    # Gaussian Mixture Model
    gmm = GaussianMixture(n_components=optimal_k['gmm'], random_state=42)
    gmm.fit(pca_result)
    gmm_labels = gmm.predict(pca_result)

    # Add cluster labels to the original dataframe
    clustered_df = original_df.copy()
    clustered_df['KMeans_Cluster'] = kmeans_labels
    clustered_df['Hierarchical_Cluster'] = hierarchical_labels
    clustered_df['GMM_Cluster'] = gmm_labels

    # Visualize clusters in 2D PCA space
    plt.figure(figsize=(18, 6))

    # K-Means visualization
    plt.subplot(1, 3, 1)
    scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.7)
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                c='red', marker='X', s=100, label='Centroids')
    plt.title('K-Means Clustering')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.colorbar(scatter, label='Cluster')
    plt.legend()

    # Hierarchical visualization
    plt.subplot(1, 3, 2)
    scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=hierarchical_labels, cmap='viridis', alpha=0.7)
    plt.title('Hierarchical Clustering')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.colorbar(scatter, label='Cluster')

    # GMM visualization
    plt.subplot(1, 3, 3)
    scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=gmm_labels, cmap='viridis', alpha=0.7)
    plt.title('Gaussian Mixture Model')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.colorbar(scatter, label='Cluster')

    plt.tight_layout()

    # If we have 3 or more PCA components, create 3D visualization
    if pca_result.shape[1] >= 3:
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        scatter = ax.scatter(pca_result[:, 0], pca_result[:, 1], pca_result[:, 2],
                            c=kmeans_labels, cmap='viridis', alpha=0.7)
        ax.set_title('3D K-Means Clustering Visualization')
        ax.set_xlabel('PCA Component 1')
        ax.set_ylabel('PCA Component 2')
        ax.set_zlabel('PCA Component 3')
        plt.colorbar(scatter, label='Cluster')

    # Compare algorithms
    from sklearn.metrics import adjusted_rand_score

    ari_km_hc = adjusted_rand_score(kmeans_labels, hierarchical_labels)
    ari_km_gmm = adjusted_rand_score(kmeans_labels, gmm_labels)
    ari_hc_gmm = adjusted_rand_score(hierarchical_labels, gmm_labels)

    print(f"Adjusted Rand Index (KMeans vs Hierarchical): {ari_km_hc:.3f}")
    print(f"Adjusted Rand Index (KMeans vs GMM): {ari_km_gmm:.3f}")
    print(f"Adjusted Rand Index (Hierarchical vs GMM): {ari_hc_gmm:.3f}")

    # Choose the best algorithm based on silhouette score
    silhouette_km = silhouette_score(pca_result, kmeans_labels)
    silhouette_hc = silhouette_score(pca_result, hierarchical_labels)
    silhouette_gmm = silhouette_score(pca_result, gmm_labels)

    print(f"K-Means Silhouette Score: {silhouette_km:.3f}")
    print(f"Hierarchical Silhouette Score: {silhouette_hc:.3f}")
    print(f"GMM Silhouette Score: {silhouette_gmm:.3f}")

    scores = {
        'KMeans': silhouette_km,
        'Hierarchical': silhouette_hc,
        'GMM': silhouette_gmm
    }
    best_algorithm = max(scores, key=scores.get)
    print(f"Best clustering algorithm based on silhouette score: {best_algorithm}")

    # Select the best algorithm's labels
    if best_algorithm == 'KMeans':
        final_labels = kmeans_labels
        clustered_df['Cluster'] = kmeans_labels
        model = kmeans
    elif best_algorithm == 'Hierarchical':
        final_labels = hierarchical_labels
        clustered_df['Cluster'] = hierarchical_labels
        model = hierarchical
    else:  # GMM
        final_labels = gmm_labels
        clustered_df['Cluster'] = gmm_labels
        model = gmm

    return clustered_df, final_labels, model, best_algorithm

"""Profile and interpret each cluster"""
def profile_clusters(clustered_df, include_gender):

    print("\nStep 9: Profiling clusters...")

    # Focus on the final cluster assignment
    cluster_profiles = clustered_df.groupby('Cluster').agg({
        'Age': ['mean', 'min', 'max'],
        'Annual Income (k$)': ['mean', 'min', 'max'],
        'Spending Score (1-100)': ['mean', 'min', 'max']
    })

    print("\nCluster Profiles:")
    print(cluster_profiles)

    # Calculate gender distribution if gender was included
    if include_gender and 'Gender' in clustered_df.columns:
        gender_distribution = pd.crosstab(
            clustered_df['Cluster'],
            clustered_df['Gender'],
            normalize='index'
        ) * 100  # Convert to percentage

        print("\nGender Distribution by Cluster (%):")
        print(gender_distribution)

    # Visualize the cluster profiles
    # 1. Age distribution by cluster
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    sns.boxplot(x='Cluster', y='Age', data=clustered_df)
    plt.title('Age Distribution by Cluster')

    plt.subplot(1, 3, 2)
    sns.boxplot(x='Cluster', y='Annual Income (k$)', data=clustered_df)
    plt.title('Income Distribution by Cluster')

    plt.subplot(1, 3, 3)
    sns.boxplot(x='Cluster', y='Spending Score (1-100)', data=clustered_df)
    plt.title('Spending Score Distribution by Cluster')

    plt.tight_layout()

    # 2. 3D Scatter plot of clusters
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')

    scatter = ax.scatter(
        clustered_df['Annual Income (k$)'],
        clustered_df['Spending Score (1-100)'],
        clustered_df['Age'],
        c=clustered_df['Cluster'],
        cmap='viridis',
        s=50,
        alpha=0.7
    )

    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score (1-100)')
    ax.set_zlabel('Age')
    ax.set_title('3D Visualization of Customer Clusters')
    plt.colorbar(scatter, label='Cluster')

    # Create personas for each cluster
    n_clusters = clustered_df['Cluster'].nunique()
    personas = {}

    for cluster in range(n_clusters):
        cluster_data = clustered_df[clustered_df['Cluster'] == cluster]

        avg_age = cluster_data['Age'].mean()
        avg_income = cluster_data['Annual Income (k$)'].mean()
        avg_spending = cluster_data['Spending Score (1-100)'].mean()

        # Determine persona characteristics
        # Age category
        if avg_age < 30:
            age_category = "Young"
        elif avg_age < 50:
            age_category = "Middle-aged"
        else:
            age_category = "Senior"

        # Income category
        if avg_income < 40:
            income_category = "Budget-conscious"
        elif avg_income < 70:
            income_category = "Middle-income"
        else:
            income_category = "Affluent"

        # Spending category
        if avg_spending < 40:
            spending_category = "Conservative Spenders"
        elif avg_spending < 70:
            spending_category = "Moderate Spenders"
        else:
            spending_category = "Big Spenders"

        # Gender majority if applicable
        gender_majority = ""
        if include_gender and 'Gender' in clustered_df.columns:
            gender_counts = cluster_data['Gender'].value_counts(normalize=True)
            if len(gender_counts) > 0:
                majority_gender = gender_counts.idxmax()
                if gender_counts[majority_gender] > 0.65:  # If more than 65% are one gender
                    gender_majority = f"Predominantly {majority_gender}"

        # Create persona name and description
        persona_name = f"{age_category} {income_category} {spending_category}"
        if gender_majority:
            persona_name = f"{gender_majority} {persona_name}"

        # Business recommendations
        if avg_spending > 70 and avg_income > 70:
            recommendation = "Target with luxury products and premium services"
        elif avg_spending > 70 and avg_income < 70:
            recommendation = "Target with 'affordable luxury' and exclusive deals"
        elif avg_spending < 40 and avg_income > 70:
            recommendation = "Target with value propositions emphasizing quality and longevity"
        elif avg_age < 30 and avg_spending > 50:
            recommendation = "Target with trendy products and digital marketing campaigns"
        elif avg_age > 50:
            recommendation = "Target with comfort and reliability-focused marketing"
        else:
            recommendation = "Target with balanced value-for-money offerings"

        personas[cluster] = {
            'name': persona_name,
            'avg_age': avg_age,
            'avg_income': avg_income,
            'avg_spending': avg_spending,
            'recommendation': recommendation
        }

    print("\nCustomer Personas:")
    for cluster, persona in personas.items():
        print(f"\nCluster {cluster}: {persona['name']}")
        print(f"  Average Age: {persona['avg_age']:.1f}")
        print(f"  Average Income: ${persona['avg_income']:.1f}k")
        print(f"  Average Spending Score: {persona['avg_spending']:.1f}/100")
        print(f"  Marketing Recommendation: {persona['recommendation']}")

    return personas

"""Build a model for predicting clusters for new customers"""
def build_predictive_model(preprocessed_df, pca, model, best_algorithm, scaler, include_gender):

    print("\nStep 10: Building predictive model...")

    # Identify the exact columns used for fitting the PCA
    # These must match between training and prediction
    pca_features = [col for col in preprocessed_df.columns
                   if col not in ['Cluster', 'KMeans_Cluster', 'Hierarchical_Cluster', 'GMM_Cluster', 'Gender']]

    print(f"PCA was trained on these features: {pca_features}")
    print(f"Number of features used in PCA: {len(pca_features)}")

    def predict_cluster(new_customer_data, include_gender=False):
        """
        Predict the cluster for a new customer

        Parameters:
        -----------
        new_customer_data : dict
            Dictionary containing customer data with keys matching the original dataset
        include_gender : bool
            Whether gender should be included in prediction

        Returns:
        --------
        int : Predicted cluster
        """
        # Convert dictionary to DataFrame
        new_df = pd.DataFrame([new_customer_data])

        # Make sure we have all required features
        if 'Gender' in new_df.columns and include_gender:
            # Handle gender encoding consistently
            le = LabelEncoder()
            le.fit(['Male', 'Female'])
            new_df['Gender_Encoded'] = le.transform(new_df['Gender'])

        # Verify that all required features exist
        for feature in pca_features:
            if feature not in new_df.columns:
                raise ValueError(f"Missing feature: {feature}. Required features: {pca_features}")

        # Select exactly the same features in the same order
        X_predict = new_df[pca_features].values

        # Apply the same preprocessing
        X_scaled = scaler.transform(X_predict)

        # Apply PCA transformation
        X_pca = pca.transform(X_scaled)

        # Make prediction
        if best_algorithm == 'KMeans':
            predicted_cluster = model.predict(X_pca)[0]
        elif best_algorithm == 'Hierarchical':
            # For hierarchical clustering, find the nearest cluster
            features_array = preprocessed_df[pca_features].values
            X_all = np.vstack([pca.transform(scaler.transform(features_array)), X_pca])
            predicted_cluster = model.fit_predict(X_all)[-1]
        else:  # GMM
            predicted_cluster = model.predict(X_pca)[0]

        return predicted_cluster

    print("Model successfully built for inference")
    print(f"Using {best_algorithm} algorithm for predictions")

    # Example usage
    example_customer = {
        'Age': 30,
        'Annual Income (k$)': 60,
        'Spending Score (1-100)': 75
    }

    if include_gender:
        example_customer['Gender'] = 'Female'
        if 'Gender_Encoded' in pca_features:
            # We need to perform the encoding here too since it's part of our features
            le = LabelEncoder()
            le.fit(['Male', 'Female'])
            example_customer['Gender_Encoded'] = le.transform([example_customer['Gender']])[0]

    # Make sure all features from PCA are present in the example
    for feature in pca_features:
        if feature not in example_customer and feature != 'Gender_Encoded':
            print(f"Warning: Adding missing feature {feature} with value 0")
            example_customer[feature] = 0

    example_cluster = predict_cluster(example_customer, include_gender)
    print(f"\nExample prediction:")
    print(f"Customer data: {example_customer}")
    print(f"Predicted cluster: {example_cluster}")

    return predict_cluster

def main():
    """Main function to run the complete customer segmentation pipeline"""
    print("========= CUSTOMER SEGMENTATION ANALYSIS =========\n")

    # 1. Load and explore data
    df = load_and_explore_data('Mall_Customers.csv')

    # 2. Preprocess data
    processed_df = preprocess_data(df)

    # 3. Analyze gender relevance
    include_gender, df_for_analysis = analyze_gender_relevance(processed_df)

    # 4. Standardize features
    scaled_df, scaler = standardize_features(df_for_analysis)

    # 5. Perform EDA
    correlation_matrix = exploratory_data_analysis(scaled_df, df_for_analysis)

    # 6. Reduce dimensions
    pca_result, tsne_result, pca = reduce_dimensions(scaled_df)

    # 7. Determine optimal clusters
    optimal_k = determine_optimal_clusters(scaled_df, pca_result)

    # 8. Perform clustering
    clustered_df, final_labels, model, best_algorithm = perform_clustering(
        pca_result, optimal_k, df_for_analysis, include_gender)

    # 9. Profile clusters
    personas = profile_clusters(clustered_df, include_gender)

    # 10. Build predictive model
    predict_cluster = build_predictive_model(
        scaled_df, pca, model, best_algorithm, scaler, include_gender
    )

    print("\n========= ANALYSIS COMPLETE =========")
    print(f"Best clustering algorithm: {best_algorithm}")
    print(f"Number of clusters identified: {len(personas)}")
    print("Instructions:\n")
    print("1. Use the predictive model to classify new customers")
    print("2. Implement recommended targeted marketing strategies based on cluster personas")
    print("3. Re-train the model periodically as new customer data becomes available")

    return clustered_df, personas, predict_cluster

def retrain_model(original_df, new_customers_df, frequency=100):
    """
    Retrain the model after adding a certain number of new customers

    Parameters:
    -----------
    original_df : DataFrame
        Original customer dataset
    new_customers_df : DataFrame
        New customer data to be added
    frequency : int
        Number of customers to add before retraining

    Returns:
    --------
    tuple : Updated dataframe, personas, and prediction function
    """
    print(f"\nRetraining model after adding {len(new_customers_df)} new customers...")

    # Combine original and new data
    combined_df = pd.concat([original_df, new_customers_df], ignore_index=True)

    # Run the entire pipeline again
    # This is simplified - in production, you might want to save parameters and models
    clustered_df, personas, predict_cluster = main()

    print("Model retrained successfully")

    return clustered_df, personas, predict_cluster

# Run the pipeline if script is executed directly
if __name__ == "__main__":
    clustered_df, personas, predict_cluster = main()

    # Example of how to use the prediction function
    new_customer = {
        'Age': 42,
        'Annual Income (k$)': 85,
        'Spending Score (1-100)': 55,
        'Gender': 'Male'
    }

    # Predict cluster for the new customer
    predicted_cluster = predict_cluster(new_customer)

    print(f"\nNew customer details: {new_customer}")
    print(f"Predicted cluster: {predicted_cluster}")
    print(f"Customer persona: {personas[predicted_cluster]['name']}")
    print(f"Recommended marketing approach: {personas[predicted_cluster]['recommendation']}")


Step 1: Loading and exploring data...
File Mall_Customers.csv not found.


UnboundLocalError: cannot access local variable 'df' where it is not associated with a value