In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, AgglomerativeClustering
import pickle

In [2]:
# Read the tab-separated marketing campaign export into a DataFrame.
# NOTE(review): this absolute path is machine-specific — confirm it before running elsewhere.
df = pd.read_csv(
    r'C:\Users\Bildad Otieno\Documents\Billy_Repo\Customer_Segmentation\marketing_campaign.xls',
    sep='\t',
)
# Shift the row labels up by one so the preview below is numbered from 1.
df.index += 1
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
1,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
2,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
3,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
4,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
5,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [3]:
def preprocess_and_cluster(input_data_path, sep=",", reference_year=2021,
                           model_path="Cluster.pkl", show_elbow=True):
    """
    Load customer data, engineer features, cluster the customers and attach
    a marketing recommendation to every row.

    Pipeline: read CSV -> feature engineering -> label-encode the remaining
    categorical columns -> standard-scale -> PCA to 3 components -> cluster
    the PCA projection with the pickled estimator -> map clusters to
    recommendation texts.

    Parameters:
    - input_data_path: path (or buffer) of the delimited text file to read.
    - sep: field separator forwarded to ``pd.read_csv``. Defaults to "," ;
      pass "\\t" for tab-separated exports.
    - reference_year: year that ``Year_Birth`` is subtracted from to obtain
      ``Cust_Age``. Defaults to 2021.
    - model_path: path of the pickled clustering estimator. Defaults to
      'Cluster.pkl' in the current working directory.
    - show_elbow: when True (default) a KElbowVisualizer over KMeans is fitted
      on the PCA projection as a diagnostic; its result is not used for the
      cluster assignment.

    Returns:
    - df: DataFrame with the engineered features plus the "Clusters" and
      "Recommendation" columns. Rows with missing values in any clustering
      feature are dropped and the index is reset.
    - clustering_model: the unpickled estimator after ``fit_predict`` was
      called on the PCA projection.

    Raises:
    - KeyError: if an expected input column is missing.
    - ValueError: if no complete rows remain after preprocessing.
    """
    # Campaign/response flags are kept in the output but not used for clustering
    campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
                     'AcceptedCmp5', 'Complain', 'Response']

    #Load the input data and derive the customer-level features
    df = _engineer_features(pd.read_csv(input_data_path, sep=sep), reference_year)

    # PCA rejects NaN input, so keep only rows whose clustering features are complete
    feature_cols = [col for col in df.columns if col not in campaign_cols]
    df = df.dropna(subset=feature_cols).reset_index(drop=True)
    if df.empty:
        raise ValueError("No complete rows left to cluster after preprocessing.")

    #Determining non-numerical variables
    for column_name, dt in df.dtypes.items():
        if dt == 'object':
            print(f"{column_name}: {dt}")

    #Performing Label Encoding (a fresh encoder per column, fitted on this data)
    for column_name in ('Education', 'Lives_With'):
        df[column_name] = LabelEncoder().fit_transform(df[column_name])

    ds = df.drop(columns=campaign_cols)

    # Standardise every feature before PCA
    scaler = StandardScaler()
    scaler.fit(ds)
    scaled_ds = pd.DataFrame(scaler.transform(ds), columns=ds.columns)

    #Using PCA to reduce dimensions(features) to 3
    pca = PCA(n_components=3)
    pca.fit(scaled_ds)
    PCA_ds = pd.DataFrame(pca.transform(scaled_ds), columns=['Col1', 'Col2', 'Col3'])

    #Elbow Method to Determine the Optimum Number of Clusters (diagnostic only)
    if show_elbow:
        Elbow_M = KElbowVisualizer(KMeans(), k=10)
        Elbow_M.fit(PCA_ds)

    # Load the existing clustering model.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(model_path, 'rb') as data_file:
        clustering_model = pickle.load(data_file)

    # The estimator is re-fitted on the PCA projection via fit_predict;
    # the labels are positional and line up with df after the index reset above.
    df["Clusters"] = clustering_model.fit_predict(PCA_ds)

    # Define recommendation strategies based on clusters
    recommendations = {
        0: "Recommend high-value products suitable for older parents. Consider loyalty programs.",
        1: "Consider diverse marketing strategies targeting a wide age range. Highlight high-income products.",
        2: "Tailor offerings to younger parents with more budget-friendly options.",
        3: "Focus on personalized and high-value offerings for the smallest, older parent group.",
    }

    # Apply recommendations to each customer; unknown cluster labels get the fallback text
    df["Recommendation"] = df["Clusters"].map(recommendations).fillna(
        "No specific recommendation for this cluster."
    )

    return df, clustering_model


def _engineer_features(df, reference_year):
    """Return a copy of the raw customer frame with derived features added and raw/constant columns dropped."""
    df = df.copy()

    #Dt_Customer indicates date Customer joined database.
    #The notebook's data preview shows day-first dates (e.g. 21-08-2013), hence dayfirst=True.
    df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], dayfirst=True)

    #Days between each customer's join date and the most recent join date in the data
    df['Customer_For (Days)'] = (df['Dt_Customer'].max() - df['Dt_Customer']).dt.days

    #Creating a new feature showing the Customer Age
    df['Cust_Age'] = reference_year - df['Year_Birth']

    #Calculating total expenditure per customer (skipna=False keeps NaN propagation of plain addition)
    spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
                  'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df['Spent'] = df[spend_cols].sum(axis=1, skipna=False)

    #Reducing Categorical Values in Marital_Status to Partner and Alone
    df['Lives_With'] = df['Marital_Status'].replace({
        "Married": "Partner", "Together": "Partner",
        "Absurd": "Alone", "Widow": "Alone", "YOLO": "Alone",
        "Divorced": "Alone", "Single": "Alone",
    })

    #Merging values in columns Kidhome and Teenhome
    df["No_of_Children"] = df['Kidhome'] + df['Teenhome']

    #Household size: adults (1 or 2) plus children; unmapped Lives_With values yield NaN
    df["Family_Size"] = df['Lives_With'].map({"Alone": 1, "Partner": 2}) + df['No_of_Children']

    #1 when the customer has at least one child at home, otherwise 0
    df['Is_Parent'] = np.where(df['No_of_Children'] > 0, 1, 0)

    #Reducing Categorical Values of Education into 3: UnderGrad, Grad, PostGrad
    df["Education"] = df['Education'].replace({
        "Basic": "UnderGrad", "2n Cycle": "UnderGrad",
        "Graduation": "Grad", "Master": "PostGrad", "PhD": "PostGrad",
    })

    #Renaming Columns
    df = df.rename(columns={"MntWines": "Wines", "MntFruits": "Fruits", "MntMeatProducts": "Meat",
                            "MntFishProducts": "Fish", "MntSweetProducts": "Sweets", "MntGoldProds": "Gold"})

    #Retaining the new features generated and removing the old ones
    return df.drop(columns=['Dt_Customer', 'Marital_Status', 'Year_Birth', 'ID',
                            'Z_CostContact', 'Z_Revenue'])