# 63_inclass_cluster_example

Preprocess data, a bit of feature engineering, PCA, clustering and then plotting

## Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import datetime

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

import utils as ut

## Constants and Functions
Migrate these to a separate package when done so they can be used with scripts

In [None]:
#csv file that is read into df with pd.read_csv further down
F_NEW="Mall_CustomersNew.csv"
#presumably a random seed - NOTE(review): not referenced anywhere in the visible notebook, confirm it is still needed
RSEED=7

#lets compare each feature to all other features
def show_pairplot(df, hue):
    '''
    Shows a seaborn pairplot comparing every column of a dataframe with every other column

    df: dataframe, all of its columns are plotted
    hue: name of the column in df used to color the points
    return: nothing, the plot is displayed with plt.show()
    '''
    #one panel is drawn per pair of columns: this works on a small dataset,
    #will be prohibitively slow on a larger one
    print("Relative Plot Of Some Selected Features: A Data Subset")
    #pairplot is a figure-level function that creates its own figure, so no
    #plt.figure() call is made first (it would only leave an empty figure behind)
    sns.pairplot(data=df, hue=hue, palette=ut.colors1)
    plt.show()

## Load Data

In [None]:
#load
df= pd.read_csv(F_NEW)
# df

### sns.pairplot to see if anything looks funny
In this case Active has no variance and CustomerID is weirdly correlated with Annual Income.  It appears that they collected all customer data, sorted by income, and then assigned a customer ID to the sorted data

In [None]:
# show_pairplot(df,hue='Gender')

## Clean Data

In [None]:
from utils.transforms import *

### Drop duplicates 

In [None]:
df=ut.remove_duplicates(df)

### Handle nulls
Just drop the rows with nulls if there are not too many and you can't figure out how to estimate the missing values<br>
(You could estimate by taking the value before and after since the data was sorted by income before CustomerIDs were assigned)

In [None]:
#how many rows have at least one null?
#(any(axis=1) flags each row once; summing isna() over both axes would count null cells instead)
df.isna().any(axis=1).sum()

#see em
# df[df.isnull().any(axis=1)]

In [None]:
#only 1 null row, drop it
df.dropna(axis=0, inplace=True)

#### Birthday-convert to Age in years

In [None]:
#convert birthday to age in years
import datetime
def getyear(v, today=None):
    '''
    Converts a birthday to an age in completed years

    v: birthday, anything pd.to_datetime can parse
    today: date the age is measured at (needs .year, .month and .day),
           defaults to the current date; pass a fixed date for reproducible results
    return: age in whole years
    '''
    if today is None:
        today = datetime.datetime.now()
    born = pd.to_datetime(v)
    age = today.year - born.year
    #the plain year difference is one too many until this year's birthday has passed
    if (today.month, today.day) < (born.month, born.day):
        age -= 1
    return age
#replace every birthday with the value getyear() computes for it,
#then rename the column so its name matches what it now holds
df['Birthday']=df.Birthday.map(getyear) 
df.rename(columns={"Birthday": "Age"}, inplace=True)

### Correlations?
Careful: you want to drop the columns with the least info

In [None]:
#any correlations?
ut.get_correlated_columns(df)

In [None]:
sns.lineplot(data=df, y='Annual Income (k$)',x='CustomerID')

#### Annual Income is highly (and weirdly) correlated with CustomerID, drop one.  But be sure to keep the one that has the most information!

Looks like they sorted the dataset by income, then assigned consecutive IDs

CustomerID is likely unique for every customer, and contains no info, drop it

In [None]:
print(f'There are {df.CustomerID.nunique()} unique customer IDs and {len(df)} rows in df')

In [None]:
df.drop(columns=['CustomerID'], inplace=True)

### Drop no variance columns

In [None]:
df=ut.drop_no_variance_columns(df)

### Feature engineering - combine all spending columns into 1.  This assumes one value can accurately capture spending patterns. This also reduces the number of features

In [None]:
#lets combine the last 3 into 1
def combine_columns(df, newcolname, cols):
    '''
    Replaces several columns with a single column holding their row-wise sum

    df: dataframe, it is modified in place
    newcolname: name of the column that receives the sum
    cols: list of the columns to add up, they are removed from df afterwards
    return: the same dataframe object that was passed in
    '''
    row_totals = df[cols].sum(axis=1)
    df[newcolname] = row_totals
    #the summed columns are no longer needed once the total exists
    df.drop(columns=cols, inplace=True)
    return df

# cols=[col for col in df.columns if "spending" in col]
# data=combine_columns(df,"spending_total",cols)

### Categoricals

In [None]:
#find categoricals, use df.dtypes, look for the object columns
# df.dtypes

#### Gender 

In [None]:
#looks like Gender and Birthday
print (df.Gender.unique())

In [None]:
#gender is nominal, but it is binary ie male or female
#so try a single 0/1 column instead of dummy columns
feats=['Gender']

#either of these would do the conversion
#1) number the values in the order unique() returns them (left commented out)
# d= {v:i for i,v in enumerate(df.Gender.unique().tolist())}
# df.Gender=df.Gender.map(d)
#2) hand an explicit mapping (Male->0, Female->1) to the shared helper
#   NOTE(review): presumably ut.cat_ordinal replaces each listed feature using its `order` mapping - confirm in utils
df=ut.cat_ordinal(df, features=feats, order={'Gender':{'Male':0, 'Female':1}})

### Save a copy of dataframe to append clusters to

In [None]:
dforig=df.copy()

### Standardize

In [None]:
df=ut.scale(df)

### PCA

In [None]:
## PCA

from sklearn.decomposition import PCA

#a fractional n_components keeps just enough components to hold that share (95%) of the variance
#whiten=True additionally rescales the kept components (see the sklearn PCA docs)
pca = PCA(n_components=.95, whiten=True)
#fit on df and wrap the projected array in a dataframe (its columns get default integer names)
features_pca=pd.DataFrame(pca.fit_transform(df))
print(f'Orig #features={df.shape[1]}, number features containing 95% of variance={features_pca.shape[1]}')

#bare expressions: both are displayed because ast_node_interactivity is set to 'all' in the imports cell
features_pca
pca.explained_variance_ratio_

# HDBscan

In [None]:
#hdbscan is not part of scikitlearn or a standard anaconda distribution, here is how to install
# !conda install -c conda-forge hdbscan -y

import hdbscan

class hdbres:
    '''
    Bookkeeping record for a single hdbscan run: the two parameters it was
    started with plus the results it produced
    '''
    def __init__(self, min_cluster_size, min_samples, n_clusters, n_noise, labels):
        #parameters of the run
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        #results of the run
        self.n_clusters = n_clusters
        self.n_noise = n_noise
        self.labels = labels  #all the cluster labels for my data

    def __repr__(self):
        #ends with a newline so the entries of a printed list of results land on separate lines
        return f'self.min_cluster_size:{self.min_cluster_size},min_samples:{self.min_samples}, numb clusters:{self.n_clusters},noise points: {self.n_noise}\n'


def run_hdbscan(df, min_cluster_size, min_samples, verbose=True):
    '''
    One hdbscan run for a single pair of parameter values

    df: data to cluster, handed straight to HDBSCAN.fit
    min_cluster_size, min_samples: forwarded unchanged to hdbscan.HDBSCAN
    verbose: when True the cluster and noise counts are printed
    returns: hdbres object with both parameters, the number of clusters,
             the number of noise points and the labels
    '''
    fitted = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
    ).fit(df)
    labels = fitted.labels_

    #the label -1 marks noise, so it is not counted as a cluster
    real_clusters = set(labels)
    real_clusters.discard(-1)
    n_clusters = len(real_clusters)
    n_noise = sum(1 for label in labels if label == -1)

    if verbose:
        print(f'numb clusters:{n_clusters},noise points: {n_noise}')
    return hdbres(min_cluster_size, min_samples, n_clusters, n_noise, labels)

#check all combinations of the following parameters
cluster_sizes=list(range(6,16))
min_samps=[2,3,4]

#holds one hdbres object per (min_samples, min_cluster_size) pair
res=[]
for min_samp in min_samps:
    for cluster_size in cluster_sizes:
        #verbose off: the results are inspected later from res instead of printed per run
        res.append(run_hdbscan(features_pca, cluster_size, min_samp, verbose=False))

In [None]:
print(sorted(res, key=lambda x: x.n_clusters))

In [None]:
#get the labels for a particular run of interest
# for instance try the one with min_cluster_size 10 and min_samples 2
matching_runs=[run for run in res if (run.min_cluster_size, run.min_samples)==(10, 2)]
cluster_labels=matching_runs[0].labels

### add the cluster predictions to orig df 

In [None]:
#add the cluster predictions(np.array) to pandas dataframe
#a plain list is assigned by position, so the index of dforig does not matter here
#assumes dforig still has the same rows, in the same order, as the data that was clustered - TODO confirm
dforig['Cluster']=cluster_labels.tolist()

### What do the plots mean? If anything.
You can only look at 3 features at a time if you are showing clusters as colors.<br>
So look at a plot of all females, and then all males.  Are any clusters exclusively female or male? <br>
Are there any other clusters that clearly indicate similarities?<br>
Are there any that appear to be pointlessly mixed with other clusters?<br>
If so should you try reducing the number of clusters calculated?

In [None]:
#only show non outliers
show_pairplot(dforig[dforig['Cluster']!=-1],hue='Cluster');

In [None]:
#looks like there is a relationship between Gender and age for cluster membership, lets plot
#rows labelled -1 (noise) are left out; the commented-out lines are alternative views of the same rows
# sns.barplot(data=dforig[dforig['Cluster']!=-1], y='Age',x='Gender',hue='Cluster');
sns.barplot(data=dforig[dforig['Cluster']!=-1], y='spending_sale',x='Gender',hue='Cluster');
# sns.barplot(data=dforig[dforig['Cluster']!=-1], y='spending_alcohol',x='Annual Income (k$)',hue='Cluster');

In [None]:
dforig.nunique()

In [None]:
#here is the problem, we should reduce the cardinality of every column except gender and Cluster
#we will do it by binning