In [1]:
# Importing Required Libraries in Python
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd

In [None]:
# Load the raw dataset from CSV into the DataFrame `df`.
# NOTE(review): the path argument is an empty string, which looks like a
# placeholder — TODO point this at the actual CSV file before running.
df = pd.read_csv("")

In [None]:
# Preprocessing and Feature Engineering the Data

# Define which columns contain our features (every column of df is used)
feature_cols = df.columns.to_list()

# Discretize each column into quintile bins labelled 0-4.
# The values are ranked first (method='first' gives tied values distinct,
# consecutive ranks in row order) so that heavy ties in the raw data do not
# collapse qcut's bin edges. The ranks keep the original row index, so the
# assignment back into df stays aligned row-for-row.
for column in feature_cols:
    column_ranks = df[column].rank(method='first')
    df[column] = pd.qcut(column_ranks, q=5, duplicates='raise', labels=False)

# Pull the feature columns out of the DataFrame as a plain numpy array
X = np.array(df[feature_cols])

# Standardize the features: fit the scaler on X, then apply it to the same X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Dimensionality Reduction with PCA
# Fit a 16-component PCA to inspect how much variance each component explains
pca = PCA(n_components=16, random_state=99).fit(X_scaled)

# Store explained variance results in a DataFrame indexed by component number
# (1-based). The index is sized from the fitted component count rather than
# from feature_cols, so it always matches the length of the ratio array even
# when the dataset has more than 16 feature columns.
evar_df = pd.DataFrame(
    data=pca.explained_variance_ratio_,
    index=range(1, pca.n_components_ + 1),
    columns=['pct_explained_variance'],
)

# Calculate cumulative explained variance from the per-component column
evar_df['cum_explained_variance'] = evar_df['pct_explained_variance'].cumsum()

# Draw the cumulative explained variance curve on an explicit Axes
fig, ax = plt.subplots(figsize=(6,4))

sns.lineplot(data=evar_df, x=evar_df.index,
             y='cum_explained_variance', ax=ax, color='blue')
ax.set_title('PCA Cumulative Explained Variance')
ax.set_ylabel('cumulative explained variance')
ax.set_xlabel('principal components')
ax.set_xticks(evar_df.index)
ax.set_ylim(0,1)

# Annotate every point with its cumulative percentage (one decimal place),
# nudged slightly left of and below the marker so it does not cover the line
cum_var = evar_df.cum_explained_variance
pct_labels = round((cum_var * 100), 1).to_list()
for component, height, pct in zip(evar_df.index, cum_var, pct_labels):
    ax.text(component - .24, height - .05, pct, color='blue')

# Refit PCA, this time keeping only 10 principal components
pca = PCA(n_components=10, random_state=99)
pca.fit(X_scaled)

# Project the scaled dataset onto those components
X_scaled_red = pca.transform(X_scaled)

In [None]:
# Implementing K-Means Clustering in Python

# Build a 6-cluster KMeans model with a fixed random seed
clusterer = KMeans(n_clusters=6, random_state=99)

# Fit on the PCA-reduced data, then read each sample's assigned cluster
# from the fitted model
clusterer.fit(X_scaled_red)
cluster_labels = clusterer.labels_

In [None]:
# Create DataFrame with samples and corresponding cluster labels.
# Column names pc1..pcN are generated from the width of the reduced array so
# they stay in step with however many principal components were kept.
pc_columns = ['pc{}'.format(k) for k in range(1, X_scaled_red.shape[1] + 1)]
cluster_df = pd.DataFrame(data=X_scaled_red, columns=pc_columns)
cluster_df['cluster'] = cluster_labels

In [None]:
# Define which features to plot
feature1 = 'pc1'
feature2 = 'pc2'
feature3 = 'pc3'

# Set up figure
fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

# Define our custom color list
color_list = ['deeppink', 'blue', 'forestgreen', 'orange', 'palegreen', 'darkviolet', 'moccasin', 'crimson', 'lightsteelblue', 'cyan']

# Plot each cluster as its own scatter series. groupby yields the labels that
# actually occur (in sorted order) together with their rows, so each subset is
# selected once. Colors wrap around the palette instead of raising an
# IndexError when there are more clusters than colors.
for i, members in cluster_df.groupby('cluster'):
    label = "cluster=" + str(i)
    ax.scatter3D(members[feature1], members[feature2], members[feature3],
                 c=color_list[i % len(color_list)], label=label)

# Set labels and legend
ax.set_xlabel(feature1)
ax.set_ylabel(feature2)
ax.set_zlabel(feature3)
ax.set_title('Holiday Shopping KMeans Clusters')
ax.legend()