# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 2.3-optional) Clustering Analysis: Pyclustering
### *Antonio Strippoli, Valerio Mariani*

In [None]:
!pip3 install pyclustering

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_and_scale(range=(0,1)):
    """Load the customer profile table and return its min-max scaled RFM values.

    Reads ``../DM_25_TASK1/customer_profilation.csv`` (first column used as
    the index), keeps the ``Recency``, ``Frequency`` and ``Monetary`` columns
    and rescales them with ``MinMaxScaler``.

    Args:
        range: ``(min, max)`` pair forwarded to ``MinMaxScaler`` as
            ``feature_range``. The name shadows the builtin inside this
            function; it is kept because callers pass it by keyword.

    Returns:
        The array produced by ``MinMaxScaler.fit_transform`` on the three
        selected columns.
    """
    # These 3 attributes were chosen because they usually separate customers well
    rfm_columns = ['Recency', 'Frequency', 'Monetary']

    customers = pd.read_csv("../DM_25_TASK1/customer_profilation.csv", index_col=0)
    rfm_values = customers[rfm_columns].values

    # Fit the scaler on the selected columns and return the normalized values
    return MinMaxScaler(feature_range=range).fit_transform(rfm_values)

In [None]:
# x-means to automatically determine number of clusters
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

# Load the min-max scaled dataset and prepare the initial center
X = load_and_scale()
# Start from a single center picked by the k-means++ initializer; X-Means is
# used here to determine the number of clusters automatically
amount_initial_centers = 1
initial_centers = kmeans_plusplus_initializer(X, amount_initial_centers).initialize()
# Create instance of X-Means algorithm. The algorithm starts the analysis from 1 cluster;
# the maximum number of clusters that can be allocated is 100 (third argument).
xmeans_instance = xmeans(X, initial_centers, 100)
xmeans_instance.process()
# Extract clustering results: clusters and their centers
clusters = xmeans_instance.get_clusters()
centers = xmeans_instance.get_centers()
# Visualize clustering results; the centers are drawn with a '*' marker
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, X)
visualizer.append_cluster(centers, None, marker='*')
visualizer.show()

# Print the number of clusters found by X-Means
print( len(clusters) ) 

### Fuzzy c-means

In [None]:
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm

import matplotlib.pyplot as plt
# Set the default matplotlib figure size to (7, 5) for the plots drawn below
plt.rcParams["figure.figsize"] = (7,5)

def test_fcm(n_init_centers, m):
    """Run Fuzzy C-Means on the scaled dataset and plot the resulting clusters.

    The dataset is reloaded through ``load_and_scale()`` on every call, so
    each invocation is independent of the others.

    Args:
        n_init_centers: Number of initial centers requested from the
            k-means++ initializer.
        m: Value forwarded to ``fcm`` as its ``m`` keyword argument
            (presumably the fuzziness exponent; confirm in the pyclustering
            documentation).

    Returns:
        None. The clusters are only displayed through ``cluster_visualizer``.
    """
    X = load_and_scale()

    # Seed the algorithm with k-means++ centers, using the
    # FARTHEST_CENTER_CANDIDATE option of the initializer
    initial_centers = kmeans_plusplus_initializer(
        X,
        n_init_centers,
        kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE,
    ).initialize()

    # Create the Fuzzy C-Means instance, run it and collect the clusters
    fcm_instance = fcm(X, initial_centers, m=m)
    fcm_instance.process()
    clusters = fcm_instance.get_clusters()

    # Visualize clustering results (cluster centers are not drawn)
    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, X)
    visualizer.show()

# Try several (number of initial centers, m) configurations, one plot per run
for n_centers, m_value in ((5, 2), (6, 2.1), (6, 2.2), (6, 2.5)):
    test_fcm(n_centers, m_value)

### Expectation Maximization (EM) to reach Maximum Likelihood Estimation

In [None]:
from pyclustering.cluster.ema import ema, ema_visualizer
from pyclustering.utils import read_sample

# Create EM instance on the min-max scaled dataset.
# The second argument (5) is presumably the number of clusters to fit;
# confirm against the pyclustering `ema` documentation.
X = load_and_scale()
ema_instance = ema(X, 5)
# Run clustering process.
ema_instance.process()
# Get clustering results: clusters, covariances and centers (used as means below).
clusters = ema_instance.get_clusters()
covariances = ema_instance.get_covariances()
means = ema_instance.get_centers()
# Visualize obtained clustering results.
ema_visualizer.show_clusters(clusters, X, covariances, means)

### Genetic algorithm

In [None]:
%%time
from pyclustering.cluster.ga import genetic_algorithm, ga_observer, ga_visualizer


# Scale the data into (.01, .9) instead of the default (0, 1). This is a
# workaround for a pyclustering bug: contrary to what the docs say, the
# interval needs to be included.
# X must not be reloaded with the default range before it is handed to the
# algorithm, otherwise the workaround is silently lost.
X = load_and_scale(range=(.01, .9))

# Observer that is attached to the algorithm and later passed to the visualizer
observer_instance = ga_observer(True, True, True)

# Create genetic algorithm for clustering
ga_instance = genetic_algorithm(data=X,
                                count_clusters=4,  # number of clusters
                                chromosome_count=500,  # elements per population
                                population_count=2000,  # The amount of populations that essentially defines the amount of iterations.
                                count_mutation_gens=1,
                                observer=observer_instance)

# Start processing
ga_instance.process()

# Obtain results and show them
clusters = ga_instance.get_clusters()
ga_visualizer.show_clusters(X, observer_instance)

# Print the number of clusters
print("Amount of clusters: '%d'" % len(clusters))

### OPTICS

In [None]:
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.optics import optics, ordering_analyser, ordering_visualizer
import matplotlib.pyplot as plt

# Run cluster analysis where connectivity radius is bigger than real.
# `radius` and `neighbors` are passed to `optics` as its second and third
# arguments (presumably the connectivity radius and the minimum number of
# neighbors; confirm against the pyclustering `optics` documentation).
radius = 0.08
neighbors = 4
X = load_and_scale()
optics_instance = optics(X, radius, neighbors)

# Performs cluster analysis.
optics_instance.process()

# Obtain results of clustering: clusters, noise and the ordering.
clusters = optics_instance.get_clusters()
noise = optics_instance.get_noise()
ordering = optics_instance.get_ordering()

# Figure size used for the cluster plot
plt.rcParams["figure.figsize"] = (7,5)

# Visualize clustering results (clusters and outliers); noise uses an 'x' marker.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, X)
visualizer.append_cluster(noise, X, marker='x')
visualizer.show()

# Wider figure for the ordering diagram
plt.rcParams["figure.figsize"] = (20,7)

# Display ordering.
# The second argument (4) is presumably the number of clusters to mark on the
# diagram; confirm against the `ordering_visualizer` documentation.
analyser = ordering_analyser(ordering)
ordering_visualizer.show_ordering_diagram(analyser,4)