# Unsupervised Learning

# <font color="blue"> Case 1: Unsupervised Machine Learning using Synthetic Dataset (K-Means)</font>

Install required libraries

In [None]:
pip install 

To generate synthetic dataset

In [None]:
import numpy as np
import matplotlib.pyplot as 
from sklearn.cluster import 
from sklearn.datasets import 

# Create a synthetic dataset
n_samples = 
n_features = 
n_clusters = 

X, true_label = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)
print (true_label)
print (X)

**make_blobs** is a function provided by the scikit-learn library in Python, which is commonly used for generating synthetic datasets for clustering and classification tasks. It creates a specified number of blobs or clusters of data points, each characterized by its own center and spread. This is particularly useful for testing and visualizing machine learning algorithms.

The function's purpose is to generate datasets with well-defined cluster structures, making it easier to experiment with different algorithms and observe their behavior.

Parameters:<br>
(1) n_samples: The total number of data points to generate.<br>
(2) n_features: The number of features (or dimensions) for each data point.<br>
(3) centers: The number of clusters to generate.<br>
(4) cluster_std: The standard deviation of each cluster.<br>
(5) random_state: Seed for random number generation (optional).

In [None]:
# Visualize the dataset
plt.scatter(X[], X[], s=)   #https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.scatter.html
plt.title("Synthetic Dataset")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

In [None]:
# Apply K-Means clustering
kmeans = KMeans(n_clusters=)  #try to modify this to other values in the second round
kmeans.fit_predict(X)

# Get cluster centers and labels
cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_
print (cluster_centers)
print (cluster_labels)

In [None]:
# Draw every sample coloured by its assigned cluster, then mark each centroid
# with a red cross on top.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=cluster_labels, s=15, cmap='viridis')
ax.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    c='red',
    marker='x',
    s=200,
    label='Cluster Centers',
)
ax.set_title("K-Means Clustering Results")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
ax.legend()
plt.show()

# <font color="blue"> Case 2: Unsupervised Machine Learning using Synthetic Dataset (Mean-shift) </font>

In [None]:
from sklearn.cluster import 

Defined synthetic dataset

In [None]:
n_samples = 
n_features = 
n_clusters = 

Modify the **random_state parameter**

`random_state` is a parameter found in many machine learning libraries, such as scikit-learn in Python, that seeds the random number generator used by any step involving randomness. It ensures that the random numbers generated during different runs of your code are reproducible.

When you set the random_state parameter to a specific value, you're essentially providing a seed for the random number generator. This allows you to get the same random values every time you run your code with that particular seed.

In [None]:
X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)

# Visualize the dataset
plt.scatter(X[], X[], s=10)
plt.title("Synthetic Dataset")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

The quantile parameter in the estimate_bandwidth function from scikit-learn's sklearn.cluster module is used to control the bandwidth estimation in the Mean Shift clustering algorithm. Mean Shift is a density-based clustering algorithm that aims to find dense regions of data points in the feature space. A larger quantile produces a larger bandwidth, which generally results in fewer, broader clusters; a smaller quantile produces a smaller bandwidth and more, tighter clusters.

In [None]:
# Estimate the Mean Shift bandwidth from the data itself. `quantile` selects
# which quantile of the distances between neighbouring samples is used, so a
# larger value gives a wider bandwidth.
bandwidth = estimate_bandwidth(X, quantile=0.2)
meanshift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
labels = meanshift.fit_predict(X)  # fits the model and returns its labels_
print(bandwidth)

# To experiment with a hand-picked bandwidth instead, comment out the lines
# above and uncomment the two lines below:
# meanshift = MeanShift(bandwidth=0.5)
# labels = meanshift.fit_predict(X)

# Cluster centres discovered by Mean Shift (the number of clusters is not
# chosen in advance; it follows from the bandwidth)
cluster_centers = meanshift.cluster_centers_

# Visualize the clustering results: samples coloured by cluster, centres as red crosses
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
ax.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    c='red',
    marker='x',
    s=200,
    label='Cluster Centers',
)
ax.set_title("Mean Shift Clustering Results")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
ax.legend()
plt.show()

# <font color="blue"> Exercise </font>

Try to use different unsupervised machine learning algorithms on the synthetic dataset above

Reference:
https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering

In [None]:
#AgglomerativeClustering

from sklearn.cluster import 

# Generate synthetic data
n_samples = 
n_features = 
n_clusters = 

X, _ = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)

# Apply AgglomerativeClustering
agg_clustering = (n_clusters=n_clusters, affinity='euclidean', linkage='ward')
predicted_labels = agg_clustering.fit_predict(X)

# Plot the data and clusters
plt.scatter(X[:, 0], X[:, 1], c=predicted_labels, cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Agglomerative Clustering')
plt.show()

In [None]:
#GaussianMixture

import numpy as 
import matplotlib.pyplot as 
from sklearn.mixture import
from sklearn.datasets import 

# Generate synthetic data
n_samples = 
n_features = 
n_clusters = 

X, _ = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)

# Apply GaussianMixture clustering
gmm = (n_components=n_clusters, covariance_type='full')
predicted_labels = gmm.fit_predict(X)

# Plot the data and clusters
plt.scatter(X[:, 0], X[:, 1], c=predicted_labels, cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Gaussian Mixture Clustering')
plt.show()

# <font color="blue"> Case 3: Unsupervised Machine Learning using Iris Flower Dataset (K-means) </font>

In [None]:
from sklearn import datasets
import sklearn.metrics as sm

import pandas as pd
import numpy as np

In [None]:
# import data
iris = pd.read_csv('')

## Split the data into feature and label

In [None]:
# Split the data into features and label. No preprocessing is required because
# the features only involve numbers.
X = iris.iloc[] # inputs into the model (fill in the selection of the feature columns)
# Species column = the true label. Bear in mind that unsupervised learning does
# not use it; it is kept in this practical only so that students can compare
# the clusters with the actual species and see how accurate the clustering is.
y = iris.species

In [None]:
X.head()

In [None]:
y.head()

In [None]:
num_row=len(y)
print(num_row)

In [None]:
iris_species = y.tolist()
print(iris_species)

![](https://i1221.photobucket.com/albums/dd476/kk_yin/u1.png)

## Building the Kmeans model

You'll now create a KMeans model to find 3 clusters, and fit it to the data points from the previous exercise. After the model has been fit, you'll obtain the cluster labels for some new points using the <font color="blue">.predict()</font> method.

You are given the array points from the previous exercise, and also an array new_points.

### Instructions

- Import KMeans from sklearn.cluster.
- Using KMeans(), create a KMeans instance called model to find 3 clusters. To specify the number of clusters, use the n_clusters keyword argument.
- Use the .fit_predict() method of model to fit the model to the array of points points.
- Use the .predict() method of model to predict the cluster labels of new_points, assigning the result to labels.



In [None]:
# Import KMeans
from sklearn.cluster import 

# Create a KMeans instance with 3 clusters: model
km = KMeans(n_clusters=)

# Fit model to points
km.fit_predict()

# Determine the cluster labels of new_points: labels
labels = km.labels_

# Print cluster labels of new_points
labels #from here we can see group 0-vessicolor， group 1-setosa,  and group2-verginica

### Estimate the optimal number of clusters in K-means clustering

inertia_ attribute is commonly used to estimate the optimal number of clusters in K-means clustering. The concept is known as the "elbow method." The idea is to plot the inertia values for different numbers of clusters and look for an "elbow point" on the plot. The elbow point represents a point of diminishing returns where adding more clusters doesn't significantly decrease the inertia.

The location of the elbow point can give you a heuristic estimate of the optimal number of clusters to use in your K-means clustering. However, it's important to note that the elbow method is not always a definitive criterion, and the choice of the number of clusters should also be guided by domain knowledge, problem-specific considerations, and other validation techniques.


In [None]:
# Elbow method: fit K-Means for k = 1..6 and record the inertia (sum of squared
# distances of the samples to their closest cluster centre) of every fit.
ks = range(1, 7)
inertias = []

for k in ks:
    # Fixed random_state and explicit n_init make the curve reproducible;
    # an unseeded K-Means can give slightly different inertias on each run.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit model to samples (only the fitted state is needed, not the labels)
    model.fit(X)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# Plot ks vs inertias; the "elbow" of this curve suggests a suitable k
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

## Correspondence with iris species

The purpose of the following code is to compare side by side between Actual label and Cluster label (from K-means) so that we can obtain the accuracy and conclude if the unsupervised machine learning is performing well. However, in most of the cases, there will be no Actual label.

### Instructions

Use the <font color="blue">pd.crosstab()</font> function on df['km_labels'] and df['species'] to count the number of times each iris species coincides with each cluster label. Assign the result to ct.


In [None]:
# Pair each sample's K-Means cluster id with its actual species name: df
df = pd.DataFrame(
    {
        'km_labels': labels,
        'species': iris_species,
    }
)
df

In [None]:
print(df.to_string())

In [None]:
# Cross-tabulate cluster ids (rows) against actual species (columns): ct
ct = pd.crosstab(index=df['km_labels'], columns=df['species'])
print(ct)  # cluster-vs-actual table (reads like a confusion matrix)

# Comparing the actual and the cluster result in this table also lets us
# measure the performance of the clustering method (similar to a confusion matrix).

## Measuring Quality of Clustering 1 (When label is unknown)

In K-means clustering, inertia_ is an attribute of the fitted KMeans model in scikit-learn that represents the sum of squared distances between each data point and its assigned cluster center. It is often used as a measure of how well the data points are grouped around their respective cluster centroids. Lower values of inertia indicate that the data points are closer to their cluster centers and, therefore, the clustering is tighter.

In [None]:
from sklearn.cluster import KMeans

# Refit K-Means with 3 clusters. n_init and random_state are pinned so the
# reported inertia is the same on every run instead of depending on the
# random initial centroids.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(X)

# Sum of squared distances between each sample and its assigned cluster centre
print(km.inertia_)

# Lower inertia indicates better clustering (i.e., clusters are tighter and more
# compact), but it is not the only metric to evaluate the clustering quality.

## Measuring Quality of Clustering 2 (When label is known)
**[Clustering metrics](https://scikit-learn.org/stable/modules/classes.html)**
### Convert class to integer

In [None]:
# Convert class to integer: "Iris-setosa" = 0, "Iris-versicolor" = 1 and
# "Iris-virginica" = 2.
SPECIES_TO_CODE = {
    "Iris-setosa": 0,
    "Iris-versicolor": 1,
    "Iris-virginica": 2,
}

# Vectorised replacement for the element-by-element loop: builds a new Series
# instead of assigning into a column slice of `iris` (chained assignment), and
# stores real integers rather than the strings "0"/"1"/"2".
# Values that are not one of the three names are left untouched, so running
# the cell a second time changes nothing.
y = y.map(lambda species: SPECIES_TO_CODE.get(species, species))

In [None]:
from sklearn.metrics.cluster import v_measure_score

# v_measure_score expects (labels_true, labels_pred): the actual species come
# first and the K-Means cluster ids second. The score is symmetric, so the
# value is the same as with the arguments swapped, but this order matches the API.
v_measure_score(y, labels)
# Reference: https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/

# <font color="red"> Exercise </font>

# 1.0 Mean Shift

### Import the library

In [None]:
from sklearn.cluster import 

bandwidth = estimate_bandwidth(X)
ms = 

### Find out the number of estimated clusters by Mean Shift

In [None]:
# Number of clusters found by Mean Shift = number of distinct label values
labels = ms.labels_
labels_unique = np.unique(labels)
n_clusters = labels_unique.size

print("number of estimated clusters : %d" % n_clusters)

### Fit Mean Shift model and generate the crosstab

In [None]:
# Create a Mean Shift model: ms
ms = 

# Use fit_predict to fit the model and obtain the cluster labels: ms_labels
ms_labels = 

# Create a DataFrame with the cluster labels and the species as columns: df
df = 

# Create crosstab of cluster labels vs species: ms_ct
ms_ct = 

#print(df.to_string())

print(ms_ct)

### Calculate the score using  *v_measure_score()*

In [None]:
# Argument order follows the API (labels_true, labels_pred): actual species
# first, Mean Shift cluster ids second. The score is symmetric, so its value
# does not depend on the order.
v_measure_score(y, ms_labels)

__Example output:__ 0.6994

# 2.0 GMM

### Import library

In [None]:
from sklearn.mixture import 
gmm = 

### Fit GMM model

In [None]:
gmm_labels = 
print(gmm_labels)

### Generate the ct

In [None]:
# Create a DataFrame with labels and varieties as columns: df
df = 

# Create crosstab: ct
gmm_ct = 
print(gmm_ct)

### Calculate the score using *v_measure_score()*

In [None]:
# Argument order follows the API (labels_true, labels_pred): actual species
# first, GMM cluster ids second. The score is symmetric, so its value does not
# depend on the order.
v_measure_score(y, gmm_labels)

__Example output:__ 0.8997

# 3.0 Agglomerative Hierarchical Clustering

### Import library

In [None]:
from sklearn.cluster import
ac = 

### Fit the model

In [None]:
ac_labels = 
print(ac_labels)

### Generate the ct

In [None]:
# Create a DataFrame with labels and varieties as columns: df
df = 

# Create crosstab: ct
ac_ct = 
print(ac_ct)

__Example output:__ 0.7701