In [140]:
from ipaddress import summarize_address_range
from uu import encode

import numpy as np
import pandas as pd
import networkx as nx
from datasets import load_dataset
from distributed.utils_test import cluster
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import eigs
from sklearn.linear_model import Lasso

In [160]:
!pip install nltk



## Pre-processing

In [2]:
# Read data with 'label' column
user_features_df = pd.read_csv('/Users/cookie/Desktop/FYP/1000k/parameter_labeled.csv')

# Separate data to two part: labeled and unlabeled
labeled_data = user_features_df[user_features_df['labels'].notna()]
unlabeled_data = user_features_df[user_features_df['labels'].isna()]

In [162]:
type(labeled_data)

pandas.core.frame.DataFrame

In [3]:
labeled_data

Unnamed: 0,user_id,average_time_float,purchase_ratio,brand_loyalty_ratio,most_freq_category_1,most_freq_category_2,most_freq_category_3,average_price,labels
0,548705437,30448.36,0.00,0.45,apparel.shoes,apparel.shirt,appliances.kitchen.coffee_grinder,90.77,Exploratory.Fashion.Culinary_enthusiasts
1,524063431,41331.56,0.00,0.44,kids.skates,apparel.shoes,apparel.shoes.keds,112.03,Exploratory.Fashion
2,551478714,28864.61,0.05,0.58,construction.tools.light,appliances.kitchen.refrigerators,appliances.personal.massager,218.13,Exploratory.Brand_loyalty.Culinary_enthusiasts
3,577195981,46225.21,0.00,0.57,auto.accessories.compressor,kids.toys,construction.components.faucet,330.63,Exploratory.Brand_loyalty
4,518942986,55148.62,0.00,0.88,construction.tools.light,electronics.smartphone,appliances.kitchen.refrigerators,461.83,Exploratory.Brand_loyalty.Culinary_enthusiasts...
...,...,...,...,...,...,...,...,...,...
986,519165542,54785.86,0.00,0.36,apparel.shoes,apparel.shoes.moccasins,furniture.living_room.chair,105.21,Exploratory.Fashion
987,580049641,21453.24,0.17,0.63,construction.tools.light,appliances.kitchen.refrigerators,construction.components.faucet,710.62,Early_bird.Decisive.Brand_loyalty.Culinary_ent...
988,524842491,27035.82,0.00,0.45,apparel.shirt,apparel.costume,construction.tools.generator,75.30,Exploratory
989,578570529,47626.60,0.00,0.35,computers.notebook,electronics.audio.headphone,construction.tools.light,710.80,Exploratory.Geek.high_consumer


In [4]:
# read category embedding result
access_token = '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'
dataset = load_dataset(
    "CookieLyu/Category_Codes",
    revision="1000k_average_embedded",
    token=access_token
)

In [5]:
# change embedding to DataFrame
user_embeddings_df = dataset['train'].to_pandas()

user_embeddings_df.head()

Unnamed: 0,average_embedding,user_id
0,"[-0.052206106, 0.012066072, 0.005803698, -0.03...",354197964
1,"[-0.07651164, 0.07044784, -0.023678316, 0.0057...",416533463
2,"[-0.052098688, 0.01875619, -0.0018646835, 0.00...",429051181
3,"[-0.041831866, 0.048917726, 0.0015723156, 0.04...",442487275
4,"[-0.07230235, 0.06587962, -0.027627505, 0.0325...",453521292


In [6]:
# Combine average embedding and other features
labeled_data = labeled_data.merge(user_embeddings_df, on='user_id')
unlabeled_data = unlabeled_data.merge(user_embeddings_df, on='user_id')

In [7]:
# Standardization
numerical_features = ['average_time_float', 'purchase_ratio', 'brand_loyalty_ratio', 'average_price']

scaler = StandardScaler()
labeled_data[numerical_features] = scaler.fit_transform(labeled_data[numerical_features])
scaler = StandardScaler()
unlabeled_data[numerical_features] = scaler.fit_transform(unlabeled_data[numerical_features])

In [8]:
def convert_embedding(x):
    if isinstance(x, str):
        # If it's a string, assume it's a JSON-like list and convert
        return np.array(eval(x))
    elif isinstance(x, (list, np.ndarray)):
        # If it's already a list or array, convert it directly
        return np.array(x)
    else:
        # If it's an unexpected type, raise an error for manual inspection
        raise ValueError(f"Unexpected type {type(x)} for embedding: {x}")

# Apply the function to convert embeddings to numpy arrays
labeled_data['average_embedding'] = labeled_data['average_embedding'].apply(convert_embedding)
unlabeled_data['average_embedding'] = unlabeled_data['average_embedding'].apply(convert_embedding)

In [9]:
labeled_features = np.hstack([labeled_data[numerical_features].values, np.vstack(labeled_data['average_embedding'].values)])
unlabeled_features = np.hstack([unlabeled_data[numerical_features].values, np.vstack(unlabeled_data['average_embedding'].values)])

In [10]:
# Concatenate labeled and unlabeled features to create the complete feature matrix
all_features = np.vstack([labeled_features, unlabeled_features])

seed_indices = list(range(len(labeled_data)))

In [38]:
seed_indices

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [21]:
all_features

array([[ 6.10635369e-01, -2.66431399e-01,  1.22997171e-01, ...,
        -6.27513677e-02, -7.59735180e-04,  9.29126795e-03],
       [ 9.71186401e-01, -2.66431399e-01,  7.90696099e-01, ...,
         2.32733116e-02,  4.40258868e-02,  5.36147691e-02],
       [ 1.62863936e+00, -2.66431399e-01,  2.38290124e+00, ...,
        -2.34026313e-02,  4.01783995e-02,  4.36246246e-02],
       ...,
       [ 1.01202230e+00, -2.61890085e-01,  1.45704195e-01, ...,
         7.01272711e-02,  2.48669311e-02,  1.93553194e-02],
       [-2.45351960e-01, -2.61890085e-01,  1.43743823e+00, ...,
        -4.90485989e-02,  1.32980980e-02, -5.28923944e-02],
       [-3.87132745e-01, -2.61890085e-01, -2.67650698e-01, ...,
        -3.54005359e-02, -1.06524071e-02, -2.73802876e-03]])

# Label One-Hot Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Example: Labels (categories)
labels = np.array(['Night_owl', 'Early_bird', 'Decisive', 'Exploratory', 'Brand_loyalty', 'Fashion',  'Geek', 'Culinary_enthusiasts', 'high_consumer']).reshape(-1, 1)

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the labels
one_hot_encoded = encoder.fit_transform(labels)

print("Original Labels:")
print(labels.flatten())

print("\nOne-Hot Encoded:")
print(one_hot_encoded)

# Mapping of categories
print("\nCategories Mapping:")
print(encoder.categories_)


Original Labels:
['Night_owl' 'Early_bird' 'Decisive' 'Exploratory' 'Brand_loyalty'
 'Fashion' 'Geek' 'Culinary_enthusiasts' 'high_consumer']

One-Hot Encoded:
[[0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]

Categories Mapping:
[array(['Brand_loyalty', 'Culinary_enthusiasts', 'Decisive', 'Early_bird',
       'Exploratory', 'Fashion', 'Geek', 'Night_owl', 'high_consumer'],
      dtype='<U20')]


In [14]:
split_labels = labeled_data['labels'].str.split('.')
split_labels

0                                 [Exploratory, Fashion]
1                           [Exploratory, Brand_loyalty]
2      [Exploratory, Brand_loyalty, Culinary_enthusia...
3                    [Exploratory, Culinary_enthusiasts]
4                           [Exploratory, Brand_loyalty]
                             ...                        
679    [Early_bird, Exploratory, Fashion, Culinary_en...
680                            [Early_bird, Exploratory]
681                               [Exploratory, Fashion]
682                                        [Exploratory]
683                   [Exploratory, Geek, high_consumer]
Name: labels, Length: 684, dtype: object

In [15]:
from tqdm import tqdm
encoded_labels = []

for label in tqdm(split_labels):
    n = np.array(label)
    e_label = encoder.transform(n.reshape(-1, 1))
    encoded_labels.append(e_label)


100%|██████████| 684/684 [00:00<00:00, 12936.92it/s]


In [16]:
encoded_labels

[array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.]]),
 array([[0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 array([[0., 0.,

## Graph-based semi-supervised clustering

In [None]:
def build_graph(features):
    # Compute similarity between users using cosine similarity
    similarity_matrix = cosine_similarity(features)
    np.fill_diagonal(similarity_matrix, 0)  # No self-loops
    graph = nx.from_numpy_array(similarity_matrix)
    return graph

In [None]:
def compute_laplacian(graph):
    adj_matrix = nx.to_numpy_array(graph)
    degree_matrix = np.diag(np.sum(adj_matrix, axis=1))
    laplacian_matrix = degree_matrix - adj_matrix
    return laplacian_matrix

In [None]:
def compute_laplacian_eigenvectors(laplacian_matrix, k):
    # Calculate the eigenvalues and eigenvectors of the Laplacian matrix
    _, eigvecs = eigs(laplacian_matrix, k=k, which='SM')
    return np.real(eigvecs)

In [None]:
def compressive_sensing_clustering(laplacian_matrix, seed_nodes, num_clusters=2):
    # Extract nodes related to the seed nodes using iterative clustering (simplified version)
    # Placeholder implementation: This is where the compressive sensing approach should be applied.
    clusters = {i: [] for i in range(num_clusters)}
    for idx, node in enumerate(seed_nodes):
        clusters[idx % num_clusters].append(node)
    return clusters

### big dataset

In [26]:
def construct_graph(data, k=5):
    """
    Construct a k-NN graph from data points using cosine similarity.
    """
    # Compute cosine similarity matrix
    similarity = cosine_similarity(data)
    n_samples = data.shape[0]
    adj_matrix = np.zeros((n_samples, n_samples))

    for i in range(n_samples):
        # Get indices of k most similar neighbors (excluding self)
        k_neighbors = np.argsort(similarity[i])[-(k + 1):-1][::-1]
        
        for j in k_neighbors:
            # Use similarity directly for the adjacency matrix
            adj_matrix[i][j] = similarity[i][j]
            adj_matrix[j][i] = adj_matrix[i][j]

    return adj_matrix

In [29]:
adj = construct_graph(all_features)
len(adj)

6754

In [31]:
len(all_features)

6754

In [32]:
# Step 2: Laplacian Matrix Calculation
def compute_laplacian(adj_matrix):
    """
    Compute the random walk Laplacian matrix.
    """
    
    # Create degree matrix
    degree_matrix = np.diag(np.sum(adj_matrix, axis=1))
    
    # # Handle cases where the degree matrix might be singular
    # with np.errstate(divide='ignore', invalid='ignore'):
    #     degree_matrix_inv = np.linalg.inv(degree_matrix)
    #     degree_matrix_inv[np.isinf(degree_matrix_inv)] = 0  # Replace infinities (from 1/0) with 0
    #     degree_matrix_inv = np.nan_to_num(degree_matrix_inv) # Replace NaNs (from 0/0) with 0
    
    # P is transition matrix, which derived from the adjacency matrix A and the degree matrix D
    # P = D^(-1) * A
    # Each entry Pij represents the probability of transitioning from node i to node j during the random walk
    # L = I - P
    
    # linalg.inv() compute the inverse of a matrix
    # matmul() matrix product of two arrays
    # np.eye() Return a 2-D array with ones on the diagonal and zeros elsewhere.
    # I = identify matrix 单位矩阵
    laplacian = np.eye(len(adj_matrix)) - np.matmul(np.linalg.inv(degree_matrix), adj_matrix)
    return laplacian

In [122]:
# Step 3: Initial Random Walk
def random_walk(laplacian, label, data, seed_indice, steps=3):
    """
    Perform a random walk on the graph starting from the seed nodes.
    
    If X^(t) is the label score matrix at step t, the random walk process can be expressed as:
        X^(t+1) = P * X^(t) or equivalently X^(t+1) = (I - L) * X^(t)
    Here, L is the random walk Laplacian.
    
    label: list
    """
    n_samples = data.shape[0]
    n_classes = label[0].shape[1]
    
    # Create empty labels for unlabeled usr 
    emp = [np.array([0.]*n_classes)]*(n_samples - len(seed_indice))
    # print('emp: ', emp)
    _label = []
    for x in label:
        _label.append(np.sum(x, axis=0))
        
    label = _label + emp
    label_matrix = np.matrix(label)
    print(label_matrix.shape)
    print(label_matrix)
    
    # Normalize the initial distribution for labeled nodes
    # initial_vector = initial_vector / (initial_vector.sum(axis=1, keepdims=True) + 1e-10)
    
    # Transition matrix (random walk matrix)
    transition_matrix = np.eye(n_samples) - laplacian

    # Perform the random walk with laplacian
    # matrix_power(), raise a square matrix to the (integer) power n.
    random_walk_result = np.linalg.matrix_power(transition_matrix, steps).dot(label_matrix)
    
    # Normalize rows of the result (optional: interpret as probabilities)
    row_sums = random_walk_result.sum(axis=1).reshape(-1, 1)
    normalized_result = random_walk_result / (row_sums + 1e-10)
    
    return normalized_result

In [123]:
lap = compute_laplacian(adj)
rw = random_walk(lap, encoded_labels, all_features, seed_indices, steps=3)

(6754, 9)
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [133]:
rw

matrix([[0.1552312 , 0.01035398, 0.        , ..., 0.00937703, 0.        ,
         0.        ],
        [0.29253523, 0.19472133, 0.        , ..., 0.17339231, 0.        ,
         0.02042895],
        [0.2815258 , 0.1828134 , 0.        , ..., 0.02860971, 0.        ,
         0.1436035 ],
        ...,
        [0.07367334, 0.2921818 , 0.        , ..., 0.1878148 , 0.        ,
         0.        ],
        [0.29822664, 0.2006717 , 0.        , ..., 0.05926276, 0.        ,
         0.        ],
        [0.        , 0.07486562, 0.        , ..., 0.07333305, 0.0134168 ,
         0.        ]])

In [138]:
# Step 4: Candidate Set Extraction
def extract_candidate_set(random_walk_result, threshold=0.3):
    """
    Extract candidate nodes with multiple labels based on random walk scores.

    """
    candidates = {}
    n_samples, n_classes = random_walk_result.shape

    for i in range(n_samples):
        # Find all classes for the current node that exceed the threshold
        labels_with_probs = [
            (c, f"{random_walk_result[i,c] * 100:.2f}%") for c in range(n_classes) if random_walk_result[i, c] > threshold]
        if labels_with_probs:
            candidates[i] = labels_with_probs

    
    return candidates

In [139]:
extraction = extract_candidate_set(rw)
extraction

{0: [(4, '41.51%'), (5, '41.00%')],
 3: [(4, '46.13%')],
 4: [(0, '38.87%'), (4, '35.55%')],
 5: [(4, '57.55%')],
 6: [(0, '36.91%')],
 7: [(0, '30.25%'), (4, '30.25%')],
 8: [(4, '78.02%')],
 9: [(4, '53.81%')],
 10: [(4, '34.17%')],
 11: [(4, '40.40%')],
 13: [(4, '39.54%')],
 14: [(1, '37.26%'), (4, '43.80%')],
 15: [(4, '39.19%')],
 16: [(0, '38.28%'), (4, '31.15%')],
 18: [(4, '33.02%')],
 19: [(4, '30.26%')],
 20: [(4, '47.84%')],
 21: [(4, '45.14%'), (5, '36.91%')],
 22: [(4, '32.39%')],
 24: [(4, '46.86%')],
 25: [(4, '67.38%')],
 26: [(0, '33.25%'), (4, '34.20%')],
 28: [(0, '30.61%')],
 29: [(0, '37.57%'), (4, '37.57%')],
 30: [(4, '46.87%')],
 31: [(4, '48.58%')],
 32: [(4, '51.99%'), (5, '31.52%')],
 33: [(4, '36.10%'), (5, '31.33%')],
 34: [(4, '39.52%')],
 37: [(4, '45.33%'), (5, '36.88%')],
 38: [(0, '40.75%'), (4, '40.75%')],
 39: [(4, '47.27%')],
 40: [(4, '49.98%')],
 41: [(4, '36.77%'), (5, '32.69%')],
 43: [(4, '41.89%')],
 45: [(4, '43.27%')],
 46: [(4, '36.55%')],

In [161]:
with open('/Users/cookie/Desktop/FYP/extraction.txt', 'w') as f:
    for k, v in extraction.items():
        sub_v = [x[0] for x in v]
        f.write(f"{k}: {v}\n")

In [141]:
# # Step 5: Compressive Sensing Optimization
# def compressive_sensing_laplacian(L, y, sparsity_level):
#     """
#     Solve compressive sensing using Iterative Hard Thresholding or similar methods.
#     
#     L: laplacian matrix
#     y: observed labels or measurements
#     sparsity_level: desired sparsity percentage
#     """
#     model = Lasso(alpha=0.01, max_iter=1000)
#     model.fit(L, y)
#     x = model.coef_
#     # Threshold to ensure sparsity
#     x[np.abs(x) < np.percentile(np.abs(x), 100 - sparsity_level)] = 0
#     return x

In [157]:
encoded_labels_matrix = np.matrix([np.sum(x, axis=0) for x in encoded_labels])
encoded_labels_matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 1.]])

In [153]:
sp = compressive_sensing_laplacian(np.asarray(lap_matrix), np.asarray(encoded_labels_matrix), 10)
sp

ValueError: Found input variables with inconsistent numbers of samples: [6754, 684]

In [None]:
# # Step 6: Thresholding to Extract the Cluster
# def extract_cluster_from_solution(x, rejection_threshold=0.1):
#     """
#     Extract nodes based on the compressive sensing solution.
#     """
#     return np.where(x > rejection_threshold)[0]


In [None]:
# # Main function for Semi-supervised Local Clustering
# def graph_based_semi_supervised_clustering(data, encoded_labels):
#     # Step 1: Construct k-NN Graph
#     adj_matrix = construct_graph(data)
#     
#     # Step 2: Compute Laplacian
#     laplacian = compute_laplacian(adj_matrix)
#     
#     # Step 3: Random Walk
#     random_walk_result = random_walk(laplacian, encoded_labels)
#     
#     # Step 4: Extract Candidate Set
#     candidate_set = extract_candidate_set(random_walk_result)
#     
#     # Step 5: Compressive Sensing to Improve Cluster Extraction
#     y = np.sum(laplacian[candidate_set, :], axis=0)
#     x = compressive_sensing_laplacian(laplacian, y, sparsity_level=10)
#     
#     # Step 6: Threshold to Extract Cluster
#     cluster = extract_cluster_from_solution(x)
#     return cluster

In [None]:
from sklearn.decomposition import PCA

# Reduce to 2D
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(all_features)


In [None]:
cluster.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Cluster matrix visualization
plt.figure(figsize=(10, 8))
sns.heatmap(cluster, cmap='coolwarm', cbar=True, xticklabels=True, yticklabels=False)
plt.title('Node Membership Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('Nodes')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming `cluster` contains cluster labels
# and `seed_indices` are indices of seed nodes

# Scatter plot
plt.figure(figsize=(8, 6))
for label in np.unique(cluster):
    cluster_points = reduced_features[cluster == label]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {label}", alpha=0.7)

# Highlight seed nodes
seed_points = reduced_features[seed_indices]
plt.scatter(seed_points[:, 0], seed_points[:, 1], color='red', edgecolor='k', label='Seed Nodes', s=100, marker='x')

plt.title("Graph-Based Semi-Supervised Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()
