<a href="https://colab.research.google.com/github/CaSh001/short-meetup-scheduler/blob/master/classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import scipy.io

In [3]:
import numpy as np
import scipy.io

# 1. data loading
def load_data(file_path):
    # Use the loadmat function in the SciPy library to load a data file in MATLAB format
    data = scipy.io.loadmat(file_path)
    # Extract a specific data matrix from the loaded data, e.g. 'salinas_corrected' or other appropriate key name
    # Return the data matrix
    return data['salinas_corrected']

# Call the load_data function to load a hyperspectral dataset
data = load_data('/content/Salinas_corrected.mat')

# 2. Normalization
def normalize_data(data):
    # Normalise the data
    # Use max-min normalisation to scale the data to a range from 0 to 1
    # Calculate the minimum and maximum values of the data
    min_value = np.min(data)
    max_value = np.max(data)
    # Perform normalisation calculations
    normalized_data = (data - min_value) / (max_value - min_value)
    return normalized_data

# Call the normalize_data function for normalization
normalized_data = normalize_data(data)

# 3. loading ground truth tagging data
gt_data = scipy.io.loadmat('/content/Salinas_gt.mat')['salinas_gt']

# Print normalised data and ground truth classification label data
print("Normalized Data:")
print(normalized_data)
print("Ground Truth Data:")
print(gt_data)


Normalized Data:
[[[0.0326535  0.04252549 0.06465611 ... 0.00509872 0.00238663 0.00303754]
  [0.0326535  0.04252549 0.0574962  ... 0.00509872 0.00238663 0.00282057]
  [0.0326535  0.04252549 0.06465611 ... 0.00444782 0.00238663 0.00238663]
  ...
  [0.04165763 0.05348232 0.06563246 ... 0.00184422 0.00162725 0.00119332]
  [0.04165763 0.05348232 0.07268388 ... 0.00119332 0.00119332 0.00173573]
  [0.03428076 0.04382729 0.07224995 ... 0.0013018  0.0013018  0.00086787]]

 [[0.04230853 0.04458668 0.06628336 ... 0.00466479 0.00238663 0.00282057]
  [0.03460621 0.04458668 0.05923194 ... 0.00423085 0.00282057 0.0026036 ]
  [0.04230853 0.0535908  0.05923194 ... 0.00401389 0.00173573 0.00282057]
  ...
  [0.04198308 0.0529399  0.06519852 ... 0.00151877 0.00119332 0.0006509 ]
  [0.04198308 0.0529399  0.06519852 ... 0.0019527  0.00119332 0.0026036 ]
  [0.03428076 0.04382729 0.07224995 ... 0.0013018  0.0013018  0.00086787]]

 [[0.04122369 0.0548926  0.06671729 ... 0.00433934 0.00216967 0.00303754]
  [0.

In [4]:
print(data.shape)
print(normalized_data.shape)
print(gt_data.shape)


(512, 217, 204)
(512, 217, 204)
(512, 217)


In [5]:
import numpy as np
from sklearn.decomposition import PCA

# Convert the 3D hyperspectral data to a 2D feature matrix
X = normalized_data.reshape(-1, normalized_data.shape[2])

# Create a PCA object and specify the number of components for dimensionality reduction
pca = PCA(n_components=150)

# Perform PCA dimensionality reduction
X_pca = pca.fit_transform(X)

# Print the shape of the reduced data
print("Shape of the reduced data:", X_pca.shape)


Shape of the reduced data: (111104, 150)


#####In the Salinas hyperspectral dataset, we treat it as a complete hyperspectral image. Thus, there is one hyperspectral image in the dataset. His shape of (512, 217, 204) indicates that the dataset contains 512 rows, 217 columns and 204 bands. This means that the dataset consists of 512 pixel samples, each with 204 different band values measured at 217 spatial locations.\

Split Data

In [6]:
import numpy as np

# WE have normalized_data and gt_data defined

# Reshape the normalized data to match the shape of the ground truth data
X = normalized_data.reshape(-1, normalized_data.shape[2])

# Flatten the ground truth data to a 1-dimensional array
labels = gt_data.ravel()

# Print the shapes of X and labels
print("Shape of X:", X.shape)
print("Shape of labels:", labels.shape)


Shape of X: (111104, 204)
Shape of labels: (111104,)


In [7]:

# Define the number of rows for the test set
test_rows_upper = 128

# Split the dataset into train and test based on spatial base
X_train = X[test_rows_upper:, :]
X_test = X[:test_rows_upper, :]
labels_train = labels[test_rows_upper:]
labels_test = labels[:test_rows_upper]

# Print the shapes of train and test data
print("Train data shape:", X_train.shape)
print("Test data shape:", X_test.shape)
print("Train labels shape:", labels_train.shape)
print("Test labels shape:", labels_test.shape)


Train data shape: (110976, 204)
Test data shape: (128, 204)
Train labels shape: (110976,)
Test labels shape: (128,)


In [8]:
# Split the dataset into train and test based on spatial base
X_train = X[test_rows_upper:, :]
X_test = X[:test_rows_upper, :]
y_train = labels[test_rows_upper:]
y_test = labels[:test_rows_upper]

Cross Validation

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Define your classifier
model = SVC()

# Concatenate the train and test data
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((labels_train, labels_test), axis=0)

# Perform cross-validation and calculate scores
scores = cross_val_score(model, X, y, cv=5)

# Print the cross-validation scores
print("Cross-validation scores:", scores)
print("Mean score:", np.mean(scores))




Cross-validation scores: [0.54786013 0.72597993 0.76625714 0.79249359 0.75450045]
Mean score: 0.7174182485077183


In [None]:
print("Number of training set samples. ", len(X_train))
print("Number of training set labels. ", len(y_train))



Number of training set samples.  110976
Number of training set labels.  409


DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the batch size for processing the data
batch_size = 128

# Initialize the metrics lists
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Create a DBSCAN model
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Process the data in batches
num_batches = int(np.ceil(X_test.shape[0] / batch_size))
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, X_test.shape[0])

    # Get the current batch for testing
    X_batch = X_test[start_idx:end_idx]
    y_batch = labels_test[start_idx:end_idx]

    # Fit the DBSCAN model on the current batch
    dbscan.fit(X_batch)

    # Predict the clusters for the current batch
    y_pred = dbscan.labels_

    # Calculate evaluation metrics for the current batch
    accuracy = accuracy_score(y_batch, y_pred)
    precision = precision_score(y_batch, y_pred, average='weighted')  # Update average parameter
    recall = recall_score(y_batch, y_pred, average='weighted')  # Update average parameter
    f1 = f1_score(y_batch, y_pred, average='weighted')  # Update average parameter

    # Append the metrics to the respective lists
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Clear memory by deleting variables
    del X_batch, y_batch, dbscan, y_pred

    # Garbage collect to free up memory
    import gc
    gc.collect()

    # Recreate the DBSCAN model
    dbscan = DBSCAN(eps=0.5, min_samples=5)

# Calculate the average metrics across all batches
accuracy_avg = np.mean(accuracy_list)
precision_avg = np.mean(precision_list)
recall_avg = np.mean(recall_list)
f1_avg = np.mean(f1_list)

# Print the average evaluation metrics
print("Average Accuracy:", accuracy_avg)
print("Average Precision:", precision_avg)
print("Average Recall:", recall_avg)
print("Average F1 Score:", f1_avg)




Average Accuracy: 0.921875
Average Precision: 1.0
Average Recall: 0.921875
Average F1 Score: 0.959349593495935


  _warn_prf(average, modifier, msg_start, len(result))


Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# 定义训练数据集的大小
train_size = min(5000, len(X_train))  # 根据可用RAM进行调整，这里假设为5000

# 随机选择训练数据集的子集
subset_indices = np.random.choice(X_train.shape[0], size=train_size, replace=False)
subset_data = X_train[subset_indices]
subset_labels = labels_train[subset_indices]

# 训练 Spectral Clustering 模型
spectral_clustering = SpectralClustering(n_clusters=10, random_state=42)
spectral_clustering.fit(subset_data)
train_labels_spectral = spectral_clustering.labels_

# 对测试集进行聚类预测
test_labels_spectral = []
for sample in X_test:
    closest_cluster_label = train_labels_spectral[np.argmin(pairwise_distances([sample], subset_data))]
    predicted_class = np.argmax(np.bincount(subset_labels[train_labels_spectral == closest_cluster_label]))
    test_labels_spectral.append(predicted_class)

# 计算评估指标
accuracy_spectral = accuracy_score(labels_test, test_labels_spectral)
precision_spectral = precision_score(labels_test, test_labels_spectral, average='weighted')
recall_spectral = recall_score(labels_test, test_labels_spectral, average='weighted')
f1_spectral = f1_score(labels_test, test_labels_spectral, average='weighted')

# 打印结果
print("Spectral Clustering Evaluation Metrics:")
print("Accuracy: {}".format(accuracy_spectral))
print("Precision: {}".format(precision_spectral))
print("Recall: {}".format(recall_spectral))
print("F1 Score: {}".format(f1_spectral))



Spectral Clustering Evaluation Metrics:
Accuracy: 0.2734375
Precision: 1.0
Recall: 0.2734375
F1 Score: 0.4294478527607362


  _warn_prf(average, modifier, msg_start, len(result))


The Constrained Laplacian Rank Algorithm

In [None]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import lil_matrix

# Define the number of clusters
num_clusters = 16  # You can adjust the number of clusters as needed


# Define the value of sigma (adjust according to  data)
sigma = 1.0

# Construct the similarity graph
similarity_graph = lil_matrix((X_train.shape[0], X_train.shape[0]), dtype=np.float32)

# Function to generate batches of data
def generate_batches(X, batch_size):
    num_samples = X.shape[0]
    num_batches = num_samples // batch_size
    if num_samples % batch_size != 0:
        num_batches += 1

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, num_samples)
        yield X[start_idx:end_idx]

# Perform spectral clustering
spectral = SpectralClustering(n_clusters=num_clusters, affinity='precomputed')

for batch in generate_batches(X_train, batch_size=1000):
    # Construct the similarity graph for the current batch
    similarity_matrix = pairwise_distances(batch, metric='euclidean')
    similarity_graph_batch = np.exp(-similarity_matrix ** 2 / (2 * sigma ** 2))

    # Update the similarity graph
    similarity_graph[np.ix_(range(batch.shape[0]), range(batch.shape[0]))] = similarity_graph_batch

    # Fit the spectral clustering model on the current batch
    labels_train_spectral = spectral.fit_predict(similarity_graph)

# Establish connections between clusters and classes (CLaRA)
connections = {}
for cluster_label in range(num_clusters):
    # Find the samples belonging to the current cluster
    cluster_indices = np.where(labels_train_spectral == cluster_label)[0]
    cluster_classes = labels_train[cluster_indices]
    # Calculate the unique classes and their frequencies within the cluster
    unique_classes, class_counts = np.unique(cluster_classes, return_counts=True)
    # Assign the dominant class as the class with the highest frequency
    dominant_class = unique_classes[np.argmax(class_counts)]
    # Store the mapping between cluster label and dominant class
    connections[cluster_label] = dominant_class

# Classify the test set based on the learned connections
test_labels_spectral = []
for sample in X_test:
    # Find the closest cluster label to the test sample
    distances = np.linalg.norm(X_train - sample, axis=1)
    closest_cluster_label = labels_train_spectral[np.argmin(distances)]
    # Retrieve the dominant class associated with the closest cluster label
    predicted_class = connections[closest_cluster_label]
    # Append the predicted class to the test labels
    test_labels_spectral.append(predicted_class)

# Calculate weighted evaluation metrics
accuracy_spectral_weighted = accuracy_score(labels_test, test_labels_spectral)
precision_spectral_weighted = precision_score(labels_test, test_labels_spectral, average='weighted')
recall_spectral_weighted = recall_score(labels_test, test_labels_spectral, average='weighted')
f1_spectral_weighted = f1_score(labels_test, test_labels_spectral, average='weighted')

# Print the results
print("Spectral Clustering Evaluation Metrics:")
print("Weighted Accuracy: {}".format(accuracy_spectral_weighted))
print("Weighted Precision: {}".format(precision_spectral_weighted))
print("Weighted Recall: {}".format(recall_spectral_weighted))
print("Weighted F1 Score: {}".format(f1_spectral_weighted))








Spectral Clustering Evaluation Metrics:
Weighted Accuracy: 1.0
Weighted Precision: 1.0
Weighted Recall: 1.0
Weighted F1 Score: 1.0


In [10]:
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#Step 1: Construct the data graph
def euclidean_distance_batch(data_batch):
    num_samples = data_batch.shape[0]
    similarity_matrix_batch = np.zeros((num_samples, num_samples))

    for i in range(num_samples):
        for j in range(i+1, num_samples):
            similarity_matrix_batch[i, j] = euclidean(data_batch[i], data_batch[j])
            similarity_matrix_batch[j, i] = similarity_matrix_batch[i, j]

    return similarity_matrix_batch

batch_size = 1000  # Adjust the batch size based on available memory
num_samples = X.shape[0]
num_batches = num_samples // batch_size + 1

similarity_matrix = np.zeros((num_samples, num_samples))

for batch_index in range(num_batches):
    start_index = batch_index * batch_size
    end_index = min((batch_index + 1) * batch_size, num_samples)
    data_batch = X[start_index:end_index]
    similarity_matrix_batch = euclidean_distance_batch(data_batch)
    similarity_matrix[start_index:end_index, start_index:end_index] = similarity_matrix_batch

print("Similarity Matrix:")
print(similarity_matrix)


Similarity Matrix:
[[0.         0.01785796 0.04407631 ... 0.         0.         0.        ]
 [0.01785796 0.         0.04254099 ... 0.         0.         0.        ]
 [0.04407631 0.04254099 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.01421215 0.02303759]
 [0.         0.         0.         ... 0.01421215 0.         0.02432158]
 [0.         0.         0.         ... 0.02303759 0.02432158 0.        ]]


In [11]:
#Step 2: Construct the adjacency matrix
from scipy.spatial.distance import euclidean
from scipy import sparse
import numpy as np

def euclidean_distance(x, y):
    return euclidean(x, y)

def construct_similarity_matrix(data):
    num_samples = data.shape[0]
    similarity_matrix = np.zeros((num_samples, num_samples))

    for i in range(num_samples):
        for j in range(i+1, num_samples):
            similarity_matrix[i, j] = euclidean_distance(data[i], data[j])
            similarity_matrix[j, i] = similarity_matrix[i, j]

    return similarity_matrix

def construct_adjacency_matrix(similarity_matrix, epsilon):
    adjacency_matrix = sparse.csr_matrix(similarity_matrix <= epsilon, dtype=np.int8)
    adjacency_matrix.setdiag(0)  # Set diagonal elements to 0
    return adjacency_matrix


# Rest of the code

# Split the data into smaller subsets
epsilon = 0.5  # Adjust based on the desired similarity threshold
num_subsets = 10  # Adjust the number of subsets based on available memory
subset_size = len(X) // num_subsets

adjacency_matrices = []
for i in range(num_subsets):
    subset_data = X[i * subset_size : (i + 1) * subset_size]
    subset_similarity_matrix = construct_similarity_matrix(subset_data)
    subset_adjacency_matrix = construct_adjacency_matrix(subset_similarity_matrix, epsilon)
    adjacency_matrices.append(subset_adjacency_matrix)

# Merge the adjacency matrices from all subsets
adjacency_matrix = sparse.vstack(adjacency_matrices)

print("Adjacency Matrix:")
print(adjacency_matrix.toarray())  # Convert to dense matrix for printing




Adjacency Matrix:
[[0 1 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 ...
 [1 1 1 ... 0 1 1]
 [1 1 1 ... 1 0 1]
 [1 1 1 ... 1 1 0]]


In [12]:
from sklearn.cluster import KMeans
import numpy as np

# Step 3: Optimize the data graph using CLR algorithm
def clr_optimization(adjacency_matrix, k):
    num_samples = adjacency_matrix.shape[0]
    kmeans = KMeans(n_clusters=k)
    labels = kmeans.fit_predict(adjacency_matrix)

    optimized_adjacency_matrix = np.zeros((num_samples, num_samples), dtype=np.int8)
    for i in range(num_samples):
        for j in range(i+1, num_samples):
            if labels[i] == labels[j]:
                optimized_adjacency_matrix[i, j] = 1
                optimized_adjacency_matrix[j, i] = 1

    return optimized_adjacency_matrix

k = 16  # The desired number of clusters

# Split the adjacency matrix into smaller subsets
subset_size = adjacency_matrix.shape[0] // 10  # Adjust the number of subsets based on available memory

optimized_adjacency_matrices = []
for i in range(0, adjacency_matrix.shape[0], subset_size):
    subset_adjacency_matrix = adjacency_matrix[i:i+subset_size]
    subset_optimized_adjacency_matrix = clr_optimization(subset_adjacency_matrix, k)
    optimized_adjacency_matrices.append(subset_optimized_adjacency_matrix)

# Merge the optimized adjacency matrices from all subsets
optimized_adjacency_matrix = np.vstack(optimized_adjacency_matrices)

print("Optimized Adjacency Matrix:")
print(optimized_adjacency_matrix)















Optimized Adjacency Matrix:
[[0 1 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 1 0 1]
 [0 0 0 ... 1 1 0]]


In [13]:
from sklearn.cluster import MiniBatchKMeans
import numpy as np

# Step 4: Perform graph-based clustering on the optimized graph
def perform_clustering(adjacency_matrix, k):
    num_samples = adjacency_matrix.shape[0]
    cluster_labels = np.zeros(num_samples, dtype=np.int32)
    subset_size = 1000  # Adjust the subset size based on available memory
    num_subsets = num_samples // subset_size

    for i in range(num_subsets):
        start_index = i * subset_size
        end_index = (i + 1) * subset_size
        subset_adjacency_matrix = adjacency_matrix[start_index:end_index]
        kmeans = MiniBatchKMeans(n_clusters=k)
        subset_labels = kmeans.fit_predict(subset_adjacency_matrix)
        cluster_labels[start_index:end_index] = subset_labels

    return cluster_labels

# Assuming you have already generated the optimized_adjacency_matrix and defined the variable k

cluster_labels = perform_clustering(optimized_adjacency_matrix, k)

print("Cluster Labels:")
print(cluster_labels)





Cluster Labels:
[3 3 3 ... 0 0 0]


In [21]:
import numpy as np

# Assuming labels and cluster_labels are your original arrays
labels = np.array(labels)
cluster_labels = np.array(cluster_labels)

# Check lengths
print(len(labels))
print(len(cluster_labels))

# Adjust the length of arrays
if len(labels) > len(cluster_labels):
    indices = np.random.choice(len(labels), size=len(cluster_labels), replace=False)
    labels = labels[indices]
elif len(cluster_labels) > len(labels):
    indices = np.random.choice(len(cluster_labels), size=len(labels), replace=False)
    cluster_labels = cluster_labels[indices]

# Now the lengths should be the same
print(len(labels))
print(len(cluster_labels))



111104
111100
111100
111100


In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def evaluate_model(cluster_labels, true_labels):
    accuracy = accuracy_score(true_labels, cluster_labels)
    precision = precision_score(true_labels, cluster_labels, average='weighted')
    recall = recall_score(true_labels, cluster_labels, average='weighted')
    f1 = f1_score(true_labels, cluster_labels, average='weighted')
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = evaluate_model(cluster_labels, labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Accuracy: 0.08772277227722772
Precision: 0.285998963091963
Recall: 0.08772277227722772
F1 Score: 0.1197278945872158


  _warn_prf(average, modifier, msg_start, len(result))
