## Clustering Using K-mean

In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# Read bounding box data from Excel
excel_path = '05_Excel_Output_Sample/03_JSONOutputcleaned.xlsx'
df = pd.read_excel(excel_path)

# Group by document name and append all rows for the column "Combined"
grouped_df = df.groupby('File Name & Page')['CombinedFactorized'].agg(lambda x: ' '.join(map(str, x))).reset_index()

# Convert a specific column to a list
column_name = 'CombinedFactorized'
column_list = grouped_df[column_name].astype(str).tolist()  # Ensure data is converted to string


# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(column_list)

# Number of clusters (you need to specify this)
num_clusters = 7

# K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Get cluster assignments for each document
cluster_assignments = kmeans.labels_

# Get cluster centers (representative points)
cluster_centers = kmeans.cluster_centers_

# Function to calculate Euclidean distance between a document and cluster center
def euclidean_distance(document_vector, cluster_center):
    return euclidean_distances(document_vector, cluster_center.reshape(1, -1))

# Assign documents to the nearest cluster based on Euclidean distance
document_clusters = {i: [] for i in range(num_clusters)}
for i, document_vector in enumerate(tfidf_matrix):
    distances = [euclidean_distance(document_vector, cluster_centers[cluster_id]) for cluster_id in range(num_clusters)]
    nearest_cluster = np.argmin(distances)
    document_clusters[nearest_cluster].append(column_list[i])

# Print the cluster for each document
for i, document in enumerate(column_list):
    cluster = cluster_assignments[i]
    print(f"Document {i + 1} belongs to Cluster {cluster + 1}")

# Print the documents in each cluster
for cluster_id, documents_in_cluster in document_clusters.items():
    print(f"Cluster {cluster_id + 1}:", documents_in_cluster)



Document 1 belongs to Cluster 2
Document 2 belongs to Cluster 6
Document 3 belongs to Cluster 6
Document 4 belongs to Cluster 3
Document 5 belongs to Cluster 1
Document 6 belongs to Cluster 1
Document 7 belongs to Cluster 1
Document 8 belongs to Cluster 2
Document 9 belongs to Cluster 6
Document 10 belongs to Cluster 6
Document 11 belongs to Cluster 6
Document 12 belongs to Cluster 1
Document 13 belongs to Cluster 2
Document 14 belongs to Cluster 6
Document 15 belongs to Cluster 6
Document 16 belongs to Cluster 6
Document 17 belongs to Cluster 6
Document 18 belongs to Cluster 3
Document 19 belongs to Cluster 2
Document 20 belongs to Cluster 6
Document 21 belongs to Cluster 6
Document 22 belongs to Cluster 6
Document 23 belongs to Cluster 2
Document 24 belongs to Cluster 6
Document 25 belongs to Cluster 6
Document 26 belongs to Cluster 6
Document 27 belongs to Cluster 2
Document 28 belongs to Cluster 6
Document 29 belongs to Cluster 3
Document 30 belongs to Cluster 2
Document 31 belongs

In [7]:
import pandas as pd

# Initialize a list to store dictionaries of cluster and document pairs
cluster_document_pairs = []

# Iterate through document_clusters and create dictionaries
for cluster_id, documents_in_cluster in document_clusters.items():
    for document in documents_in_cluster:
        cluster_document_pairs.append({'Cluster': cluster_id + 1, 'Document': document})

# Create a DataFrame from the list of dictionaries
cluster_document_df = pd.DataFrame(cluster_document_pairs)

# Remove duplicate entries from cluster_document_df
cluster_document_df.drop_duplicates(inplace=True)

# Merge cluster_document_df with grouped_df
merged_df = pd.merge(grouped_df, cluster_document_df, left_on='CombinedFactorized', right_on='Document', how='left')

# Drop the "Document" column as it is redundant
merged_df.drop(columns='Document', inplace=True)

# Print the merged DataFrame
print(merged_df)

# Save the merged DataFrame to a new Excel file
output_excel_path = '05_Excel_Output_Sample/04_JSONOutputclustered.xlsx'
merged_df.to_excel(output_excel_path, index=False)

# Print confirmation message
print(f"Final output saved to: {output_excel_path}")


                                       File Name & Page  \
0        1482H-2018-11-22-PRBI-CL2121A-01-BB-VI 3_page1   
1        1482H-2018-11-22-PRBI-CL2121A-01-BB-VI 3_page2   
2        1482H-2018-11-22-PRBI-CL2121A-01-BB-VI 3_page3   
3        1482H-2018-11-22-PRBI-CL2121A-01-BB-VI 3_page4   
4        1482H-2018-11-22-PRBI-CL2121A-01-BB-VI 3_page5   
...                                                 ...   
1137   PPMSB-PESB-2022-03-U29-E29229-SSIPSR2-UTTM_page2   
1138  PPMSB-PESB-2022-08-U21-E-21236-2-SSIPSR2-UTTM_...   
1139  PPMSB-PESB-2022-08-U21-E-21236-2-SSIPSR2-UTTM_...   
1140   PPMSB-PESB-2022-09-U25-E25218-SSIPSR2-UTTM_page1   
1141   PPMSB-PESB-2022-09-U25-E25218-SSIPSR2-UTTM_page2   

                                     CombinedFactorized  Cluster  
0     0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...        2  
1     228 229 230 231 232 233 234 235 236 237 238 23...        6  
2     228 229 230 231 232 233 234 235 236 237 238 23...        6  
3     228 229 230 231 2

## Assign Image to cluster folder

In [8]:
import os
import pandas as pd
from shutil import copyfile

# Load the Excel file into a DataFrame
excel_file_path = '05_Excel_Output_Sample/04_JSONOutputclustered.xlsx'
df = pd.read_excel(excel_file_path)

# Create a folder for each cluster #'c:/Users/mohamadfauzan.paimen/OneDrive - PETRONAS/Desktop/New folder/Clustering/Cluster'
output_folder_path = '04_Cluster'
os.makedirs(output_folder_path, exist_ok=True)

# Iterate through each row in the DataFrame and copy images to respective folders
for index, row in df.iterrows():
    # Add .jpg or .jpeg to 'File Name & Page'
    image_name = f"{row['File Name & Page']}.jpg" if not row['File Name & Page'].lower().endswith(('.jpg', '.jpeg')) else row['File Name & Page']
    cluster = str(row['Cluster'])
    
    # Create a folder for the cluster if it doesn't exist
    cluster_folder_path = os.path.join(output_folder_path, f'Cluster_{cluster}')
    os.makedirs(cluster_folder_path, exist_ok=True)
    
    # Copy the image to the cluster folder
    try: 
        image_source_path = os.path.join('03_Image_Sample/', image_name)  # Adjust this path accordingly
        image_destination_path = os.path.join(cluster_folder_path, image_name)
        copyfile(image_source_path, image_destination_path)
    except FileNotFoundError:
        print(f"File not found: {image_name}")

print(f"Images have been saved into folders based on the cluster in: {output_folder_path}")

Images have been saved into folders based on the cluster in: 04_Cluster
