In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

# Name of the column whose cells hold each row's embedding vector
embeddings_column_name = 'Embeddings'

# Read the merged workbook into a DataFrame
file_path = 'merged_file.xlsx'
df = pd.read_excel(file_path)


In [None]:
# Function to parse embeddings
def parse_embeddings(embeddings_str):
    """Convert one stored embedding into a 1-D float NumPy array.

    Parameters
    ----------
    embeddings_str : str, sequence, None or NaN
        Normally a string such as ``"[0.1, -0.2, 0.3]"``. Surrounding
        whitespace and the enclosing square brackets are ignored, as are empty
        items (for example after a trailing comma). A list, tuple or ndarray
        is accepted as already parsed, so re-applying the function to a column
        that was parsed earlier does not fail.

    Returns
    -------
    numpy.ndarray
        Float array rounded to 10 decimal places. A missing cell (``None`` or
        NaN) or an empty list such as ``"[]"`` yields an empty array.

    Raises
    ------
    ValueError
        If a non-empty item cannot be converted to ``float``.
    """
    # Already-parsed values pass straight through (same rounding applied).
    if isinstance(embeddings_str, (list, tuple, np.ndarray)):
        return np.asarray(embeddings_str, dtype=float).round(10)

    # Empty spreadsheet cells arrive as None or float NaN, which have no .strip().
    if embeddings_str is None or (
        isinstance(embeddings_str, float) and np.isnan(embeddings_str)
    ):
        return np.array([], dtype=float)

    # Trim outer whitespace first; otherwise the brackets are not at the ends
    # of the string and strip('[]') would leave them in place.
    body = str(embeddings_str).strip().strip('[]')

    # Skip blank items so "[]" and trailing commas do not reach float('').
    tokens = [token for token in body.split(',') if token.strip()]

    return np.array([float(token) for token in tokens], dtype=float).round(10)

# Parse every cell of the configured embeddings column into a NumPy array.
# The column is addressed through embeddings_column_name so the name is
# configured in one place instead of being hard-coded again here.
df[embeddings_column_name] = df[embeddings_column_name].apply(parse_embeddings)

# Expand the arrays into a DataFrame with one column per embedding value
# (the original note expects 768 columns -- not verified here)
embedding_df = pd.DataFrame(df[embeddings_column_name].to_list())


In [None]:
# Number of leading embedding columns to retain: half of the current width
columns_to_keep = embedding_df.shape[1] // 2

# Slice to those leading columns, discard every row that contains a NaN and
# renumber the surviving rows from 0 -- written as one chain rather than a
# sequence of in-place mutations.
# NOTE(review): re-running this cell halves the width again, and rows dropped
# here are not dropped from df -- confirm both are acceptable.
embedding_df = (
    embedding_df
    .iloc[:, :columns_to_keep]
    .dropna()
    .reset_index(drop=True)
)

Scale the data before applying DBSCAN.

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn per-column scaling statistics from the embeddings, then standardise them
scaler = StandardScaler().fit(embedding_df)
scaled_X = scaler.transform(embedding_df)
# NOTE(review): the clustering cells visible in this notebook fit on
# embedding_df rather than scaled_X -- confirm which input is intended.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
from sklearn.cluster import DBSCAN


#file path for the output text file
output_file_path = 'output_messages.txt'

# Run DBSCAN for eps = 0.1, 0.2, ..., 1.9 and log how many instances received
# each label. The file is overwritten on every run of this cell.
with open(output_file_path, 'w', encoding='utf-8') as output_file:

    cluster = []  # not used in this cell; kept so the names it defines are unchanged

    for k in range(1, 20):
        eps = k * 0.1

        model = DBSCAN(eps=eps)

        cluster_labels = model.fit_predict(embedding_df)

        unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)

        # Header line so each group of counts can be attributed to its eps;
        # without it the 19 runs are indistinguishable in the file.
        output_file.write(f"Epsilon = {eps:.1f}\n")

        # DBSCAN reports noise points under label -1, so a "Cluster -1" line
        # is the noise count rather than a real cluster.
        for label, count in zip(unique_labels, label_counts):
            message = f"Cluster {label}: {count} instances"
            output_file.write(message + '\n')

The code below builds a DataFrame that records, for each epsilon value, how many clusters were created and how many tenders fall into each cluster.

In [None]:
# Initialize a list to collect cluster information
cluster_info = []

# Epsilon values to test: 0.1, 0.2, ..., 1.9. linspace + round avoids the
# float accumulation of a fractional arange step (e.g. 0.30000000000000004),
# which would otherwise leak into tables and plot titles.
epsilon_range = np.round(np.linspace(0.1, 1.9, num=19), 1)

for epsilon in epsilon_range:
    model = DBSCAN(eps=epsilon)
    cluster_labels = model.fit_predict(embedding_df)
    unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)

    # Plain Python ints keep the stored dictionary easy to read and serialise
    counts_by_label = {
        int(label): int(count) for label, count in zip(unique_labels, label_counts)
    }

    # Create a dictionary to store cluster information.
    # DBSCAN assigns label -1 to noise points, so -1 is excluded from the
    # cluster count and reported separately in Num_Noise.
    cluster_data = {
        "Epsilon": float(epsilon),
        "Num_Clusters": sum(1 for label in counts_by_label if label != -1),
        "Num_Noise": counts_by_label.get(-1, 0),
        "Cluster_Info": counts_by_label,
    }

    # Append the cluster information to the list
    cluster_info.append(cluster_data)

# Create a DataFrame from the list of cluster information (one row per epsilon)
dbscan_cluster_df = pd.DataFrame(cluster_info)



Create a bar plot showing, for each epsilon value, the 10 clusters that contain the most tenders.

In [None]:
import matplotlib.pyplot as plt

# One horizontal bar chart per epsilon row: the (up to) 10 largest clusters.
for index, row in dbscan_cluster_df.iterrows():
    epsilon = row['Epsilon']
    # Distinct local name so the loop does not overwrite the notebook-level
    # `cluster_info` list or `cluster_labels` array with other kinds of objects.
    counts_by_label = row['Cluster_Info']

    # DBSCAN labels noise points -1; noise is not a cluster, so it is left out
    # of the ranking and reported in the title instead.
    noise_count = counts_by_label.get(-1, 0)

    # Rank the real clusters by size, largest first, and keep the top 10
    ranked = sorted(
        ((label, count) for label, count in counts_by_label.items() if label != -1),
        key=lambda item: item[1],
        reverse=True,
    )[:10]
    bar_names = [f'Cluster {label}' for label, _ in ranked]
    bar_counts = [count for _, count in ranked]

    fig, ax = plt.subplots()
    if ranked:  # with no clusters at all, show an empty, titled chart
        ax.barh(bar_names, bar_counts)
        ax.invert_yaxis()  # Invert the y-axis to show the highest count at the top
    ax.set_xlabel('Number of Tenders')
    ax.set_ylabel('Cluster Label')
    # .1f keeps float artefacts such as 0.30000000000000004 out of the title
    ax.set_title(
        f'Top 10 Clusters for Epsilon = {epsilon:.1f} (noise: {noise_count} tenders)'
    )
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per epsilon


Calculate the silhouette score of the DBSCAN model for each epsilon value and plot the scores. (This might not be useful.)

In [None]:

# Define the range of epsilon values
epsilon_range = np.linspace(1.0, 2.0, num=10)  # Adjust the range as needed

# Create an empty list to store silhouette scores for each epsilon
silhouette_scores = []

# Iterate over the range of epsilon values
for epsilon in epsilon_range:
    # Create and fit a DBSCAN model with the current epsilon
    dbscan = DBSCAN(eps=epsilon, min_samples=5)
    cluster_labels = dbscan.fit_predict(embedding_df)

    # silhouette_score requires between 2 and n_samples - 1 distinct labels and
    # raises ValueError otherwise (e.g. everything is noise, or everything
    # falls into one cluster). Record NaN for such runs so the sweep continues.
    # The noise label -1 is counted and scored like any other label here.
    n_labels = len(np.unique(cluster_labels))
    if 1 < n_labels < len(cluster_labels):
        score = silhouette_score(embedding_df, cluster_labels)
    else:
        score = np.nan
    silhouette_scores.append(score)

# Plot the silhouette scores for different epsilon values (NaN entries show as gaps)
plt.plot(epsilon_range, silhouette_scores, 'o--')
plt.xlabel("Epsilon Value")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score vs. Epsilon Value (DBSCAN)")
plt.show()


# Kmeans Clustering

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")

# K values to evaluate. The same range drives the fitting loop and the x-axis,
# so the plotted K always matches the K that produced each SSD value
# (previously the x-axis used an unrelated range of a different length).
elbow_k_range = range(25, 50)

ssd = []

for k in elbow_k_range:

    model = KMeans(n_clusters=k, random_state=5508)
    model.fit(embedding_df)

    #Sum of squared distances of samples to their closest cluster center.
    ssd.append(model.inertia_)

plt.plot(elbow_k_range, ssd, 'o--')
plt.xlabel("K Value")
plt.ylabel(" Sum of Squared Distances");

When applying the elbow method to determine the optimal value of K, it is challenging to identify clear elbow points in the sum of squared distances (SSD) plot. In such cases, an alternative technique like silhouette analysis can be employed to make a more informed decision about the appropriate number of clusters.

In [None]:
# Suppress every UserWarning from here on
warnings.filterwarnings("ignore", category=UserWarning)

# Candidate numbers of clusters: 2 through 24
k_range = range(2, 25)

# Mean silhouette (euclidean) of a seeded K-means fit, one entry per K
silhouette_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=5508)
    labels_for_k = kmeans.fit(embedding_df).labels_
    silhouette_scores.append(
        silhouette_score(embedding_df, labels_for_k, metric='euclidean')
    )

# Score as a function of K
plt.plot(k_range, silhouette_scores, 'o--')
plt.title("Silhouette Score vs. Number of Clusters")
plt.xlabel("K Value")
plt.ylabel(" Silhouette Score")
plt.show()

So far we are considering 415 tenders. After looking at the tenders individually, I feel that more than 10 clusters should be used.

As the K value increases, the silhouette score doesn't change much, which indicates that additional clusters beyond K=3 do not improve the clustering quality significantly. The clusters may continue to have similar patterns of separation and cohesion, leading to similar average silhouette widths.

Looking at this, it seems that K=2 could be an optimal value. Let's look at the silhouette diagrams and see how the individual clusters perform for each K value.

In [None]:
#Using silhouette analysis to find optimal K value

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Set random state
np.random.seed(5508)

# Define the range of K values
K_values = range(6, 19)

# Initialize a list to store the silhouette scores
silhouette_scores = []

# Create a subplot for each K value (stacked vertically).
# atleast_1d keeps axs indexable even if K_values holds a single value.
fig, axs = plt.subplots(len(K_values), 1, figsize=(8, 6 * len(K_values)))
axs = np.atleast_1d(axs)

# plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in later releases.
spectral = plt.get_cmap("Spectral")

# Iterate over the range of K values
for i, K in enumerate(K_values):
    # Fit the K-means model
    kmeans = KMeans(n_clusters=K, random_state=5508)
    kmeans.fit(embedding_df)

    # Obtain the cluster labels
    labels = kmeans.labels_

    # Compute the silhouette score
    silhouette_avg = silhouette_score(embedding_df, labels)
    silhouette_scores.append(silhouette_avg)

    # Per-sample silhouette values depend only on K, so compute them once here
    # instead of once per cluster inside the loop below.
    sample_silhouette_values = silhouette_samples(embedding_df, labels)

    # Plot the silhouette diagram
    ax = axs[i]
    y_lower = 10

    for j in range(K):
        # Sorted silhouette values of the samples assigned to cluster j
        cluster_silhouette_scores = np.sort(sample_silhouette_values[labels == j])

        size_cluster_j = cluster_silhouette_scores.shape[0]
        y_upper = y_lower + size_cluster_j

        color = spectral(j / K)
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_scores, facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster index, centred vertically on the band
        ax.text(-0.05, y_lower + 0.5 * size_cluster_j, str(j))
        # Leave a 10-row gap before the next cluster's band
        y_lower = y_upper + 10

    # Dashed red line marks the average silhouette score for this K
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])
    ax.set_xlabel("Silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    ax.set_title(f"K={K}")

plt.show()


In [None]:
# Perform K-means clustering with 10 clusters (this cell uses KMeans, not DBSCAN)
kmeans = KMeans(n_clusters=10, random_state=5508)
cluster_labels = kmeans.fit_predict(embedding_df)

# Labels are attached by position, so they only line up with df when
# embedding_df still has exactly one row per df row. Fail with an explicit
# message instead of the generic pandas length error.
if len(cluster_labels) != len(df):
    raise ValueError(
        f"Cannot attach cluster labels: got {len(cluster_labels)} labels for "
        f"{len(df)} rows in df; embedding_df and df have different row counts."
    )

# Add the cluster labels to the original DataFrame
df['Cluster_Label'] = cluster_labels

# Save the updated DataFrame to a new Excel file
updated_file_path = 'updated_excel_file.xlsx'
df.to_excel(updated_file_path, index=False)

# Print the cluster labels
print("Cluster labels:")
print(cluster_labels)

In [None]:
import pandas as pd

# Read the labelled workbook back from disk
file_path = 'updated_excel_file.xlsx'
df = pd.read_excel(file_path)

# Columns carried over into the trimmed export
columns_to_keep = [
    'Reference Number',
    'Client Agency',
    'Developing Agency Parent',
    'Contract Title',
    'Description',
    'UNSPSC Title',
    'Cluster_Label',
]

# Keep only those columns, in the order listed above
new_df = df.loc[:, columns_to_keep]

# Write the trimmed table to its own workbook, without the index column
new_file_path = 'cluster_file.xlsx'
new_df.to_excel(new_file_path, index=False)
