In [None]:
# Upgrade the ML stack on the runtime before anything is imported.
# `%pip` (rather than `!pip`) makes sure the packages are installed into the
# environment of the kernel that is running this notebook.
# NOTE(review): versions are not pinned, so each run installs whatever is latest.
# The captured log below shows pip reporting that the preinstalled
# tensorflow-text 2.15.0 is incompatible with the upgraded tensorflow 2.18.0 --
# confirm whether tensorflow-text is needed and pin versions accordingly.
%pip install --upgrade tensorflow
%pip install --upgrade tf_keras
%pip install --upgrade sentence-transformers

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m657.5 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.15.0 requires tensorflow<2.16,>=2.15.0; platform_machine != "arm64" or platform_system != "Darwin", but you have tensorflow 2.18.0 which is incompatible.
tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.18.0 which is incompatible.[0m[31m
[0mSuccessfully installed tensorflow-2.18.0
Collecting tf_keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downl

In [None]:
import pandas as pd
import re

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources the text-cleaning step relies on
# (this only has to happen once per environment).
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Read the NCR records from CSV; the file is decoded as Latin-1.
ncr_csv_path = 'NCR_DATA.csv'
data = pd.read_csv(ncr_csv_path, encoding='latin-1')

In [None]:
# Count the missing values in every column (shown as the cell's output).
data.isna().sum()

Unnamed: 0,0
Estimated_Cost_of_NCR,0
Cluster,0
Detailed Description,0


**Preprocessing Data**

In [None]:
# Discard rows that have no description text; everything downstream needs it.
data = data.dropna(subset=['Detailed Description'])

In [None]:

# Let pandas show up to 200 characters per cell so long descriptions stay readable.
pd.options.display.max_colwidth = 200

# Lazily filled cache so the stop-word corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise a single description string for embedding.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is deleted, the result is tokenized with
    ``word_tokenize``, English stop words are dropped, and the remaining
    tokens are joined with single spaces.

    Note that punctuation is deleted rather than replaced by a space, so
    hyphenated terms are fused (e.g. "auto-transformer" -> "autotransformer").

    Args:
        text: The raw description. Must be a ``str``; missing values have to
            be removed by the caller beforehand.

    Returns:
        The cleaned description as a single space-separated string (empty if
        nothing survives the filtering).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords (e.g., "the," "and"); the set is fetched once so each
    # membership test is O(1) instead of rebuilding and scanning a list per token
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    return ' '.join(tokens)

# Run every raw description through clean_text and keep the result in a new column.
raw_descriptions = data['Detailed Description']
data['Cleaned_Descriptions'] = raw_descriptions.apply(clean_text)

# Show the first rows side by side: original text vs. cleaned text.
preview_columns = ['Detailed Description', 'Cleaned_Descriptions']
print(data[preview_columns].head())


                                                                                                                                                                                      Detailed Description  \
0                                                                                                                          The incorrect value of auto-transformer impedance has been used in Cxxx studies   
1  Front sheet of specification C131-MMD-C2-RSP-B071-00002 Rev 2.0 states that it was formerly C131-MMD-C2-COM-B071-00002 and therefore supersedes the former document. This former document has not be...   
2  Two registers listing the same drawings are stored and available in eB with no indication of which is the definitive current register.\nBoth C131-MMD-Z-LRG-B071-00006 Rev 9.0 and C131-MMD-Z-LRG-B0...   
3                                                                              Technical Reference Sheet, C131-MMD-A-RSP-B071-00004, does not include the specification referenc

In [None]:
# Load the Sentence-BERT encoder used to embed the descriptions
# (all-MiniLM-L6-v2: lightweight and effective).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Renumber the rows 0..n-1 so the frame's index is a continuous sequence.
data = data.reset_index(drop=True)

# Step 1: encode every cleaned description into an embedding tensor.
sentences = data['Cleaned_Descriptions'].tolist()
embeddings = model.encode(sentences, convert_to_tensor=True)

# Step 2: pairwise cosine similarity between all embeddings
# (the tensor is moved to the CPU before it is handed to scikit-learn).
cos_sim_matrix = cosine_similarity(embeddings.cpu())



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Step 3: group the descriptions with agglomerative clustering.
# No cluster count is fixed; merging stops at the distance cut-off instead.
threshold = 0.8  # cosine-similarity threshold for grouping
distance_cutoff = 1 - threshold

clustering = AgglomerativeClustering(
    n_clusters=None,
    metric='precomputed',
    linkage='complete',
    distance_threshold=distance_cutoff,
)

# The estimator is fed precomputed dissimilarities: dissimilarity = 1 - similarity.
dissimilarity_matrix = 1 - cos_sim_matrix
labels = clustering.fit_predict(dissimilarity_matrix)

In [None]:
# Store each row's cluster label on the frame.
# NOTE(review): the isnull() output earlier lists a 'Cluster' column in the
# loaded CSV, so this assignment presumably overwrites it -- confirm that is intended.
data['Cluster'] = labels

# Take at most the first 10 rows of every cluster once, then print the
# cleaned descriptions followed by the matching cluster labels.
cluster_samples = data.groupby('Cluster').head(10)
print(cluster_samples['Cleaned_Descriptions'])

print(cluster_samples['Cluster'])

0                                                                                                                                                    incorrect value autotransformer impedance used cxxx studies
1        front sheet specification c131mmdc2rspb07100002 rev 20 states formerly c131mmdc2comb07100002 therefore supersedes former document former document issued superseded resulting two current different ...
2        two registers listing drawings stored available eb indication definitive current register c131mmdzlrgb07100006 rev 90 c131mmdzlrgb07100009 rev 50 lists riba f drawings examination 70 drawings list...
3                                                                                                                       technical reference sheet c131mmdarspb07100004 include specification reference tag str08
4                                                                                                                                                               head

In [None]:
# Write the frame, including the cluster labels, to a CSV without the index column.
clusters_csv_path = '/content/NCR_with_clusters.csv'
data.to_csv(clusters_csv_path, index=False)

# Hand the file to the browser as a download (Colab only).
from google.colab import files
files.download(clusters_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Helper for inspecting the members of one cluster
def print_cluster_descriptions(cluster_label):
    """Print the cleaned descriptions of every row assigned to ``cluster_label``.

    Reads the notebook-level ``data`` frame, prints a header line naming the
    cluster, then prints the matching 'Cleaned_Descriptions' values as a
    Python list (an empty list if no row carries that label).
    """
    # Keep only the rows whose 'Cluster' value matches the requested label
    members = data.loc[data['Cluster'] == cluster_label]

    header = f"\nDescriptions for Cluster {cluster_label}:"
    print(header)
    # A plain list is easier to read than the Series repr
    print(members['Cleaned_Descriptions'].to_list())

# Inspect one cluster; change the label below to look at a different one.
example_cluster = 20822
print_cluster_descriptions(example_cluster)



Descriptions for Cluster 20822:
['information provided surveillance qs15201 cxxx respect quality management system']


In [None]:
# Order the rows by cluster label so members of the same cluster sit together.
grouped_data = data.sort_values(by='Cluster')

# Keep only the estimated cost, the cluster label and the original
# (uncleaned) description text.
export_columns = ['Estimated_Cost_of_NCR', 'Cluster', 'Detailed Description']
new_df = grouped_data[export_columns]

# Write the selection to a CSV without the index column.
cost_csv_path = '/content/estimated_cost_clustered.csv'
new_df.to_csv(cost_csv_path, index=False)

# Hand the file to the browser as a download (Colab only).
from google.colab import files
files.download(cost_csv_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>