In [4]:
import pandas as pd
import nltk
import torch
import os
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize

# Read the raw requirements document into memory
requirements_path = 'nfr (1).txt'
with open(requirements_path, 'r') as file:
    text = file.read()

# Split the document into individual sentences with NLTK
sentences = sent_tokenize(text)

# Load the pretrained BERT tokenizer and encoder from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Generate embeddings
def get_embeddings(sentences):
    """Return one mean-pooled BERT embedding per input sentence.

    Each sentence is encoded on its own with the module-level ``tokenizer``
    and ``model`` (inputs are truncated to at most 512 tokens), and the
    model's ``last_hidden_state`` is averaged over dimension 1 to give a
    single vector for the sentence.

    Args:
        sentences: Iterable of sentence strings to embed.

    Returns:
        A list of NumPy arrays, one per sentence, in input order. An empty
        input yields an empty list.
    """
    embeddings = []
    # Inference only: disabling autograd avoids building and retaining a
    # computation graph for every forward pass, which cuts memory use and time.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            # Average over dim 1 (presumably the token axis), then drop the
            # singleton batch dimension before converting to NumPy.
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
            embeddings.append(pooled.detach().numpy())
    return embeddings

# Embed every sentence once up front; the vectors feed K-means below
embeddings = get_embeddings(sentences)

# Set the number of clusters to 9
num_clusters = 9

# Clustering using K-means with 9 clusters.
# random_state pins the centroid initialisation so repeated runs put the same
# sentences in the same cluster files; n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(embeddings)

# Create directory for clusters (no error if it already exists)
output_dir = 'clusters_output'
os.makedirs(output_dir, exist_ok=True)

# Group sentences by assigned cluster. Every cluster id is pre-seeded so that
# a file is written for each cluster, even one that received no sentences.
clustered_sentences = {i: [] for i in range(num_clusters)}
for label, sentence in zip(labels, sentences):
    clustered_sentences[int(label)].append(sentence)

# Write one file per cluster, one sentence per line. UTF-8 is forced so that
# non-ASCII characters cannot fail under a narrower platform default encoding.
for cluster_id, cluster_sentences in clustered_sentences.items():
    cluster_path = os.path.join(output_dir, f'cluster_{cluster_id}.txt')
    with open(cluster_path, 'w', encoding='utf-8') as file:
        file.writelines(sentence + '\n' for sentence in cluster_sentences)

print(f"Clusters saved in the '{output_dir}' directory with {num_clusters} clusters.")


Clusters saved in the 'clusters_output' directory with 9 clusters.
