In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/dialogues_text.txt


In [13]:
!pip install transformers torch scikit-learn


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [15]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import AutoModel, AutoTokenizer


In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")


✅ Using device: cuda


In [28]:
import pandas as pd
import re

# 1️⃣ Read the lines
with open("/kaggle/input/dataset/dialogues_text.txt", "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# 2️⃣ Put lines into a DataFrame
data = pd.DataFrame(lines, columns=["raw_text"])

# 3️⃣ Remove __eou__ tags
data["cleaned_text"] = data["raw_text"].str.replace("__eou__", "", regex=False)
data["cleaned_text"] = data["raw_text"].str.replace(" â€™", "", regex=False)


# 4️⃣ Lowercase the text
data["cleaned_text"] = data["cleaned_text"].str.lower()

# 5️⃣ Remove punctuation and special characters (keeping only letters and spaces)
data["cleaned_text"] = data["cleaned_text"].str.replace(r"[^a-z\s]", "", regex=True)

# 6️⃣ Strip any leading/trailing spaces
data["cleaned_text"] = data["cleaned_text"].str.strip()

# ✅ Done! Preview the results
print(data.head())


                                            raw_text  \
0  The kitchen stinks . __eou__ I'll throw out th...   
1  So Dick , how about getting some coffee for to...   
2  Are things still going badly with your housegu...   
3  Would you mind waiting a while ? __eou__ Well ...   
4  Are you going to the annual party ? I can give...   

                                        cleaned_text  
0  the kitchen stinks  eou ill throw out the garb...  
1  so dick  how about getting some coffee for ton...  
2  are things still going badly with your housegu...  
3  would you mind waiting a while  eou well  how ...  
4  are you going to the annual party  i can give ...  


In [29]:
# Replace this with your actual data load
# data = pd.read_csv("your_file.csv") 
# Must have data["cleaned_text"]

model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModel.from_pretrained(model_name).to(device)

def embed(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        enc = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = hf_model(**enc)
            emb = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(emb.cpu().numpy())
    return np.vstack(embeddings)

embedding_array = embed(data["cleaned_text"].tolist())
embedding_array.shape


(13118, 768)

In [25]:
embedding_dim = embedding_array.shape[1]
latent_dim = 64  # Adjust if needed

class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=64):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
        )
    def forward(self, x):
        latent = self.encoder(x)
        recon = self.decoder(latent)
        return recon, latent

autoencoder = Autoencoder(embedding_dim, latent_dim).to(device)

embedding_tensor = torch.tensor(embedding_array, dtype=torch.float32).to(device)
dataloader = DataLoader(TensorDataset(embedding_tensor), batch_size=32, shuffle=True)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

# Training Loop
for epoch in range(30):  # Adjust epochs if needed
    total_loss = 0
    for batch in dataloader:
        batch_data = batch[0]
        recon, latent = autoencoder(batch_data)
        loss = criterion(recon, batch_data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 5 == 0:
        print(f"✅ Epoch [{epoch + 1}/30] - Reconstruction Loss: {total_loss / len(dataloader):.4f}")

# Final Latent Embeddings
with torch.no_grad():
    latent_embeddings = autoencoder.encoder(embedding_tensor).cpu().numpy()


✅ Epoch [5/30] - Reconstruction Loss: 0.0031
✅ Epoch [10/30] - Reconstruction Loss: 0.0029
✅ Epoch [15/30] - Reconstruction Loss: 0.0028
✅ Epoch [20/30] - Reconstruction Loss: 0.0028
✅ Epoch [25/30] - Reconstruction Loss: 0.0027
✅ Epoch [30/30] - Reconstruction Loss: 0.0027


In [27]:


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

best_score = -1
best_n_clusters = None
best_labels = None

# Try n_clusters from 5 to 10
for n_clusters in range(5, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(latent_embeddings)
    sil_score = silhouette_score(latent_embeddings, labels)
    
    
    if sil_score > best_score:
        best_score = sil_score
        best_n_clusters = n_clusters
        best_labels = labels

# Final results
data["cluster"] = best_labels
print(f"✅ Best n_clusters: {best_n_clusters}, with silhouette_score = {best_score:.4f}")

# Save Results
data["cluster"] = labels
data.to_csv("dec_clusters.csv", index=False)
print("✅ Done! Results saved to dec_clusters.csv.")




✅ Best n_clusters: 10, with silhouette_score = 0.0357
✅ Done! Results saved to dec_clusters.csv.
