Importing Libraries

In [31]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics.pairwise import cosine_distances
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
import numpy as np

In [32]:
# Fetch the NLTK resources this notebook asks for (a no-op when already cached).
for _pkg in ('stopwords', 'punkt'):
    nltk.download(_pkg)

# English stop words as a set, so membership checks in clean_text are cheap.
stop_words = {word for word in stopwords.words('english')}

# NOTE(review): `ps` is created here but not referenced by any cell visible in
# this notebook export - confirm whether stemming was meant to be applied.
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading Dataset

In [None]:
# Read the input table; the text to cluster lives in its 'comment' column.
df = pd.read_csv(filepath_or_buffer="output.csv")

# Keep only rows that actually have a comment and force them to str.
# The resulting Series keeps the original row index of `df`.
comments = df.loc[df['comment'].notna(), 'comment'].astype(str)

Preprocessing

In [34]:
def clean_text(text):
    """Normalize one comment for vectorization.

    The text is lower-cased, every character that is not a-z or whitespace is
    deleted (not replaced by a space), and tokens found in the module-level
    `stop_words` set are dropped.

    Returns the surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, letters_only.split())
    return ' '.join(kept_tokens)

# Element-wise normalization of every comment; the index of `comments` is preserved.
cleaned = comments.map(clean_text)

In [35]:
# TF-IDF features with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    max_features=1000,
)
X_tfidf = vectorizer.fit_transform(cleaned)

# Reduce the TF-IDF matrix to 100 components (seeded for repeatable results);
# the clustering cells below all operate on this reduced matrix.
svd = TruncatedSVD(
    n_components=100,
    random_state=42,
)
X_lsa = svd.fit_transform(X_tfidf)


In [36]:
# Mini-batch k-means on the LSA features: 20 clusters, batches of 1000 rows, fixed seed.
kmeans = MiniBatchKMeans(
    n_clusters=20,
    batch_size=1000,
    random_state=42,
)
# One cluster id per row of X_lsa.
kmeans_labels = kmeans.fit(X_lsa).labels_



In [None]:
# Density-based clustering with cosine distance on the LSA features, using all CPU cores.
dbscan = DBSCAN(
    eps=0.3,
    min_samples=5,
    metric='cosine',
    n_jobs=-1,
)
dbscan.fit(X_lsa)
# Rows DBSCAN could not assign to any cluster carry the label -1.
dbscan_labels = dbscan.labels_

In [None]:
# Agglomerative clustering is fitted on the leading `sample_size` rows only;
# every row beyond that keeps the placeholder label -1.
sample_size = 5000
agg = AgglomerativeClustering(n_clusters=20)
agg_labels = np.full(shape=len(X_lsa), fill_value=-1)
agg_labels_partial = agg.fit(X_lsa[:sample_size]).labels_
agg_labels[:sample_size] = agg_labels_partial

In [None]:
import matplotlib.pyplot as plt

# Build the 2-D projection in this cell. It was previously read here but only
# created by the following plotting cell, so a fresh "Restart & Run All"
# stopped with NameError on `X_2d`. The settings match that later cell, so both
# figures show the same projection.
svd_2d = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd_2d.fit_transform(X_tfidf)

plt.figure(figsize=(10, 6))
# np.unique returns the cluster ids sorted, which keeps legend order and the
# colour given to each cluster stable between runs.
unique_labels = np.unique(kmeans_labels)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_labels)))

for label, color in zip(unique_labels, colors):
    mask = kmeans_labels == label
    plt.scatter(X_2d[mask, 0], X_2d[mask, 1], s=10, color=color, label=f'Cluster {label}')

plt.title("KMeans Clustering (2D Projection using SVD)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='best', markerscale=2)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Two-component projection of the TF-IDF matrix, used only for plotting.
svd_2d = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd_2d.fit_transform(X_tfidf)

plt.figure(figsize=(10, 6))
unique_labels = set(dbscan_labels)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_labels)))

# One scatter call per DBSCAN label; the -1 label is shown as "Noise" in the legend.
for label, color in zip(unique_labels, colors):
    mask = dbscan_labels == label
    legend_name = "Noise" if label == -1 else f'Cluster {label}'
    plt.scatter(
        X_2d[mask, 0],
        X_2d[mask, 1],
        s=10,
        color=color,
        label=legend_name,
    )

plt.gca().set(
    title="DBSCAN Clustering (2D Projection using SVD)",
    xlabel="Component 1",
    ylabel="Component 2",
)
plt.legend(loc='best', markerscale=2)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# The three label arrays come from independent fits, so the integer ids are not
# comparable between algorithms ("3" from DBSCAN is unrelated to "3" from
# KMeans). Voting on the raw ids therefore degenerates to "KMeans wins unless
# the other two happen to share a number". Translate the DBSCAN and
# Agglomerative clusters into KMeans ids first, then vote.
def _align_to_kmeans(labels):
    """Rename each cluster in `labels` to the KMeans id held by most of its members.

    -1 (noise / not clustered) is passed through unchanged.
    Returns an integer ndarray with the same length as `labels`.
    """
    labels = np.asarray(labels)
    reference = np.asarray(kmeans_labels)
    aligned = np.full(labels.shape, -1, dtype=int)
    for cluster_id in np.unique(labels):
        if cluster_id == -1:
            continue
        members = labels == cluster_id
        kmeans_ids, counts = np.unique(reference[members], return_counts=True)
        aligned[members] = kmeans_ids[counts.argmax()]
    return aligned


dbscan_aligned = _align_to_kmeans(dbscan_labels)
agg_aligned = _align_to_kmeans(agg_labels)

final_labels = []
for votes in zip(kmeans_labels, dbscan_aligned, agg_aligned):
    # Ignore abstentions (-1). Plain ints keep the Counter keys uniform.
    filtered_votes = [int(v) for v in votes if v != -1]
    # On a tie, most_common keeps first-seen order, so the KMeans vote wins.
    label = Counter(filtered_votes).most_common(1)[0][0] if filtered_votes else -1
    final_labels.append(label)

In [None]:
def top_terms_per_cluster(X_tfidf, labels, vectorizer, top_n=5):
    """Print the `top_n` terms with the highest mean TF-IDF weight in each cluster.

    Parameters
    ----------
    X_tfidf : sparse matrix or 2-D array
        One row per document, one column per vocabulary term.
    labels : array-like
        Cluster id for every row of `X_tfidf`. Rows labelled -1 are skipped.
        Lists are accepted as well as NumPy arrays.
    vectorizer : object
        Must provide `get_feature_names_out()` returning the term for each column.
    top_n : int, default 5
        Number of terms printed per cluster.

    Raises
    ------
    ValueError
        If `labels` does not have one entry per row of `X_tfidf`.

    Returns
    -------
    None. One line per cluster is written to stdout.
    """
    terms = np.asarray(vectorizer.get_feature_names_out())
    # With a plain list, `labels == cluster_num` is a single bool rather than an
    # element-wise mask, which silently selects no rows; coerce up front.
    labels = np.asarray(labels)
    if labels.shape[0] != X_tfidf.shape[0]:
        raise ValueError(
            f"labels has {labels.shape[0]} entries but X_tfidf has {X_tfidf.shape[0]} rows"
        )

    for cluster_num in np.unique(labels):
        if cluster_num == -1:
            continue
        cluster_indices = np.flatnonzero(labels == cluster_num)
        # np.asarray(...).ravel() flattens both the np.matrix returned by sparse
        # matrices and the plain ndarray returned by dense input.
        mean_tfidf = np.asarray(X_tfidf[cluster_indices].mean(axis=0)).ravel()
        top_terms = terms[mean_tfidf.argsort()[::-1][:top_n]]
        print(f"Cluster {cluster_num}: {', '.join(top_terms)}")

# Summarize every KMeans cluster by its highest mean-TF-IDF terms
# (top_n is left at the function's default of 5).
top_terms_per_cluster(X_tfidf, kmeans_labels, vectorizer)

Cluster 0: child, small, young, woman, holding
Cluster 1: jacket, man, wearing, black, blue
Cluster 2: shirt, man, blue, wearing, white
Cluster 3: crowd, people, front, man, large
Cluster 4: playing, guitar, man, two, game
Cluster 5: man, person, wearing, sitting, white
Cluster 6: two, men, women, one, girls
Cluster 7: people, two, three, walking, sitting
Cluster 8: water, body, man, dog, boat
Cluster 9: boy, young, little, shirt, blue
Cluster 10: group, people, men, standing, large
Cluster 11: working, man, men, construction, two
Cluster 12: street, walking, man, city, people
Cluster 13: woman, man, wearing, sitting, black
Cluster 14: children, two, playing, group, three
Cluster 15: jumping, dog, air, boy, man
Cluster 16: girl, little, young, pink, wearing
Cluster 17: dogs, two, running, grass, snow
Cluster 18: holding, man, woman, wearing, baby
Cluster 19: dog, brown, black, running, white


Saving Output in CSV

In [None]:
# `comments` was built with dropna(), so the label arrays have one entry per
# non-missing comment and can be shorter than `df`. Align them on the index of
# `comments`; rows that were never clustered get -1, the same "unassigned"
# value used elsewhere in this notebook.
def _labels_for_df(labels):
    """Return `labels` as a Series indexed like `df`, with -1 for rows that had no comment."""
    return pd.Series(np.asarray(labels), index=comments.index).reindex(df.index, fill_value=-1)


df['KMeans_Label'] = _labels_for_df(kmeans_labels)
df['DBSCAN_Label'] = _labels_for_df(dbscan_labels)
df['Agglomerative_Label'] = _labels_for_df(agg_labels)
df['Final_Label'] = _labels_for_df(final_labels)

# Single source of truth for the path, so the message cannot drift from the file
# that is actually written. The old message named a different file and
# mentioned stemming, which no cell in this notebook applies.
output_path = "clustered_output11.csv"
df.to_csv(output_path, index=False)
print(f"Clustering complete. Output saved to '{output_path}'")

Clustering with stemming complete. Output saved to 'clustered_output_stemmed.csv'


Saving Model

In [None]:
import pickle

# Collect the vectorizer and every estimator created above under short keys,
# so the whole pipeline can be handled as one object.
models = dict(
    vectorizer=vectorizer,
    svd=svd,
    kmeans=kmeans,
    dbscan=dbscan,
    agg=agg,
)


In [None]:
# Serialize the `models` dict to disk in binary mode.
# Reminder: only unpickle this file from a trusted source - loading a pickle can execute code.
with open('clustering_pipeline.pkl', mode='wb') as f:
    pickle.Pickler(f).dump(models)

print("All models and vectorizer saved to 'clustering_pipeline.pkl'")

All models and vectorizer saved to 'clustering_pipeline.pkl'
