In [1]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the product listings; the relative path is resolved against the working directory
data = pd.read_csv(filepath_or_buffer='Product listing.csv')

# Data preprocessing
# Small hand-picked English stopword list, built once instead of on every call.
# Swap in a fuller list (e.g. from NLTK or spaCy) if more aggressive filtering is needed.
_STOPWORDS = frozenset(
    ['the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as']
)


def preprocess_text(text):
    """Lowercase a product title and drop common English stopwords.

    Parameters
    ----------
    text : str or any scalar
        Raw text. Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty
        string; other non-string scalars are converted with ``str()``.

    Returns
    -------
    str
        The whitespace-split, lowercased tokens that are not stopwords,
        re-joined with single spaces.
    """
    if not isinstance(text, str):
        # Empty CSV cells come back from pandas as NaN (a float), which has no
        # .lower(); treat any missing value as "no text" instead of crashing.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Plain whitespace tokenisation: punctuation stays attached to its token,
    # so a token such as "and," is not matched against the stopword list.
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in _STOPWORDS)

# Normalise every product title before vectorising
data['clean_text'] = data['product'].map(preprocess_text)

# TF-IDF vectorization; max_features caps the vocabulary size (tune as needed)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])

# K-means clustering; k is the number of clusters and can be adjusted
k = 5
kmeans = KMeans(n_clusters=k, random_state=42).fit(tfidf_matrix)

# Store each product listing's cluster assignment alongside it
data['cluster_label'] = kmeans.labels_

# Report the mean silhouette score of the clustering as a quality check
silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Display a few example product listings from each cluster
SAMPLES_PER_CLUSTER = 5  # Maximum number of examples printed per cluster
for cluster_id in range(k):
    cluster_rows = data[data['cluster_label'] == cluster_id]
    # DataFrame.sample(n) raises ValueError when n exceeds the number of rows,
    # so cap the request at the cluster size; small clusters are shown in full.
    n_shown = min(SAMPLES_PER_CLUSTER, len(cluster_rows))
    # A fixed seed keeps the printed examples identical across re-runs.
    cluster_samples = cluster_rows.sample(n_shown, random_state=42)
    print(f"\nCluster {cluster_id}:")
    for product in cluster_samples['product']:
        print(product)
        print('-' * 50)

# You can further analyze the clusters and refine the process as needed


Silhouette Score: 0.057004055728191866

Cluster 0:
Acer ED322QR 31.5 Inch (80.01 cm) Full HD Curved VA Backlit LED Monitor I 144Hz Refresh Rate I Zero Frame I AMD Free Sync I Eye Care Features I Stereo Speakers
--------------------------------------------------
HP 3.1 USB HP 32 GB Flash Drive
--------------------------------------------------
Logitech MX Anywhere 3 Compact Performance Mouse – Wireless, Magnetic Scrolling, Ergonomic, 4000DPI Sensor, Custom Buttons, USB-C, Bluetooth, Apple Mac, iPad, Windows PC, Linux, Chrome - Graphite
--------------------------------------------------
SanDisk Cruzer Blade 32GB USB Flash Drive
--------------------------------------------------
APLT-Portable Slim Wireless Mouse for Laptops 2.4Ghz Silent Wireless Optical Mouse for Laptop, Desktop ( White)
--------------------------------------------------

Cluster 1:
Zebronics Zeb-Corolla In Ear Wired Earphone with Mic, 3.5mm Jack, 1.2 Meter Cable, Multi Function Button
-----------------------------------