In [None]:
# install dataprep
!pip install dataprep -q

In [None]:
# standard lib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# sklearn & spaCy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
import spacy

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

# dataprep
from dataprep import clean

In [None]:
# read only the comment text column from the CSV, then tidy it in one chain
file = '/kaggle/input/chatgpt-reddit/chatgpt-reddit-comments.csv'
df = (
    pd.read_csv(file, usecols=['comment_body'])
    .dropna()                # discard rows with a missing comment
    .drop_duplicates()       # discard repeated comments (if any)
    .reset_index(drop=True)  # renumber the surviving rows from 0
)

# run dataprep's clean_text on the comment column (no custom options passed)
df = clean.clean_text(df, 'comment_body')

# lemmatize
# The parser and NER components are disabled when loading the pipeline.
lem = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def lemmetize(txt):
    """Lemmatize a single string.

    Parameters
    ----------
    txt : str
        Text to run through the `lem` spaCy pipeline.

    Returns
    -------
    str
        The lemma of every token in `txt`, joined by single spaces.
    """
    return " ".join(token.lemma_ for token in lem(txt))


# Stream the whole column through `lem.pipe`, which processes the texts in
# batches instead of invoking the pipeline once per row; each resulting doc is
# joined exactly as `lemmetize` does for a single string.
df['comment_body'] = [
    " ".join(token.lemma_ for token in doc)
    for doc in lem.pipe(df['comment_body'])
]

# view
df.head()

In [None]:
# feature extraction
# TF-IDF over the comments with scikit-learn's built-in English stop-word list.
# `fit_transform` learns the vocabulary/IDF weights and vectorizes in a single
# pass, instead of tokenizing every comment twice (once in `fit`, once in
# `transform`).
vec = TfidfVectorizer(stop_words='english')
features = vec.fit_transform(df.comment_body.values)

In [None]:
# model training + prediction
# `fit_predict` fits the model and returns the cluster label of every row in
# one call, instead of a separate `predict` pass over the same matrix.
cls = MiniBatchKMeans(n_clusters=3, random_state=0)
pred = cls.fit_predict(features)

# reduction of features to 2D
# NOTE: `.toarray()` materialises the sparse TF-IDF matrix as a dense array
# (rows x vocabulary size), which can be very memory-hungry for a large corpus.
pca = PCA(n_components=2, random_state=0)
reduced_features = pca.fit_transform(features.toarray())

# reduction of cluster-centers to 2D (same projection as the points)
red_cluster_centers = pca.transform(cls.cluster_centers_)

# plotting: points coloured by predicted cluster, centroids drawn as blue X's
fig, ax = plt.subplots(figsize=(15, 10), dpi=80)
ax.scatter(reduced_features[:, 0], reduced_features[:, 1], c=pred)
ax.scatter(
    red_cluster_centers[:, 0],
    red_cluster_centers[:, 1],
    marker='X',
    s=150,
    c='b',
    label='cluster centers',
)
ax.set(
    title='MiniBatchKMeans clusters of TF-IDF features (2D PCA projection)',
    xlabel='PCA component 1',
    ylabel='PCA component 2',
)
ax.legend()
plt.show()

Apart from a few on the far left and far right, most of the clusters seem to be spread all around and overlapping.