In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
reviews = pd.read_csv("reviews.csv")
reviews.sample(10)

Unnamed: 0,content,score
9142,How different it is from Google Calendar ? You...,3
3337,Why it is not giving me notifications for dail...,3
4614,💜,5
2063,You block new entry on my existing list and fo...,1
145,"Trash. Yes, it has some nice nifty features bu...",1
939,Love the shopping list integration,5
10709,"if you don't stop to force ads clip watching, ...",1
1238,"Functions fine in general, easy drag to subtas...",2
2211,A very good to do manager. I would take a prem...,2
10741,This is update on 22 Feb 2020 I don't know wha...,1


In [67]:
negative_reviews_tmp = reviews[(reviews['score'] == 1) | (reviews['score'] == 2)]
print(negative_reviews_tmp)

                                                 content  score
0                          I cannot open the app anymore      1
1      I have been begging for a refund from this app...      1
2      Very costly for the premium version (approx In...      1
3      Used to keep me organized, but all the 2020 UP...      1
4                                    Dan Birthday Oct 28      1
...                                                  ...    ...
11940  I loved it until I realized that the very feat...      2
11941  Gave it a test run and tried out the notificat...      2
11942  Looks great but since installing, my device on...      2
11943  This app looked good until I had to purchase i...      2
11944                                           It's OK!      2

[4850 rows x 2 columns]


In [68]:
def process_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stopwords.words("english")]
    return " ".join(filtered_tokens)

In [69]:
negative_reviews_tmp["review"] = negative_reviews_tmp["content"].apply(process_text)

In [70]:
negative_reviews_tmp

Unnamed: 0,content,score,review
0,I cannot open the app anymore,1,I open app anymore
1,I have been begging for a refund from this app...,1,I begging refund app month nobody replying
2,Very costly for the premium version (approx In...,1,Very costly premium version approx Indian Rupe...
3,"Used to keep me organized, but all the 2020 UP...",1,Used keep organized UPDATES made mess things Y...
4,Dan Birthday Oct 28,1,Dan Birthday Oct
...,...,...,...
11940,I loved it until I realized that the very feat...,2,I loved I realized feature got download first ...
11941,Gave it a test run and tried out the notificat...,2,Gave test run tried notifications Did hear thi...
11942,"Looks great but since installing, my device on...",2,Looks great since installing device lasts half...
11943,This app looked good until I had to purchase i...,2,This app looked good I purchase get week view ...


In [71]:
preprocessed_reviews = pd.DataFrame({"review" : negative_reviews_tmp['review']})
preprocessed_reviews.head()

Unnamed: 0,review
0,I open app anymore
1,I begging refund app month nobody replying
2,Very costly premium version approx Indian Rupe...
3,Used keep organized UPDATES made mess things Y...
4,Dan Birthday Oct


In [72]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(preprocessed_reviews['review'])

In [73]:
print(tfidf_matrix)

  (0, 300)	0.7274120131461655
  (0, 324)	0.26084007427013295
  (0, 4100)	0.6346922236686016
  (1, 5020)	0.515334875276096
  (1, 3934)	0.47123926546563444
  (1, 3786)	0.32475682037507164
  (1, 4892)	0.3572537676591596
  (1, 600)	0.515334875276096
  (1, 324)	0.11703093798205348
  (2, 2164)	0.18872896990968657
  (2, 3404)	0.1414956139755732
  (2, 3680)	0.1845065750453737
  (2, 6519)	0.11246782892983179
  (2, 6698)	0.223531459561281
  (2, 317)	0.3193783604591689
  (2, 1763)	0.20247325622873982
  (2, 641)	0.320005895197322
  (2, 6867)	0.18194184297591445
  (2, 4305)	0.223531459561281
  (2, 5206)	0.2860999522995406
  (2, 2985)	0.3046524302573948
  (2, 355)	0.3046524302573948
  (2, 6577)	0.2471438542051858
  (2, 4528)	0.25762745130408776
  (2, 1286)	0.2794782863018086
  :	:
  (4847, 5420)	0.201672584439719
  (4847, 3039)	0.22428160538643246
  (4847, 3459)	0.177623487129521
  (4847, 2591)	0.129082753732614
  (4847, 3151)	0.10110536595095954
  (4847, 324)	0.06254698510370528
  (4848, 4674)	0.21

In [74]:
clust_kmeans = KMeans(n_clusters=5, random_state=500)
categories = clust_kmeans.fit_predict(tfidf_matrix).tolist()

In [75]:
preprocessed_reviews['category'] = categories

In [76]:
terms = tfidf.get_feature_names_out()
print(terms)

['aaah' 'aak' 'aap' ... 'گزینه' 'ইন' 'লগ']


In [77]:
print(categories)

[1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 3, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 2, 1, 3, 1, 1, 1, 1, 4, 4, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 2, 4, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 2, 0, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 3, 1, 4, 1, 1, 1, 1, 3, 1, 1, 1, 3, 4, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 

In [78]:
topic_terms_list = []

for cluster in range(clust_kmeans.n_clusters):
    cluster_indices = [i for i, label in enumerate(categories) if label == cluster]
    cluster_tfidf_sum = tfidf_matrix[cluster_indices].sum(axis=0)
    cluster_term_freq = np.asarray(cluster_tfidf_sum).ravel()
    top_term_index = cluster_term_freq.argsort()[::-1][0]
    topic_terms_list.append(
        {
            "category": cluster,
            "term": terms[top_term_index],
            "frequency": cluster_term_freq[top_term_index],
        }
     )   
    

In [79]:
topic_terms = pd.DataFrame(topic_terms_list)

In [80]:
topic_terms

Unnamed: 0,category,term,frequency
0,0,good,34.793038
1,1,app,168.659979
2,2,calendar,63.831556
3,3,premium,53.451435
4,4,version,66.117594
