In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Download necessary files from NLTK:
# punkt -> Tokenization
# stopwords -> Stop words removal
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the reviews dataset and preview it
reviews = pd.read_csv("reviews.csv")
reviews.head()

Unnamed: 0,content,score
0,I cannot open the app anymore,1
1,I have been begging for a refund from this app...,1
2,Very costly for the premium version (approx In...,1
3,"Used to keep me organized, but all the 2020 UP...",1
4,Dan Birthday Oct 28,1


In [4]:
# Your code starts here
# Cells are free! Use as many as you need ;)
print(reviews['score'].unique())
negative_reviews = reviews.loc[reviews['score']< 3]
print(negative_reviews)

[1 2 3 4 5]
                                                 content  score
0                          I cannot open the app anymore      1
1      I have been begging for a refund from this app...      1
2      Very costly for the premium version (approx In...      1
3      Used to keep me organized, but all the 2020 UP...      1
4                                    Dan Birthday Oct 28      1
...                                                  ...    ...
11940  I loved it until I realized that the very feat...      2
11941  Gave it a test run and tried out the notificat...      2
11942  Looks great but since installing, my device on...      2
11943  This app looked good until I had to purchase i...      2
11944                                           It's OK!      2

[4850 rows x 2 columns]


In [5]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove non-alpha characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to negative reviews
negative_reviews['preprocessed_content'] = negative_reviews['content'].apply(preprocess_text)


# Save to a new DataFrame
preprocessed_reviews = negative_reviews[['preprocessed_content', 'score']]

# Display the preprocessed DataFrame
print(preprocessed_reviews)

                                    preprocessed_content  score
0                                   [open, app, anymore]      1
1        [begging, refund, app, month, nobody, replying]      1
2      [costly, premium, version, approx, indian, rup...      1
3      [used, keep, organized, updates, made, mess, t...      1
4                                   [dan, birthday, oct]      1
...                                                  ...    ...
11940  [loved, realized, feature, got, download, firs...      2
11941  [gave, test, run, tried, notifications, didnt,...      2
11942  [looks, great, since, installing, device, last...      2
11943  [app, looked, good, purchase, get, week, view,...      2
11944                                               [ok]      2

[4850 rows x 2 columns]


In [6]:
# Convert lists of tokens to strings because TfidfVectorizer want plain strings 
preprocessed_reviews["preprocessed_content"] = preprocessed_reviews["preprocessed_content"].apply(lambda x: ' '.join(x))

# Vectorize the cleaned reviews using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_reviews["preprocessed_content"])

# Step 3: Apply K-means clustering to tfidf_matrix

# Apply K-means clustering (store the model as clust_kmeans)
clust_kmeans = KMeans(n_clusters=5, random_state=500)
pred_labels = clust_kmeans.fit_predict(tfidf_matrix)

# Store the predicted labels in a list variable called categories
categories = pred_labels.tolist()
preprocessed_reviews["category"] = categories

# Step 4: For each unique cluster label, find the most frequent term

# Get the feature names (terms) from the vectorizer
terms = vectorizer.get_feature_names_out()

# List to save the top term for each cluster
topic_terms_list = []

# Iterate over each cluster
for cluster in range(clust_kmeans.n_clusters):
    # Get indices of reviews in the current cluster
    cluster_indices = np.where(np.array(categories) == cluster)[0]  # Simplified using NumPy
    
    # Compute the sum of tf-idf scores for terms in the cluster
    cluster_term_freq = tfidf_matrix[cluster_indices].sum(axis=0).A1  # Convert to 1D array
    
    # Append the top term for the current cluster
    top_term_index = cluster_term_freq.argmax()
    topic_terms_list.append(
        {
            "category": cluster,
            "term": terms[top_term_index],
            "frequency": cluster_term_freq[top_term_index],
        }
    )



In [7]:
# Pandas DataFrame to store results from this step
topic_terms = pd.DataFrame(topic_terms_list)

# Output the final result
print(topic_terms)

   category     term   frequency
0         0     work   57.690096
1         1     good   37.197798
2         2  version   66.128081
3         3      app  183.821473
4         4     time   60.083806
