In [1]:
# Loading libraries
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from transformers import AutoTokenizer, DistilBertModel
from sklearn.decomposition import NMF
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer


### NMF (Non-Negative Matrix Factorization)

In [2]:
# Load the cleaned dataset (reviews plus the pre-computed embedding column).
# `Id` is forced to text: the identifiers are looked up later as strings such as
# '0826414346', and letting pandas infer a numeric dtype would drop the leading
# zero and make that lookup silently return nothing.
# NOTE(review): the frame carries 'Unnamed: 0', 'Unnamed: 0.1' and 'Unnamed: 0.2'
# columns, which look like index columns written by earlier to_csv calls --
# consider saving with index=False upstream; they are left untouched here.
data_embedded = pd.read_csv('data_embedded.csv', dtype={'Id': str})

# Display the first 5 entries of the DataFrame
data_embedded.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Id,categories,User_id,review_helpfulness,review_score,review_text,processed_text,tokens,embedding_word
0,0,0,1882931173,['Comics & Graphic Novels'],AVCGYZL8FQQTD,7/7,4.0,This is only for Julie Strain fans. It's a col...,this is only for julie strain fans its a colle...,"['julie', 'strain', 'fan', 'collection', 'phot...",[-2.31711984e+00 3.07641804e-01 1.12412512e-...
1,1,1,826414346,['Biography & Autobiography'],A30TK6U7DNS82R,10/10,5.0,I don't care much for Dr. Seuss but after read...,i dont care much for dr seuss but after readin...,"['dont', 'care', 'much', 'dr', 'seuss', 'readi...",[-0.9982359 0.34163076 0.07679285 0.352412...
2,2,2,826414346,['Biography & Autobiography'],A3UH4UZ4RSVO82,10/11,5.0,"If people become the books they read and if ""t...",if people become the books they read and if th...,"['people', 'become', 'book', 'read', 'child', ...",[-1.41883159e+00 2.75954455e-01 -1.68127820e-...
3,3,3,826414346,['Biography & Autobiography'],A2MVUWT453QH61,7/7,4.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",theodore seuss geisel aka quotdr seussquot wa...,"['theodore', 'seuss', 'geisel', 'aka', 'quotdr...",[-0.9014187 0.14059153 -0.35499954 0.190779...
4,4,4,826414346,['Biography & Autobiography'],A22X4XUPKF66MR,3/3,4.0,Philip Nel - Dr. Seuss: American IconThis is b...,philip nel dr seuss american iconthis is basi...,"['philip', 'nel', 'dr', 'seuss', 'american', '...",[-1.1215011 0.49693856 -0.15258148 0.195027...


In [43]:
# Work on a copy so the loaded frame stays untouched for other sections.
data_nmf = data_embedded.copy()

# Id 0826414346, Title: Dr. Seuss: American Icon
# Compare on a normalised text form of `Id`. If the column was parsed as a
# number, or lost its leading zero on a CSV round trip (the rendered table shows
# 826414346), a plain `== '0826414346'` would silently select zero rows.
# zfill(10) only pads values shorter than 10 characters, so identifiers that are
# already 10 characters long are unaffected.
# NOTE(review): assumes `Id` has no missing values and is not float-typed -- confirm.
id_as_text = data_nmf['Id'].astype(str).str.zfill(10)
df_0826414346 = data_nmf[id_as_text == '0826414346']
df_0826414346.head(5)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Id,categories,User_id,review_helpfulness,review_score,review_text,processed_text,tokens,embedding_word
1,1,1,826414346,['Biography & Autobiography'],A30TK6U7DNS82R,10/10,5.0,I don't care much for Dr. Seuss but after read...,i dont care much for dr seuss but after readin...,"['dont', 'care', 'much', 'dr', 'seuss', 'readi...",[-0.9982359 0.34163076 0.07679285 0.352412...
2,2,2,826414346,['Biography & Autobiography'],A3UH4UZ4RSVO82,10/11,5.0,"If people become the books they read and if ""t...",if people become the books they read and if th...,"['people', 'become', 'book', 'read', 'child', ...",[-1.41883159e+00 2.75954455e-01 -1.68127820e-...
3,3,3,826414346,['Biography & Autobiography'],A2MVUWT453QH61,7/7,4.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",theodore seuss geisel aka quotdr seussquot wa...,"['theodore', 'seuss', 'geisel', 'aka', 'quotdr...",[-0.9014187 0.14059153 -0.35499954 0.190779...
4,4,4,826414346,['Biography & Autobiography'],A22X4XUPKF66MR,3/3,4.0,Philip Nel - Dr. Seuss: American IconThis is b...,philip nel dr seuss american iconthis is basi...,"['philip', 'nel', 'dr', 'seuss', 'american', '...",[-1.1215011 0.49693856 -0.15258148 0.195027...
5,5,5,826414346,['Biography & Autobiography'],A2F6NONFUDB6UK,2/2,4.0,"""Dr. Seuss: American Icon"" by Philip Nel is a ...",dr seuss american icon by philip nel is a thou...,"['dr', 'seuss', 'american', 'icon', 'philip', ...",[-0.9760989 0.07981776 -0.57092327 0.340269...


In [48]:
# Topic extraction for a single book: TF-IDF over its reviews, then NMF.

# Adjust min_df based on the number of records
total_records = len(df_0826414346)
print(total_records)
# NOTE(review): min_df_percent is assigned but never used in this cell; it is kept
# only so that any other cell referencing it keeps working.
min_df_percent = 0.05

# Fail fast on an empty selection. Previously min_df was simply never assigned in
# that case, so the cell died with a NameError or, worse, reused a stale value
# left in the kernel by an earlier run.
if total_records == 0:
    raise ValueError("No reviews selected for this book; cannot build a TF-IDF matrix.")

# A term must appear in at least (total_records - 1) reviews to be kept, i.e. it
# may be missing from at most one review. With a single review, min_df is 1.
# NOTE(review): this threshold is very strict (only 3 terms survive for the 9
# reviews here) -- confirm it is intended.
min_df = max(1, total_records - 1)

vector = TfidfVectorizer(min_df=min_df, stop_words='english')

# Fit and transform (missing texts are treated as empty documents so the
# vectorizer does not fail on NaN)
X = vector.fit_transform(df_0826414346.processed_text.fillna(''))

# Create an NMF instance model
# 5 components will be the topics
# NOTE(review): for this book only 3 terms pass min_df, so 5 topics exceed the
# vocabulary size -- consider lowering n_components or relaxing min_df.
model = NMF(n_components=5, random_state=5)

# Fit the model to TF-IDF and get the document-topic weights in one step.
# Decomposes the TF-IDF matrix X into two matrices W (document-topic matrix,
# nmf_features) and H (topic-term matrix, model.components_).
nmf_features = model.fit_transform(X)

print(X.shape, nmf_features.shape, model.components_.shape)

# Create a DataFrame: df_components (one row per topic, one column per term)
df_components = pd.DataFrame(model.components_, columns=vector.get_feature_names_out())

# The weight of every term in each topic identified by NMF
print(df_components)

# Find the highest value in the whole matrix. This is done on the underlying
# NumPy array: np.max on a DataFrame returns per-column maxima on older pandas
# versions, which would make the lookup below report the wrong term/topic.
component_weights = df_components.to_numpy()
highest_value = component_weights.max()

# Find the indices (row, column) of the highest value
topic_index, term_index = np.where(component_weights == highest_value)

# Extract the corresponding topic and term (first match in row-major order on ties)
highest_topic = topic_index[0]
highest_term = df_components.columns[term_index[0]]

# Print the highest term and its associated topic
print(f"Highest term: {highest_term}, Associated topic: {highest_topic}")

9
(9, 3) (9, 5) (5, 3)
         dr       nel     seuss
0  0.115152  0.031311  0.634700
1  0.490181  0.000000  0.261520
2  0.273966  0.139841  0.014393
3  0.000000  0.593270  0.256339
4  0.000000  0.229767  0.722411
Highest term: seuss, Associated topic: 4


In [None]:
 # Topic 4 (the 'seuss'-heavy topic, holding the largest single weight in the topic-term matrix: 0.722411 on 'seuss') might be considered the best topic if your dataset is about Dr. Seuss books