In [None]:
#Topic Modeling with Latent Semantic Analysis
#https://towardsdatascience.com/topic-modeling-with-latent-semantic-analysis-58aeab6ab2f2

In [2]:

import pandas as pd
path_to_csv = r"C:\Users\User\Documents\BFO Berufsfachschule Oberwallis\Module\Modul Md259 - Machine Learning\Kursmaterial\Amazon musical instruments review\Musical_instruments_reviews.csv"
# load data
df = pd.read_csv(path_to_csv, usecols=['reviewerID', 'reviewText'])
df.head()

Unnamed: 0,reviewerID,reviewText
0,A2IBPI20UZIR0U,"Not much to write about here, but it does exac..."
1,A14VAT5EAX3D9S,The product does exactly as it should and is q...
2,A195EZSQDW3E21,The primary job of this device is to block the...
3,A2C00NNG1ZQQG2,Nice windscreen protects my MXL mic and preven...
4,A94QU4C90B1AX,This pop filter is great. It looks and perform...


In [3]:
df.isna()

Unnamed: 0,reviewerID,reviewText
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
10256,False,False
10257,False,False
10258,False,False
10259,False,False


In [4]:
df=df.dropna()

In [5]:

from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation \
                                        , preprocess_string, strip_short, stem_text

# preprocess given text
def preprocess(text):
    
    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(), 
                                remove_stopwords, 
                                strip_punctuation, 
                                strip_short, 
                                stem_text]
    text = preprocess_string(text, CUSTOM_FILTERS)
    
    return text

In [6]:
# apply function to all reviews 
df['Text (Clean)'] = df['reviewText'].apply(lambda x: preprocess(x))

In [7]:
df['Text (Clean)']

0        [write, here, exactli, suppos, filter, pop, so...
1        [product, exactli, afford, realiz, doubl, scre...
2        [primari, job, devic, block, breath, produc, p...
3        [nice, windscreen, protect, mxl, mic, prevent,...
4        [pop, filter, great, look, perform, like, stud...
                               ...                        
10256                          [great, expect, thank, all]
10257    [think, try, nanoweb, string, while, bit, high...
10258    [tri, coat, string, past, includ, elixir, fond...
10259    [well, elixir, develop, taylor, guitar, string...
10260    [string, good, wouldn, perfect, unwound, strin...
Name: Text (Clean), Length: 10254, dtype: object

In [8]:
# preview of dataset
df.head()

Unnamed: 0,reviewerID,reviewText,Text (Clean)
0,A2IBPI20UZIR0U,"Not much to write about here, but it does exac...","[write, here, exactli, suppos, filter, pop, so..."
1,A14VAT5EAX3D9S,The product does exactly as it should and is q...,"[product, exactli, afford, realiz, doubl, scre..."
2,A195EZSQDW3E21,The primary job of this device is to block the...,"[primari, job, devic, block, breath, produc, p..."
3,A2C00NNG1ZQQG2,Nice windscreen protects my MXL mic and preven...,"[nice, windscreen, protect, mxl, mic, prevent,..."
4,A94QU4C90B1AX,This pop filter is great. It looks and perform...,"[pop, filter, great, look, perform, like, stud..."


In [9]:
from gensim import corpora

# create a dictionary with the corpus
corpus = df['Text (Clean)']
dictionary = corpora.Dictionary(corpus)

# convert corpus into a bag of words
bow = [dictionary.doc2bow(text) for text in corpus]

In [10]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics
for i in range(2,11):
    lsi = LsiModel(bow, num_topics=i, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsi, texts=df['Text (Clean)'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.45969944165248133
Coherence score with 3 clusters: 0.4356385369065509
Coherence score with 4 clusters: 0.3767490046089955
Coherence score with 5 clusters: 0.38116016751232956
Coherence score with 6 clusters: 0.3999360962341283
Coherence score with 7 clusters: 0.3959616405974214
Coherence score with 8 clusters: 0.3923282758009553
Coherence score with 9 clusters: 0.40208198883710033
Coherence score with 10 clusters: 0.4003625441754369


In [11]:
# perform SVD on the bag of words with the LsiModel to extract 2 topics
lsi = LsiModel(bow, num_topics=2, id2word=dictionary)

In [12]:
# find the 5 words with the srongest association to the derived topics
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.329*"sound" + 0.314*"guitar" + 0.242*"string" + 0.232*"pedal" + 0.217*"amp".
Words in 1: 0.584*"string" + -0.428*"pedal" + 0.380*"guitar" + -0.312*"amp" + 0.161*"tune".


In [13]:
# find the scores given between the review and each topic
corpus_lsi = lsi[bow]
score1 = []
score2 = []
for doc in corpus_lsi:
    score1.append(round(doc[0][1],2))
    score2.append(round(doc[1][1],2))

# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['reviewText']
df_topic['Topic 0 score'] = score1
df_topic['Topic 1 score'] = score2
df_topic['Topic']= df_topic[['Topic 0 score', 'Topic 1 score']].apply(lambda x: x.argmax(), axis=1)
df_topic.head(1)

Unnamed: 0,Text,Topic 0 score,Topic 1 score,Topic
0,"Not much to write about here, but it does exac...",0.88,-0.22,0


In [15]:
df_topic.head(10)

Unnamed: 0,Text,Topic 0 score,Topic 1 score,Topic
0,"Not much to write about here, but it does exac...",0.88,-0.22,0
1,The product does exactly as it should and is q...,1.19,-0.08,0
2,The primary job of this device is to block the...,1.03,-0.21,0
3,Nice windscreen protects my MXL mic and preven...,0.38,0.02,0
4,This pop filter is great. It looks and perform...,0.84,-0.2,0
5,So good that I bought another one. Love the h...,1.24,-0.21,0
6,"I have used monster cables for years, and with...",0.63,-0.04,0
7,I now use this cable to run from the output of...,3.95,-1.16,0
8,Perfect for my Epiphone Sheraton II. Monster ...,0.39,-0.05,0
9,Monster makes the best cables and a lifetime w...,1.01,0.26,0


In [14]:
# find a sample review from each topic
df_topic0 = df_topic[df_topic['Topic'] == 0]
df_topic1 = df_topic[df_topic['Topic']==1]
print('Sample text from topic 0:\n {}'.format(df_topic0.sample(1, random_state=2)['Text'].values))
print('\nSample text from topic 1:\n {}'.format(df_topic1.sample(1, random_state=2)['Text'].values))

Sample text from topic 0:
 ["This holder is spring loaded so it makes inserting and removing picks easy. It has a little sticky tab on the back so you can stick it to something (such at the side of back of your instrument) so you always know where your picks are and always have them on hand. I don't know exactly how many it holds, and the thickness of the picks makes a difference, but I've crammed quite a few of varying sizes in here without issues."]

Sample text from topic 1:
 ["For the money there isn't a better tuner on the market today.  This is the newer version which some have complained about the tuning lines being thicker, really?  This tuner is s[pot on every time and super fast.  You can move from string to string quick."]
