# Topic Mining from Reviews or Text 📚

This application takes a collection of reviews or other texts and mines topics from the corpus.

**Usage**
- Please wait for the app to load.
- Select one of the demo datasets or upload your own.
- An uploaded dataset must be a plain-text (UTF-8) file with one review per line.
- Pick the number of topics and the number of keywords per topic (the default is 10 for both).
- Then click on the 'Start Mining' button below.  


Cong Chen | congc4@illinois.edu

***

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from gensim import models
from gensim import matutils
import ipywidgets as widgets
from IPython.display import display
import codecs

In [2]:
def run_lda(text):
    """Mine topics from ``text`` with gensim's LDA model.

    The text is split on newlines (one document per line), turned into a
    TF-IDF matrix capped at 500 features and passed to ``LdaModel``.
    Progress messages are printed into the ``output`` widget.

    The number of topics and of keywords per topic are read from the
    ``num_topics`` and ``num_words`` widgets; a value of 0 falls back to 10.

    Args:
        text: The whole corpus as a single string, one review per line.

    Returns:
        A list with one entry per topic, each entry being the list of
        keyword strings reported for that topic.
    """
    numfeatures = 500
    # A widget value of 0 is falsy and selects the default of 10.
    num_display_words = num_words.value or 10
    K_clusters = num_topics.value or 10

    text_split = text.split("\n")

    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
    with output:
        print("Creating text representation ...")
    X = vectorizer.fit_transform(text_split)
    with output:
        print("Building corpus ...")

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    # Mapping from feature id to the actual word, as expected by gensim.
    id2words = dict(enumerate(feature_names()))

    # Rows of X are documents, hence documents_columns=False.
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    with output:
        print("Running topic mining algorithm ...")
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=id2words)

    topics = lda.show_topics(num_topics=K_clusters,
                             num_words=num_display_words,
                             formatted=False)
    # The second element of each topic holds its terms; the first element of
    # each term is the word itself.
    return [[term[0] for term in item[1]] for item in topics]

In [26]:
def run_lsi(text):
    """Mine topics from ``text`` with gensim's LSI model.

    The text is split on newlines (one document per line), turned into a
    TF-IDF matrix capped at 200 features and passed to ``LsiModel``.
    Progress messages are printed into the ``output`` widget.

    The number of topics and of keywords per topic are read from the
    ``num_topics`` and ``num_words`` widgets; a value of 0 falls back to 10.

    Args:
        text: The whole corpus as a single string, one review per line.

    Returns:
        A list with one entry per topic, each entry being the list of
        keyword strings reported for that topic.
    """
    numfeatures = 200
    # A widget value of 0 is falsy and selects the default of 10.
    num_display_words = num_words.value or 10
    K_clusters = num_topics.value or 10

    text_split = text.split("\n")
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                 min_df=2, stop_words='english',
                                 use_idf=True)

    with output:
        print("Creating text representation ...")
    X = vectorizer.fit_transform(text_split)
    with output:
        print("Building corpus ...")

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    # Mapping from feature id to the actual word, as expected by gensim.
    id2words = dict(enumerate(feature_names()))

    # Rows of X are documents, hence documents_columns=False.
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    with output:
        print("Running topic mining algorithm ...")

    lsi = models.LsiModel(corpus, num_topics=K_clusters, id2word=id2words)

    topics = lsi.show_topics(num_topics=K_clusters,
                             num_words=num_display_words,
                             formatted=False)
    # The second element of each topic holds its terms; the first element of
    # each term is the word itself.
    return [[term[0] for term in item[1]] for item in topics]

In [3]:
def getHTMLTable(output_text):
    """Render mined topics as a styled HTML table.

    Args:
        output_text: Iterable of topics. Each topic is rendered with
            ``str()`` in the "Keywords" column, next to its 1-based index
            in the "Topic" column.

    Returns:
        An HTML string made of a ``<style>`` block followed by the table.
    """
    # Imported locally (and by name) so the function is self-contained and
    # no local variable shadows the ``html`` module.
    from html import escape

    # CSS style generated using https://divtable.com/table-styler/
    header = '''
    <style type="text/css">
        table.minimalistBlack {
          border: 1px solid #000000;
          width: 100%;
          text-align: left;
          border-collapse: collapse;
        }
        table.minimalistBlack td, table.minimalistBlack th {
          border: 1px solid #000000;
          padding: 4px 4px;
        }
        table.minimalistBlack tbody td {
          font-size: 13px;
        }
        table.minimalistBlack thead {
          background: #CFCFCF;
          background: -moz-linear-gradient(top, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          background: -webkit-linear-gradient(top, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          background: linear-gradient(to bottom, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          border-bottom: 2px solid #000000;
        }
        table.minimalistBlack thead th {
          font-size: 15px;
          font-weight: bold;
          color: #000000;
          text-align: left;
        }
        table.minimalistBlack tfoot td {
          font-size: 14px;
        }
    </style>
    
    <table class="minimalistBlack">
        <tr>
            <th>Topic</th>
            <th>Keywords</th>
        </tr>
        '''
    row_template = '''
        <tr>
            <td>{}</td>
            <td>{}</td>
        </tr>
        '''
    # Keywords originate from user-supplied text, so escape markup characters.
    # quote=False leaves quotes alone: they are harmless in element content
    # and the output for ordinary words stays unchanged.
    rows = [
        row_template.format(index, escape(str(topic), quote=False))
        for index, topic in enumerate(output_text, start=1)
    ]
    return header + "".join(rows) + "</table>"

In [4]:
# Set up uploader events
def on_upload(change):
    '''
    Triggered when a text file is uploaded.

    Decodes the uploaded file as UTF-8, stores the text in ``textDisplay``,
    enables the mining button and reports progress in the ``status`` widget.
    If the file is not valid UTF-8 the button is disabled and an error
    message is shown instead.
    '''
    uploaded = uploader.value
    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of
    # per-file dicts; normalise both to a list of per-file dicts.
    if isinstance(uploaded, dict):
        files = list(uploaded.values())
    else:
        files = list(uploaded)
    if not files:
        # NOTE(review): the observer may presumably also fire when the value
        # is emptied; there is nothing to load in that case.
        return

    textDisplay.value = ""
    status.clear_output()
    with status:
        print("Updating dataset..")

    try:
        # bytes() also accepts the memoryview that ipywidgets 8 provides.
        temp = codecs.decode(bytes(files[0]['content']), encoding="utf-8")
    except UnicodeDecodeError:
        # Without text there is nothing to mine, so keep the button disabled.
        button.disabled = True
        status.clear_output()
        with status:
            print("Could not read the file: it must be UTF-8 encoded text.")
        return

    textDisplay.value = temp

    button.disabled = False
    display(button)

    status.clear_output()
    with status:
        print("Dataset uploaded! Set parameters and click 'Start Mining'")

In [5]:
def on_button_clicked(change):
    """Run topic mining on the loaded text and show the resulting table.

    The model is chosen from the ``dropdown`` widget (1 = LDA, otherwise
    LSI). The button is disabled while mining runs and re-enabled afterwards,
    even if mining raises.

    Args:
        change: Argument supplied by ``Button.on_click``; unused.
    """
    button.disabled = True
    display(button)

    output.clear_output()
    topicsOutput.value = ''
    display(topicsOutput)

    # Keep the start message with the other progress messages.
    with output:
        print("start topic mining")

    try:
        # The dropdown options carry the integer values 1 (LDA) and 2 (LSI);
        # comparing against the string "1" never matched, so LDA never ran.
        if dropdown.value == 1:
            output_text = run_lda(textDisplay.value)
        else:
            output_text = run_lsi(textDisplay.value)

        topicsOutput.value = getHTMLTable(output_text)
    finally:
        # Re-enable the button on failure too, so the user can retry.
        button.disabled = False
        display(button)

    display(topicsOutput)

    with output:
        print("✅ DONE! Scroll down ⬇")

In [6]:
def on_rbutton_change(change):
    '''
    Triggered when radio button state is changed.

    Clears the loaded text and any previous results. For a demo selection
    the matching file is read into ``textDisplay``, the uploader is disabled
    and the mining button enabled; for any other selection the uploader is
    enabled and the mining button disabled.
    '''
    demo_files = {
        "Demo 1: Breweries Reviews": "Breweries.txt",
        "Demo 2: Coffee & Tea Reviews": "Coffee_&_Tea.txt",
        "Demo 3: Fish & Chips Reviews": "Fish_&_Chips.txt",
    }
    chosen = change["new"]

    textDisplay.value = ""

    # Reset the progress log and the results table
    output.clear_output()
    topicsOutput.value = ''
    display(topicsOutput)

    # Update status widget
    status.clear_output()
    with status:
        print("Updating dataset..")

    demo_path = demo_files.get(chosen)

    if demo_path is None:
        # Upload mode: enable the uploader, disable the processing button
        uploader.disabled = False
        display(uploader)
        button.disabled = True
        display(button)

        status.clear_output()
        with status:
            print("Select a dataset and wait for the 'Dataset Uploaded' message.")
        return

    # Demo mode: disable the uploader
    uploader.disabled = True
    display(uploader)

    # Read in the demo text
    with open(demo_path, 'r', encoding="utf-8") as demo_file:
        textDisplay.value = demo_file.read()

    # Enable processing button
    button.disabled = False
    display(button)

    status.clear_output()
    with status:
        print("Dataset updated! Set parameters and click 'Start Mining'")

In [7]:
def on_slider_change(change):
    """Store the newly selected slider value in ``num_topics``."""
    selected = change['new']
    num_topics.value = selected

def on_slider_word_change(change):
    """Store the newly selected slider value in ``num_words``."""
    selected = change['new']
    num_words.value = selected

def on_dropdown_change(change):
    """Store the newly selected dropdown value in ``model_type``."""
    selected = change['new']
    model_type.value = selected

## Load Data

In [23]:
# Widgets for the mining parameters.
# The sliders start at 1: run_lda/run_lsi treat a value of 0 as "unset" and
# would silently fall back to 10 topics/words.
slider = widgets.IntSlider(min=1, max=20, step=1, value=10, description='Topics#')
# Holders read by run_lda/run_lsi; seeded from the sliders so both agree
# before the first slider change.
num_topics = widgets.IntText(value=slider.value)
slider_word = widgets.IntSlider(min=1, max=20, step=1, value=10, description='Word#')
num_words = widgets.IntText(value=slider_word.value)
# Option values are integers: 1 selects LDA, 2 selects LSI.
dropdown = widgets.Dropdown(
    options=[('LDA', 1), ('LSI', 2)],
    value=1,
    description='Model:'
)
# IntText holds an integer, so initialise it with one rather than a string.
model_type = widgets.IntText(value=1)

In [24]:
# Input widgets: file uploader, holder for the loaded text, dataset selector.
uploader = widgets.FileUpload(accept='.txt', multiple=False, disabled=False)
# Holds the loaded corpus; it is read by on_button_clicked.
textDisplay = widgets.Text(value="Select a dataset and wait for the 'Dataset Uploaded' message.")
rbutton = widgets.RadioButtons(
    options=['Demo 1: Breweries Reviews','Demo 2: Coffee & Tea Reviews','Demo 3: Fish & Chips Reviews', 'Upload'],
    description='Reviews',
    disabled=False,
    value='Upload',
    layout={'width': 'max-content'}
)

# Starts disabled; on_upload / on_rbutton_change enable it once text is loaded.
button = widgets.Button(
    description='Start mining',
    disabled=True,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    # The previous tooltip described an unrelated image/book application.
    tooltip='Mine topics from the selected dataset',
)

# status: dataset loading messages; output: mining progress log;
# topicsOutput: HTML table with the mined topics.
status = widgets.Output()
with status:
    print("Select a dataset to upload or choose a demo and wait for the 'Dataset Uploaded' message.")
output = widgets.Output()
topicsOutput = widgets.HTML()


display(rbutton,uploader,status)

# Wire the widgets to their callbacks.
rbutton.observe(on_rbutton_change, names='value')
uploader.observe(on_upload, names='value')
button.on_click(on_button_clicked)
slider.observe(on_slider_change, names='value')
slider_word.observe(on_slider_word_change, names='value')
dropdown.observe(on_dropdown_change, names='value')

display(dropdown,slider,slider_word)

RadioButtons(description='Reviews', index=3, layout=Layout(width='max-content'), options=('Demo 1: Breweries R…

FileUpload(value={}, accept='.txt', description='Upload')

Output()

Dropdown(description='Model:', options=(('LDA', 1), ('LSI', 2)), value=1)

IntSlider(value=10, description='Topics#', max=20)

IntSlider(value=10, description='Word#', max=20)

IntText(value=2)
IntText(value=1)


In [14]:
# Show the 'Start mining' button; it is created disabled and is enabled by
# on_upload / on_rbutton_change once a dataset has been loaded.
display(button)

Button(button_style='info', description='Start mining', disabled=True, style=ButtonStyle(), tooltip='Process t…

***
## Processing Results

In [31]:
# Show the mining progress log (output) and the topics table (topicsOutput);
# on_button_clicked fills both.
display(output,topicsOutput)

Output(outputs=({'output_type': 'stream', 'text': 'Creating text representation ...\n', 'name': 'stdout'},))

HTML(value='\n    <style type="text/css">\n        table.minimalistBlack {\n          border: 1px solid #00000…