# Topic Mining from Reviews or Text 📚

This application takes a collection of reviews or other texts and mines topics from the corpus.

**Usage**
- Please wait for the app to load.
- Select one of the demo datasets, or choose 'Upload' and upload your own, then click the 'Start mining' button.
- An uploaded dataset must be a UTF-8 text file (`.txt`) with one review per line.


Cong Chen | congc4@illinois.edu

***

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from gensim import models
from gensim import matutils
import ipywidgets as widgets
from IPython.display import display
import codecs

In [2]:
def run_lda(text, num_topics=10, num_features=500, num_display_words=10):
    """Mine topics from a collection of documents using LDA on TF-IDF features.

    Progress messages are printed into the module-level ``output`` widget.

    Args:
        text: Iterable of document strings (one review per item).
        num_topics: Number of topics requested from the LDA model.
        num_features: Upper bound on the vocabulary size kept by the
            TF-IDF vectorizer.
        num_display_words: Number of words reported for each topic.

    Returns:
        A list with one entry per topic; each entry is the list of words
        reported for that topic by ``lda.show_topics``.
    """
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=num_features,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
    with output: print("Creating text representation ...")
    X = vectorizer.fit_transform(text)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # gensim needs a column-index -> word mapping; str() normalises any
    # numpy string scalars to plain Python strings.
    id2words = {index: str(word) for index, word in enumerate(feature_names)}

    with output: print("Building corpus ...")
    # X has one row per document, hence documents_columns=False.
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)

    with output: print("Running topic mining algorithm ...")
    lda = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=id2words)

    # With formatted=False each topic is (topic_id, [(word, weight), ...]);
    # only the words are kept.
    topics = lda.show_topics(num_topics=num_topics,
                             num_words=num_display_words,
                             formatted=False)
    return [[term[0] for term in item[1]] for item in topics]

In [3]:
def getHTMLTable(output_text):
    """Render mined topics as an HTML table string.

    Args:
        output_text: Iterable of topics. Each item is rendered with
            ``str()``, so a list of keywords appears as its list repr.

    Returns:
        An HTML string made of an inline ``<style>`` block followed by a
        two-column table: the 1-based topic number and its keywords.
    """
    # Imported locally because the file's top-level imports do not
    # include the ``html`` module.
    from html import escape

    # CSS style generated using https://divtable.com/table-styler/
    table_head = '''
    <style type="text/css">
        table.minimalistBlack {
          border: 1px solid #000000;
          width: 100%;
          text-align: left;
          border-collapse: collapse;
        }
        table.minimalistBlack td, table.minimalistBlack th {
          border: 1px solid #000000;
          padding: 4px 4px;
        }
        table.minimalistBlack tbody td {
          font-size: 13px;
        }
        table.minimalistBlack thead {
          background: #CFCFCF;
          background: -moz-linear-gradient(top, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          background: -webkit-linear-gradient(top, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          background: linear-gradient(to bottom, #dbdbdb 0%, #d3d3d3 66%, #CFCFCF 100%);
          border-bottom: 2px solid #000000;
        }
        table.minimalistBlack thead th {
          font-size: 15px;
          font-weight: bold;
          color: #000000;
          text-align: left;
        }
        table.minimalistBlack tfoot td {
          font-size: 14px;
        }
    </style>
    
    <table class="minimalistBlack">
        <tr>
            <th>Topic</th>
            <th>Keywords</th>
        </tr>
        '''
    row_template = '''
        <tr>
            <td>{}</td>
            <td>{}</td>
        </tr>
        '''
    # Escape &, < and > so keyword text cannot inject markup into the page.
    # quote=False leaves quotes alone, keeping the list repr readable.
    rows = [
        row_template.format(number, escape(str(topic), quote=False))
        for number, topic in enumerate(output_text, start=1)
    ]
    return table_head + "".join(rows) + "</table>"

In [4]:
# Set up uploader events
def on_upload(change):
    '''
    Triggered when a text file is uploaded.

    Replaces the contents of the shared ``text`` list with the non-blank
    lines of the first uploaded file (decoded as UTF-8), then enables the
    mining button and updates the status message.
    '''
    print("start read in file")

    uploaded = uploader.value
    # ipywidgets 7 exposes the upload as a dict keyed by file name, while
    # ipywidgets 8 exposes a tuple of per-file dicts; accept both shapes.
    if isinstance(uploaded, dict):
        files = list(uploaded.values())
    else:
        files = list(uploaded)
    if not files:
        # The value is empty (e.g. the widget was reset): nothing to read.
        return

    # bytes() also accepts a memoryview payload.
    temp = bytes(files[0]['content']).decode("utf-8")

    # Replace, rather than extend, the dataset so a previously loaded corpus
    # is not mixed into this one. Trailing "\r" from Windows line endings is
    # stripped and blank lines are skipped (one review per line).
    text.clear()
    for line in temp.split("\n"):
        review = line.rstrip("\r")
        if review.strip():
            text.append(review)

    button.disabled = False
    display(button)

    status.clear_output()
    with status: print("Dataset uploaded! Click 'Start Mining'")

    return

In [5]:
def on_button_clicked(change):
    '''
    Triggered when the mining button is clicked.

    Runs topic mining on the shared ``text`` list and puts the resulting
    HTML table into ``topicsOutput``. The button is disabled while mining
    runs. ``change`` is the argument supplied by ``button.on_click`` and is
    not used.
    '''
    button.disabled = True
    display(button)

    print("start topic mining")
    try:
        output_text = run_lda(text)
        topicsOutput.value = getHTMLTable(output_text)
    finally:
        # Re-enable the button even if mining fails; otherwise a single
        # error would leave it disabled for good.
        button.disabled = False
        display(button)

    display(topicsOutput)

    with output:
        print("✅ DONE! Scroll down ⬇")

In [29]:
def on_rbutton_change(change):
    '''
    Triggered when radio button state is changed.

    For a demo selection, loads the matching review file into the shared
    ``text`` list, disables the uploader and enables the mining button.
    For any other selection ('Upload'), enables the uploader and disables
    the mining button until a file has been uploaded.
    '''
    selection = change["new"]

    # Demo option label -> review file opened relative to the working directory.
    mapping = {"Demo 1: Breweries Reviews":"Breweries.txt",
               "Demo 2: Coffee & Tea Reviews":"Coffee_&_Tea.txt",
               "Demo 3: Fish & Chips Reviews":"Fish_&_Chips.txt"}

    # Drop the previously loaded dataset so reviews from different
    # selections are never mixed together.
    text.clear()

    # Reset the progress log and the topics table
    output.clear_output()
    topicsOutput.value =''
    display(topicsOutput)

    # Update status widget
    status.clear_output()
    with status: print("Updating dataset..")

    if selection in mapping:
        # Disable the uploader
        uploader.disabled = True
        display(uploader)

        # Read in text: one review per line
        with open(mapping[selection], 'r', encoding="utf-8") as f:
            text.extend(f.readlines())

        # Enable processing button
        button.disabled = False
        display(button)

        status.clear_output()
        with status: print("Dataset updated! Click 'Start Mining'")

    else: # upload
        # Enable the uploader
        uploader.disabled = False
        display(uploader)
        # Disable processing button until a file has been uploaded
        button.disabled = True
        display(button)

        status.clear_output()
        with status: print("Select a dataset and wait for the 'Dataset Uploaded' message.")

## Load Data

In [24]:
# Shared corpus: the dataset handlers append one review string per entry,
# and the mining button handler passes this list to run_lda.
text = []

In [30]:
# Input widgets
uploader = widgets.FileUpload( accept='.txt', multiple=False, disabled=False)

rbutton = widgets.RadioButtons(
    options=['Demo 1: Breweries Reviews','Demo 2: Coffee & Tea Reviews','Demo 3: Fish & Chips Reviews', 'Upload'],
    description='Reviews',
    disabled=False,
    value='Upload',
    layout={'width': 'max-content'}
)

button = widgets.Button(
    description='Start mining',
    # Starts disabled: no dataset is loaded yet. on_upload and
    # on_rbutton_change enable it once a dataset is available.
    disabled=True,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Mine topics from the selected dataset',
)

# status: dataset loading messages; output: mining progress log;
# topicsOutput: HTML table of mined topics.
status = widgets.Output()
output = widgets.Output()
topicsOutput = widgets.HTML()


display(rbutton,uploader,status)

# Wire the widgets to their event handlers
rbutton.observe(on_rbutton_change,"value")
uploader.observe(on_upload, names='value')
button.on_click(on_button_clicked)

RadioButtons(description='Reviews', layout=Layout(width='max-content'), options=('Demo 1', 'Demo 2', 'Demo 3',…

FileUpload(value={}, accept='.txt', description='Upload')

Output()

Coffee_&_Tea.txt


FileUpload(value={}, accept='.txt', description='Upload', disabled=True)

Button(button_style='info', description='Start mining', style=ButtonStyle(), tooltip='Process the image to cre…

Button(button_style='info', description='Start mining', disabled=True, style=ButtonStyle(), tooltip='Process t…

start topic mining


Button(button_style='info', description='Start mining', style=ButtonStyle(), tooltip='Process the image to cre…

HTML(value='\n    <style type="text/css">\n        table.minimalistBlack {\n          border: 1px solid #00000…

In [11]:
# Show the mining button in this cell's output.
display(button)

Button(button_style='info', description='Start mining', style=ButtonStyle(), tooltip='Process the image to cre…

***
## Processing Results

In [31]:
# Show the mining progress log and the topics table in this cell's output.
display(output,topicsOutput)

Output(outputs=({'output_type': 'stream', 'text': 'Creating text representation ...\n', 'name': 'stdout'},))

HTML(value='\n    <style type="text/css">\n        table.minimalistBlack {\n          border: 1px solid #00000…