# LDA with Sentences

In [1]:
# TODO: Add a coherence matrix of k by score (e.g. k = 1, Coherence score: 0.8242367870330447)
# TODO: Add a graph of k by score (see 30-NMF-v2.ipynb)

In [2]:
import pandas as pd
import os
import csv
import nltk # If this step fails, rerun 07-Install-NLTK.ipynb
import string

# Directory that receives every artifact this notebook writes
# (the HTML table and chart cells below build their paths from it)
working_directory = './32-LDA-with-Sentences'

# Create the working directory if needed.
# exist_ok=True makes this safe to re-run; an OSError is only printed, not
# re-raised, so later cells that write into the directory may still fail.
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [3]:
# Range of topic counts (k) evaluated by the coherence sweep
min_topics = 1
max_topics = 25

# Number of training passes for LDA (passed to LdaModel as `passes`)
passes = 10

# The number of top words per topic (used in the topics table column label)
num_top_words = 10

# Row label in sentences_df that is printed as a before/after spot check
# of the sentence processing
verification_row = 9

In [4]:
# Install or upgrade the third-party packages into the kernel's environment.
# NOTE(review): versions are unpinned and --upgrade is used, so a re-run may
# pick up different releases — consider pinning for reproducibility.
%pip install --quiet --upgrade nltk gensim spacy pyldavis sentence-transformers hdbscan mpld3

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Download the NLTK 'all' collection and the small English spaCy model
# (spaCy's stdout is discarded by the redirect to /dev/null).
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which
# may not be the kernel's interpreter — confirm they match.
!python -m nltk.downloader --quiet 'all'
!python -m spacy download en_core_web_sm > /dev/null



In [6]:
# Read the preprocessed data into a dataframe
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
import pickle

with open('./21-Preprocess-Combined-Data-v2/dataframe.pickle', 'rb') as f:
    df = pickle.load(f)

In [7]:
# Show the (rows, columns) shape of the loaded dataframe
df.shape

(5736, 20)

In [8]:
# Preview the first row to see which columns are available
df.head(1)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE,TOKENIZED_TEXT,NOPUNCT_TEXT,NOSTOPWORDS_TEXT,NODIGITS_TEXT,POS_TEXT,LEMMATIZED_TEXT,STEMMED_TEXT
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER,"[it, was, reported, that, the, transmitter, lo...","[it, was, reported, that, the, transmitter, lo...","[reported, transmitter, lost, connection, pump...","[reported, transmitter, lost, connection, pump...","[(reported, VBN), (transmitter, NN), (lost, VB...","[report, transmitter, lose, connection, pump, ...","[report, transmitt, lost, connect, pump, great..."


In [9]:
# Build a DataFrame with one row per sentence of every report narrative
# (FOI_TEXT), carrying the parent report's identifying columns along.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and calling it inside the loop copies the whole frame for every sentence.
sentence_columns = [
    'SENTENCE',
    'ROW_ID',
    'FOI_TEXT',
    'DEVICE_PROBLEM_CODE',
    'DEVICE_PROBLEM_TEXT'
]

sentence_records = []

# Iterate over every row of the source DataFrame
for index, row in df.iterrows():
    for sentence in nltk.sent_tokenize(row['FOI_TEXT']):
        sentence_records.append({
            'SENTENCE': sentence,
            'ROW_ID': row['ROW_ID'],
            'FOI_TEXT': row['FOI_TEXT'],
            'DEVICE_PROBLEM_CODE': row['DEVICE_PROBLEM_CODE'],
            'DEVICE_PROBLEM_TEXT': row['DEVICE_PROBLEM_TEXT']
        })

# Passing `columns` fixes the column order and keeps the columns present even
# when no sentences were collected.
sentences_df = pd.DataFrame(sentence_records, columns=sentence_columns)

In [10]:
# Show the (rows, columns) shape of the per-sentence dataframe
sentences_df.shape

(25686, 5)

In [11]:
# Preview the first rows of the per-sentence dataframe
sentences_df.head()

Unnamed: 0,SENTENCE,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT
0,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem
1,THE TRANSMITTER ULTIMATELY REGAINED CONNECTION...,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem
2,NO ADDITIONAL PATIENT OR EVENT INFORMATION WAS...,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem
3,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem
4,NO PRODUCT OR DATA WAS PROVIDED FOR EVALUATION.,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem


In [12]:
# Process the sentences
import re
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance (used by process_sentence)
lemmatizer = WordNetLemmatizer()

# English stop words as a set, for fast membership tests
stop_words = set(stopwords.words('english'))

# Set of the characters in string.punctuation.
# NOTE(review): not referenced by any code visible in this notebook —
# confirm it is unused before removing.
punctuations = set(string.punctuation)

def process_sentence(sentence):
    """Normalize one sentence for topic modeling.

    Steps, in order:
      1. lowercase the sentence;
      2. remove every word that starts with a digit;
      3. split on whitespace;
      4. strip punctuation from each token and drop tokens that end up empty;
      5. drop stop words;
      6. lemmatize each remaining token as a verb;
      7. re-join the tokens with single spaces.

    Parameters
    ----------
    sentence : str
        The raw sentence text.

    Returns
    -------
    str
        The processed sentence; an empty string when no token survives.
    """
    # Build the punctuation-removal table once per call instead of per token
    punctuation_table = str.maketrans("", "", string.punctuation)

    # Lowercase the sentence
    sentence = sentence.lower()

    # Remove any words that start with a digit
    sentence = re.sub(r'\b\d\w*\b', '', sentence)

    lemmatized_tokens = []
    for raw_token in sentence.split():
        # Remove punctuation
        token = raw_token.translate(punctuation_table)

        # Punctuation-only tokens (e.g. "-") become empty; keeping them would
        # leave stray empty entries and doubled spaces in the rebuilt sentence
        if not token:
            continue

        # Remove stopwords. The token is also checked with only its edge
        # punctuation trimmed, because NLTK's list spells contractions with
        # the apostrophe ("don't"), which the fully stripped form ("dont")
        # would no longer match.
        if token in stop_words or raw_token.strip(string.punctuation) in stop_words:
            continue

        # Lemmatize (treating every token as a verb)
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos='v'))

    # Rebuild the sentence
    return ' '.join(lemmatized_tokens)

# Run process_sentence on every sentence and store the result in a new column
sentences_df['PROCESSED_SENTENCE'] = sentences_df['SENTENCE'].apply(process_sentence)

In [19]:
# Spot check: print the original sentence of the verification row next to its processed form
print(f"'{sentences_df['SENTENCE'][verification_row]}' ==> '{sentences_df['PROCESSED_SENTENCE'][verification_row]}'")

'THE PROBABLE CAUSE COULD NOT BE DETERMINED.' ==> 'probable cause could determine'


In [None]:
import gensim
import tqdm  # was used below without being imported (NameError on a fresh kernel)
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

step_size = 1

# max_topics is treated as inclusive, so the range and the progress bar
# cover exactly the values of k that are trained.
topics_range = range(min_topics, max_topics + 1, step_size)

# Fixed seed so that repeated runs of the sweep train the same models
lda_random_state = 42

# split each sentence into a list of words
texts = sentences_df['PROCESSED_SENTENCE'].apply(nltk.word_tokenize).tolist()

# Create a dictionary from the words
dictionary = corpora.Dictionary(texts)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Collect one record per k and build the scoreboard once at the end
# (DataFrame.append was removed in pandas 2.0, and appending a Series of
# [k, score] turned the integer k into a float).
coherence_records = []

for k in tqdm.tqdm(topics_range, desc='LDA coherence sweep'):

    # Train the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        passes=passes,
        random_state=lda_random_state
    )

    # Score the model with the c_v coherence measure
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    coherence_score = coherence_model.get_coherence()

    coherence_records.append({'k': k, 'Coherence Score': coherence_score})

# Initialize the scoreboard from the collected records
coherence_scores_df = pd.DataFrame(coherence_records, columns=['k', 'Coherence Score'])
coherence_scores_df

k = 1
k = 2
k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


k = 16
k = 17
k = 18
k = 19
k = 20
k = 21


In [None]:
# Render the coherence scoreboard as an HTML table (without the index column)
# and write it into the working directory
coherence_table_html = coherence_scores_df.to_html(index=False)

with open(f"{working_directory}/lda_using_sentences_coherence_scores_table.html", 'w') as f:
    f.write(coherence_table_html)

In [None]:
import plotly.express as px

# Line chart of coherence score against the number of topics (k).
# The title and output file name say "LDA" to match this notebook and the
# table file name; they previously said "NMF".
fig_1 = px.line(
    coherence_scores_df,
    x='k',
    y='Coherence Score',
    title=f"LDA using Sentences - Coherence Score | {working_directory}\n"
)

# add markers for each point
fig_1.update_traces(mode='lines+markers')

# extend the limits of the x-axis from 0 to max_topics + 1
fig_1.update_xaxes(range=[0, max_topics + 1])

# show all numbers on the x-axis
fig_1.update_layout(xaxis=dict(tickmode='linear'))

# write the graph to a file in the working directory
fig_1.write_html(f"{working_directory}/lda_using_sentences_coherence_scores_chart.html")

# show the plot
fig_1.show()

In [None]:
# Select the k with the highest coherence score

# Sort the dataframe by Coherence Score in descending order
sorted_df = coherence_scores_df.sort_values(by='Coherence Score', ascending=False)

# Pick the value of k from the first row of the sorted dataframe.
# A single row mixes the k and score columns into one float Series, so the
# value comes back as e.g. 5.0; k is a topic count, so cast it to int.
selected_k = int(sorted_df.iloc[0]['k'])

print(f"Highest Coherence Score: {selected_k} = {sorted_df.iloc[0]['Coherence Score']}")

In [None]:
# Empty table that will hold one row per topic with its top words
topics_df = pd.DataFrame(columns=['Topic', f"Top {num_top_words} Words"])

# Create an LDA model using the selected k.
# - num_topics is cast to int because selected_k may arrive as a float
#   (it is read from a dataframe row).
# - random_state fixes the seed so re-running this cell yields the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=int(selected_k),
    passes=passes,
    random_state=42
)

In [16]:
import fnmatch
import html
import os
import subprocess
from urllib.parse import quote

# Target S3 bucket and the static-website endpoint that serves it
bucket = "praxis-2023-html-output"
website = f"http://{bucket}.s3-website-us-west-2.amazonaws.com"

# Use the fnmatch module to find all files under the current directory
# (recursively) that end in ".html"
file_list = []
for root, dirnames, filenames in os.walk("."):
    for filename in fnmatch.filter(filenames, '*.html'):
        file_list.append(os.path.join(root, filename))

# Sort the file list alphabetically
file_list.sort()

# Create the HTML file and write the header
with open(os.path.join(".", 'index.html'), 'w') as f:
    f.write('''<html>
        <head>
            <title>Praxis 2023 HTML Output</title>
            <style>
                table {
                    border-collapse: collapse;
                    width: 100%;
                }
                th, td {
                    text-align: left;
                    padding: 8px;
                }
                th {
                    background-color: #007bff;
                    color: #fff;
                    font-weight: bold;
                }
                tr:nth-child(even) {
                    background-color: #f2f2f2;
                }
                tr:hover {
                    background-color: #ddd;
                }
            </style>
        </head>
        <body>
            <table>
                <tr><th>Name</th><th>Size</th></tr>\n
    ''')

    # Loop through each file and add a row to the table
    for file_name in file_list:
        # The index page does not list itself (normpath makes the check
        # independent of the "./" prefix and of the path separator)
        if os.path.normpath(file_name) == 'index.html':
            continue

        file_size = os.path.getsize(file_name)

        # Path relative to the current directory with forward slashes, so the
        # link has no "./" segment; percent-encode it for use in a URL
        relative_key = os.path.relpath(file_name, ".").replace(os.sep, "/")
        link_url = html.escape(f"{website}/{quote(relative_key)}", quote=True)

        # Escape the visible name so special characters cannot break the markup
        link_text = html.escape(file_name)

        # Two decimals: whole-megabyte truncation showed "0 MB" for every
        # file smaller than 1 MiB
        size_text = f"{file_size / 1048576:.2f} MB"

        f.write(f'<tr><td><a href="{link_url}" target="_blank" rel="noopener noreferrer">{link_text}</a></td><td>{size_text}</td></tr>\n')

    # Write the footer and close the file
    f.write('</table></body></html>')

# Upload only the .html files below the current directory to the bucket
command = ["aws", "s3", "sync", ".", f"s3://{bucket}", "--exclude", "*", "--include", "*.html", "--no-progress"]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the output
print(output.stdout)

# stderr is captured too, so report failures instead of silently dropping them
if output.returncode != 0:
    print(f"aws s3 sync failed with exit code {output.returncode}:")
    print(output.stderr)

print('fin')

upload: 31-NMF-with-Sentences/nmf_using_sentences_coherence_scores_table.html to s3://praxis-2023-html-output/31-NMF-with-Sentences/nmf_using_sentences_coherence_scores_table.html
upload: 31-NMF-with-Sentences/nmf_using_sentences_topics_and_words_table.html to s3://praxis-2023-html-output/31-NMF-with-Sentences/nmf_using_sentences_topics_and_words_table.html
upload: ./index.html to s3://praxis-2023-html-output/index.html

fin
