# NMF

In [1]:
import pandas as pd
import os
import csv

# Identify the working directory and data files
working_directory = './26-NMF'

# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [2]:
! pip install --quiet --upgrade nltk

In [3]:
! python -m nltk.downloader --quiet 'all'



In [4]:
# Readthe preprocessed data into a dataframe
import pickle

with open('./21-data_preprocessing/dataframe.pickle', 'rb') as f:
    df = pickle.load(f)

In [5]:
df.shape

(5736, 20)

In [6]:
df.head(4)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE,TOKENIZED_TEXT,NOPUNCT_TEXT,NOSTOPWORDS_TEXT,NODIGITS_TEXT,POS_TEXT,LEMMATIZED_TEXT,STEMMED_TEXT
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER,"[it, was, reported, that, the, transmitter, lo...","[it, was, reported, that, the, transmitter, lo...","[reported, transmitter, lost, connection, pump...","[reported, transmitter, lost, connection, pump...","[(reported, VBN), (transmitter, NN), (lost, VB...","[report, transmitter, lose, connection, pump, ...","[report, transmitt, lost, connect, pump, great..."
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,06/05/2020,0,,I,CONSUMER,"[it, was, reported, that, signal, loss, over, ...","[it, was, reported, that, signal, loss, over, ...","[reported, signal, loss, one, hour, occurred, ...","[reported, signal, loss, one, hour, occurred, ...","[(reported, VBN), (signal, JJ), (loss, NN), (o...","[report, signal, loss, one, hour, occur, produ...","[report, signal, loss, one, hour, occur, produ..."
2,2609625,IT WAS REPORTED THAT TRANSMITTER FAILED ERROR ...,1435,No Device Output,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,10/05/2020,0,,I,CONSUMER,"[it, was, reported, that, transmitter, failed,...","[it, was, reported, that, transmitter, failed,...","[reported, transmitter, failed, error, occurre...","[reported, transmitter, failed, error, occurre...","[(reported, VBN), (transmitter, NN), (failed, ...","[report, transmitter, fail, error, occur, data...","[report, transmitt, fail, error, occur, data, ..."
3,2813837,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,10/23/2020,0,,I,CONSUMER,"[it, was, reported, that, signal, loss, over, ...","[it, was, reported, that, signal, loss, over, ...","[reported, signal, loss, one, hour, occurred, ...","[reported, signal, loss, one, hour, occurred, ...","[(reported, VBN), (signal, JJ), (loss, NN), (o...","[report, signal, loss, one, hour, occur, revie...","[report, signal, loss, one, hour, occur, revie..."


In [7]:
bag_of_words_file = "./21-data_preprocessing/bag_of_words_data.csv"

# Read the data into a pandas dataframe
bow_df = pd.read_csv(bag_of_words_file, # The data file being read, from the variable assignment above
                 on_bad_lines='warn',           # This tells Pandas to only warn on bad lines vs causing an error
                 dtype='str')                   # This tells Pandas to treat all numbers as words

bow_df.fillna("", inplace=True)

In [8]:
max_topics = 15

In [9]:
bow_df.shape

(5736, 922)

In [10]:
bow_df.head(4)

Unnamed: 0,abbott,abdomen,abdominal,aberration,able,accessory,accuracy,accurate,acetaminophen,actually,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.decomposition import NMF

num_topics = 15 # or the number of topics you want to extract
nmf_model = NMF(n_components=num_topics)
nmf_model.fit(bow_df)
doc_topic_matrix = nmf_model.transform(bow_df)
topic_word_matrix = nmf_model.components_
num_top_words = 10 # or the number of top words you want to display for each topic



In [12]:
doc_topic_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00507946, 0.23390649,
        0.        ],
       [0.2013439 , 0.00220656, 0.        , ..., 0.        , 0.01352555,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.23885901,
        0.        ],
       [0.2013439 , 0.00220656, 0.        , ..., 0.        , 0.01352555,
        0.        ],
       [0.        , 0.        , 0.07333078, ..., 0.00661048, 0.18055754,
        0.        ]])

In [13]:
topic_word_matrix

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.02653772e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.67378213e-04, 0.00000000e+00, 2.35078607e-05, ...,
        0.00000000e+00, 5.71510480e-04, 0.00000000e+00],
       [0.00000000e+00, 1.59056010e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [14]:
terms = bow_df.columns
len(terms)

922

In [17]:
num_top_words = 10 # or the number of top words you want to display for each topic
for topic_idx, topic in enumerate(topic_word_matrix):
    print("\nTopic #%d:" % topic_idx)
    print(" ".join([terms[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))


Topic #0:
report data confirm evaluate probable determine injury intervention medical allegation

Topic #1:
read bg mgdl cgm meter glucose reportedly customer inaccuracy monitor

Topic #2:
within share log report investigation review find window perform confirm

Topic #3:
transmitter fail error report battery low occur allegation medical intervention

Topic #4:
new evaluation submit return product become information available relevant supplemental

Topic #5:
perform pass inspection visual test voltage product evaluate external pair

Topic #6:
glucose report value data parkes grid fall within zone error

Topic #7:
transmitter customer could root replacement send determine additional information available

Topic #8:
device receive however report yet expect submit return supplemental evaluation

Topic #9:
sensor report expiration early patient alert replace insert b6 occur

Topic #10:
transmitter connection pump patient additional information report available lose event

Topic #11:
signa

In [18]:
%pip install --upgrade --quiet gensim

Note: you may need to restart the kernel to use updated packages.


In [38]:
texts = [word for row in df['LEMMATIZED_TEXT'] for word in row]

In [27]:
from gensim import corpora

# Create a dictionary
# In gensim a dictionary is a mapping between words and their integer id
dictionary = corpora.Dictionary([texts])

# Filter out extremes to limit the number of features
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)