# Preprocess Combined Data
Using the combined QBJ data as input, perform the following preprocessing steps:

- Expand Contractions, Tokenize, and Convert to Lowercase
- Remove Punctuation
- Remove Stop Words
- Parts of Speech (POS) Tagging
- Lemmatize
- Create Bag of Words (BOW)
- Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

_Run [Notebook 07-Install-NLTK](./07-Install-NLTK.ipynb) as needed to set up NLTK._

In [1]:
import pandas as pd
import os
import csv

# Directory that receives every file this notebook writes
working_directory = './16-data_preprocessing'

# Ensure the directory exists; exist_ok=True makes this a no-op when it is
# already present. An OSError is reported with print() rather than raised.
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as exc:
    print(f"Error creating {working_directory}: {exc}")

In [2]:
# Location of the combined data (a CSV file)
data_file = './15-data_combination/qbj_data_combined.csv'

# Options passed to pandas when reading the file
read_options = {
    'on_bad_lines': 'warn',  # only warn on bad lines instead of raising an error
    'dtype': 'str',          # treat every value, including numbers, as text
}

# Read the data into a pandas dataframe
df = pd.read_csv(data_file, **read_options)

In [3]:
# Dimensions of the loaded dataframe as (rows, columns)
df.shape

(5736, 13)

In [4]:
# Preview the first two rows to check the columns and values that were read
df.head(2)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,06/05/2020,0,,I,CONSUMER


## Load the Natural Language Toolkit (NLTK) and Preprocessing Libraries

In [5]:
# Import the NLTK library; If this step fails, rerun 07-Install-NLTK.ipynb
import nltk
import string
import contractions

## Expand Contractions, Tokenize, and Convert to Lowercase

In [6]:
# Expand contractions, convert to lowercase, and tokenize FOI_TEXT into a new column.
#
# Contractions are expanded on the whole string *before* it is split. Expanding
# word by word after the split leaves each expansion as one multi-word token
# ("don't" becomes the single token "do not"), which never matches an entry in a
# list of single stop words.
def expand_and_tokenize(text):
    """Return `text` as a list of lowercase, whitespace-delimited tokens.

    Contractions are expanded with contractions.fix() first, so every word of
    an expansion becomes its own token. str.split() with no argument splits on
    any run of whitespace, so an empty string yields an empty list.
    """
    return contractions.fix(text).lower().split()

# fillna('') turns a missing FOI_TEXT into an empty token list instead of
# raising AttributeError when .split() is attempted on a float NaN
df['TOKENIZED_TEXT'] = df['FOI_TEXT'].fillna('').apply(expand_and_tokenize)
df['TOKENIZED_TEXT']

0       [it, was, reported, that, the, transmitter, lo...
1       [it, was, reported, that, signal, loss, over, ...
2       [it, was, reported, that, transmitter, failed,...
3       [it, was, reported, that, signal, loss, over, ...
4       [it, was, reported, that, signal, loss, over, ...
                              ...                        
5731    [it, was, reported, that, a, transmitter, fail...
5732    [it, was, reported, that, signal, loss, over, ...
5733    [it, was, reported, that, transmitter, failed,...
5734    [it, was, reported, that, a, transmitter, fail...
5735    [it, was, reported, that, signal, loss, over, ...
Name: TOKENIZED_TEXT, Length: 5736, dtype: object

In [7]:
# Inspect the token list of the row labeled 0
df['TOKENIZED_TEXT'][0]

['it',
 'was',
 'reported',
 'that',
 'the',
 'transmitter',
 'lost',
 'connection',
 'with',
 'the',
 'pump',
 'for',
 'greater',
 'than',
 '1',
 'hour.',
 'the',
 'transmitter',
 'ultimately',
 'regained',
 'connection',
 'with',
 'the',
 'pump.',
 'no',
 'additional',
 'patient',
 'or',
 'event',
 'information',
 'was',
 'available.']

## Remove Punctuation

In [8]:
# Show the characters that the string module classifies as punctuation
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return the string `text` with all string.punctuation characters deleted."""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every token, then drop tokens that end up empty:
# a token made only of punctuation (for example "-") would otherwise stay in
# the list as '' and be carried through the later steps as a meaningless token
df['CLEAN_TEXT'] = df['TOKENIZED_TEXT'].apply(
    lambda tokens: [cleaned for cleaned in map(remove_punctuation, tokens) if cleaned]
)
df['CLEAN_TEXT']

0       [it, was, reported, that, the, transmitter, lo...
1       [it, was, reported, that, signal, loss, over, ...
2       [it, was, reported, that, transmitter, failed,...
3       [it, was, reported, that, signal, loss, over, ...
4       [it, was, reported, that, signal, loss, over, ...
                              ...                        
5731    [it, was, reported, that, a, transmitter, fail...
5732    [it, was, reported, that, signal, loss, over, ...
5733    [it, was, reported, that, transmitter, failed,...
5734    [it, was, reported, that, a, transmitter, fail...
5735    [it, was, reported, that, signal, loss, over, ...
Name: CLEAN_TEXT, Length: 5736, dtype: object

In [10]:
# Inspect the punctuation-free tokens of the row labeled 0
df['CLEAN_TEXT'][0]

['it',
 'was',
 'reported',
 'that',
 'the',
 'transmitter',
 'lost',
 'connection',
 'with',
 'the',
 'pump',
 'for',
 'greater',
 'than',
 '1',
 'hour',
 'the',
 'transmitter',
 'ultimately',
 'regained',
 'connection',
 'with',
 'the',
 'pump',
 'no',
 'additional',
 'patient',
 'or',
 'event',
 'information',
 'was',
 'available']

## Remove Stop Words

In [11]:
# NLTK's English stop word list, plus a frozenset copy so that each membership
# test is a hash lookup instead of a scan through the list
stopwords = nltk.corpus.stopwords.words('english')
stopword_lookup = frozenset(stopwords)

def remove_stopwords(tokenized_text):
    """Return the tokens of `tokenized_text` that are not English stop words.

    Each token is lowercased only for the comparison; the tokens that are kept
    are returned unchanged and in their original order.
    """
    return [word for word in tokenized_text if word.lower() not in stopword_lookup]

# Filter the stop words out of every row's cleaned token list
df['NOSTOPWORD_TEXT'] = df['CLEAN_TEXT'].apply(remove_stopwords)
df['NOSTOPWORD_TEXT']

0       [reported, transmitter, lost, connection, pump...
1       [reported, signal, loss, one, hour, occurred, ...
2       [reported, transmitter, failed, error, occurre...
3       [reported, signal, loss, one, hour, occurred, ...
4       [reported, signal, loss, one, hour, occurred, ...
                              ...                        
5731    [reported, transmitter, failed, error, occurre...
5732    [reported, signal, loss, one, hour, occurred, ...
5733    [reported, transmitter, failed, error, occurre...
5734    [reported, transmitter, failed, error, occurre...
5735    [reported, signal, loss, one, hour, occurred, ...
Name: NOSTOPWORD_TEXT, Length: 5736, dtype: object

In [12]:
# Inspect the tokens of the row labeled 0 after stop word removal
df['NOSTOPWORD_TEXT'][0]

['reported',
 'transmitter',
 'lost',
 'connection',
 'pump',
 'greater',
 '1',
 'hour',
 'transmitter',
 'ultimately',
 'regained',
 'connection',
 'pump',
 'additional',
 'patient',
 'event',
 'information',
 'available']

## Parts of Speech (POS) Tagging

In [13]:
# Tag every row of the NOSTOPWORD_TEXT column with part-of-speech tags.
# nltk.pos_tag_sents() is the function NLTK documents for efficiently tagging
# more than one sentence, so it is used instead of one nltk.pos_tag() call per row.
# NOTE(review): the tagger sees text whose stop words were already removed, which
# may reduce tagging accuracy -- consider tagging CLEAN_TEXT and filtering afterwards.
tagged_rows = nltk.pos_tag_sents(df['NOSTOPWORD_TEXT'].tolist())

# Wrap the result in a Series that carries df's index so rows stay aligned
df['POS_TEXT'] = pd.Series(tagged_rows, index=df.index)
df['POS_TEXT']

0       [(reported, VBN), (transmitter, NN), (lost, VB...
1       [(reported, VBN), (signal, JJ), (loss, NN), (o...
2       [(reported, VBN), (transmitter, NN), (failed, ...
3       [(reported, VBN), (signal, JJ), (loss, NN), (o...
4       [(reported, VBN), (signal, JJ), (loss, NN), (o...
                              ...                        
5731    [(reported, VBN), (transmitter, NN), (failed, ...
5732    [(reported, VBN), (signal, JJ), (loss, NN), (o...
5733    [(reported, VBN), (transmitter, NN), (failed, ...
5734    [(reported, VBN), (transmitter, NN), (failed, ...
5735    [(reported, VBN), (signal, JJ), (loss, NN), (o...
Name: POS_TEXT, Length: 5736, dtype: object

In [14]:
# Inspect the (word, POS tag) pairs of the row labeled 0
df['POS_TEXT'][0]

[('reported', 'VBN'),
 ('transmitter', 'NN'),
 ('lost', 'VBN'),
 ('connection', 'NN'),
 ('pump', 'NN'),
 ('greater', 'JJR'),
 ('1', 'CD'),
 ('hour', 'NN'),
 ('transmitter', 'NN'),
 ('ultimately', 'RB'),
 ('regained', 'VBD'),
 ('connection', 'NN'),
 ('pump', 'NN'),
 ('additional', 'JJ'),
 ('patient', 'NN'),
 ('event', 'NN'),
 ('information', 'NN'),
 ('available', 'JJ')]

## Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer

# Fallback WordNet part of speech, Nouns ('n') or Verbs ('v'), for tokens whose
# POS tag is not a noun, verb, adverb, or adjective tag.
# NOTE(review): this setting used to be "v" but was never read -- lemmatize_text
# hard-coded 'n' as the fallback. It is now passed to lemmatize_text and set to
# "n" so the lemmas are unchanged; set it to "v" to fall back to verbs instead.
wordnet_preference = "n"

# Map the first letter of NLTK's POS tags to WordNet's POS tags
POS_TAG_TO_WORDNET = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}

def lemmatize_text(pos_tagged_text, default_pos='n'):
    """Lemmatize each word of a POS-tagged token list.

    Args:
        pos_tagged_text: iterable of (word, pos_tag) pairs.
        default_pos: WordNet POS used when the first letter of the tag is not
            N, V, R, or J, or when the tag is an empty string.

    Returns:
        A list with one lemma per input pair, in the same order.
    """
    lemmatizer = WordNetLemmatizer()

    # pos[:1] (rather than pos[0]) yields '' for an empty tag, which falls
    # through to default_pos instead of raising IndexError
    return [
        lemmatizer.lemmatize(word, pos=POS_TAG_TO_WORDNET.get(pos[:1], default_pos))
        for word, pos in pos_tagged_text
    ]

# apply the lemmatize_text function to each row of the dataframe
df['LEMMATIZED_TEXT'] = df['POS_TEXT'].apply(lemmatize_text, default_pos=wordnet_preference)
df['LEMMATIZED_TEXT']

0       [report, transmitter, lose, connection, pump, ...
1       [report, signal, loss, one, hour, occur, produ...
2       [report, transmitter, fail, error, occur, data...
3       [report, signal, loss, one, hour, occur, revie...
4       [report, signal, loss, one, hour, occur, produ...
                              ...                        
5731    [report, transmitter, fail, error, occur, data...
5732    [report, signal, loss, one, hour, occur, indic...
5733    [report, transmitter, fail, error, occur, prod...
5734    [report, transmitter, fail, error, occur, data...
5735    [report, signal, loss, one, hour, occur, data,...
Name: LEMMATIZED_TEXT, Length: 5736, dtype: object

In [21]:
# Index label of the row to inspect
row_check = 9

# Column titles of the word / POS tag / lemma comparison table
word_header = "Word"
pos_header = "POS Tag"
lemma_header = "Lemma"
print(f"{word_header:<{16}}{pos_header:<{8}}{lemma_header}")
print("-"*15, "-"*7, "-"*16)

# Walk the tagged tokens and their lemmas side by side for the chosen row
tagged_tokens = df['POS_TEXT'][row_check]
lemmas = df['LEMMATIZED_TEXT'][row_check]
for (original_word, pos_tag), lemma in zip(tagged_tokens, lemmas):
    print(f"{original_word:<{16}}{pos_tag:<{8}}{lemma}")


Word            POS Tag Lemma
--------------- ------- ----------------
reported        VBN     report
signal          JJ      signal
loss            NN      loss
one             CD      one
hour            NN      hour
occurred        VBD     occur
product         NN      product
data            NNS     data
provided        VBD     provide
evaluation      NN      evaluation
confirmation    NN      confirmation
allegation      NN      allegation
probable        NN      probable
could           MD      could
determined      VB      determine
injury          VB      injury
medical         JJ      medical
intervention    NN      intervention
reported        VBD     report


## Create Bag of Words (BOW)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Join each row's lemmas back into one space-separated string; the vectorizer
# takes one string per document
lemmatized_documents = df['LEMMATIZED_TEXT'].apply(' '.join)

# create a CountVectorizer object, fit it to the text data, and build the
# bag of words matrix in a single pass
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(lemmatized_documents)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it
if hasattr(count_vectorizer, 'get_feature_names_out'):
    bow_feature_names = count_vectorizer.get_feature_names_out()
else:
    bow_feature_names = count_vectorizer.get_feature_names()

# convert the bag of words matrix to a DataFrame with one column per term
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_feature_names)

bow_df.shape



(5736, 1197)

In [30]:
# Preview the first rows of the bag of words counts
bow_df.head()

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join each row's lemmas back into one space-separated string; the vectorizer
# takes one string per document
tfidf_documents = df['LEMMATIZED_TEXT'].apply(' '.join)

# create a TfidfVectorizer object and fit it to the text data
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(tfidf_documents)

# The column labels must come from tfidf_vectorizer, the object fitted above
# (the name `vectorizer` is not defined anywhere in this notebook).
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# convert the sparse matrix to a DataFrame with one column per term
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_feature_names)
tfidf_df.shape



(5736, 1197)

In [32]:
# Preview the first rows of the TF-IDF weights
tfidf_df.head()

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save the Preprocessed Data

In [45]:
# Output file name -> dataframe written to it; every file goes into
# working_directory and is written without the index column
output_frames = {
    "preprocessed_data.csv": df,
    "bag_of_words_data.csv": bow_df,
    "tfidf_data.csv": tfidf_df,
}
for file_name, frame in output_frames.items():
    frame.to_csv(f"{working_directory}/{file_name}", index=False)

## Upload All Output to an S3 Bucket

In [48]:
import os
import subprocess

# Create the upload command using the AWS command line interface
# NOTE(review): the "--exclude" pattern begins with "*/"; confirm that it also
# matches a .ipynb_checkpoints folder located directly inside working_directory
command = [
    "aws", "s3", "sync", working_directory, "s3://praxis-2023-html-output",
    "--exclude", "*/.ipynb_checkpoints/*",
    "--no-progress",
]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the command's output
print(output.stdout)

# Surface failures: stderr is captured too, so without this check a failed
# upload would print nothing and look the same as a sync with nothing to copy
if output.returncode != 0:
    print(f"Upload failed with exit code {output.returncode}:")
    print(output.stderr)


