# Preprocess Combined Data
Using the combined QBJ as input, perform data preprocessing including:

1. Expand Contractions, Tokenize, and Convert to Lowercase
1. Remove Punctuation
1. Remove Stop Words
1. Parts of Speech (POS) Tagging
1. Lemmatize
1. Stemming
1. Sentencize
1. Create Bag of Words (BOW)
1. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

_Run [Notebook 07-Install-NLTK](./07-Install-NLTK.ipynb) as needed to set up NLTK._

In [1]:
import pandas as pd
import os
import csv

# Identify the working directory and data files
working_directory = './16-data_preprocessing'

# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [2]:
# Readthe combined data into a dataframe
data_file = './15-data_combination/qbj_data_combined.csv'

# Read the data into a pandas dataframe
df = pd.read_csv(data_file,           # The data file being read, from the variable assignment above
                 on_bad_lines='warn', # This tells Pandas to only warn on bad lines vs causing an error
                 dtype='str')         # This tells Pandas to treat all numbers as words

In [3]:
df.shape

(5736, 13)

In [4]:
df.head(2)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,06/05/2020,0,,I,CONSUMER


## Assign a Row ID for Verification
Assign a value to a variable that identifies a row from the dataset.  

This will allow the same row to be used for verification of each preprocessing step.

In [5]:
verification_row = 9

## Load the Natural Language Toolkit (NLTK) and Preprocessing Libraries

In [6]:
# Import the NLTK library
import nltk # If this step fails, rerun 07-Install-NLTK.ipynb
import string
import contractions

## 1. Expand Contractions, Tokenize, and Convert to Lowercase

In [7]:
# This approach takes the FOI_TEXT as a string and creates a new column with tokens
# It removes contractions _and_ tokenizes at the same time
# No additional function is needed, x.split tokenizes the string (FOI text) at every space
# A call to lower() converts the word to lowercase

df['TOKENIZED_TEXT'] = df['FOI_TEXT'].apply(lambda x: [contractions.fix(word).lower() for word in x.split()])
df['TOKENIZED_TEXT'].head()

0    [it, was, reported, that, the, transmitter, lo...
1    [it, was, reported, that, signal, loss, over, ...
2    [it, was, reported, that, transmitter, failed,...
3    [it, was, reported, that, signal, loss, over, ...
4    [it, was, reported, that, signal, loss, over, ...
Name: TOKENIZED_TEXT, dtype: object

In [8]:
df['TOKENIZED_TEXT'][verification_row]

['it',
 'was',
 'reported',
 'that',
 'signal',
 'loss',
 'over',
 'one',
 'hour',
 'occurred.',
 'no',
 'product',
 'or',
 'data',
 'was',
 'provided',
 'for',
 'evaluation.',
 'confirmation',
 'of',
 'the',
 'allegation',
 'and',
 'a',
 'probable',
 'because',
 'could',
 'not',
 'be',
 'determined.',
 'no',
 'injury',
 'or',
 'medical',
 'intervention',
 'was',
 'reported.']

## 2. Remove Punctuation

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Define a function to remove punctuation in the data
def remove_punctuation(text):
    text = "".join([character for character in text if character not in string.punctuation])
    return text

df['NOPUNCT_TEXT'] = df['TOKENIZED_TEXT'].apply(lambda x: [remove_punctuation(word) for word in x])
df['NOPUNCT_TEXT'].head()

0    [it, was, reported, that, the, transmitter, lo...
1    [it, was, reported, that, signal, loss, over, ...
2    [it, was, reported, that, transmitter, failed,...
3    [it, was, reported, that, signal, loss, over, ...
4    [it, was, reported, that, signal, loss, over, ...
Name: NOPUNCT_TEXT, dtype: object

In [11]:
df['NOPUNCT_TEXT'][verification_row]

['it',
 'was',
 'reported',
 'that',
 'signal',
 'loss',
 'over',
 'one',
 'hour',
 'occurred',
 'no',
 'product',
 'or',
 'data',
 'was',
 'provided',
 'for',
 'evaluation',
 'confirmation',
 'of',
 'the',
 'allegation',
 'and',
 'a',
 'probable',
 'because',
 'could',
 'not',
 'be',
 'determined',
 'no',
 'injury',
 'or',
 'medical',
 'intervention',
 'was',
 'reported']

## 3. Remove Stop Words

In [12]:
stopwords = nltk.corpus.stopwords.words('english')

# Define a function to convert to lowercase and remove stopwords
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word.lower() not in stopwords]
    return text

df['NOSTOPWORD_TEXT'] = df['NOPUNCT_TEXT'].apply(lambda x: remove_stopwords(x))
df['NOSTOPWORD_TEXT'].head()

0    [reported, transmitter, lost, connection, pump...
1    [reported, signal, loss, one, hour, occurred, ...
2    [reported, transmitter, failed, error, occurre...
3    [reported, signal, loss, one, hour, occurred, ...
4    [reported, signal, loss, one, hour, occurred, ...
Name: NOSTOPWORD_TEXT, dtype: object

In [13]:
df['NOSTOPWORD_TEXT'][verification_row]

['reported',
 'signal',
 'loss',
 'one',
 'hour',
 'occurred',
 'product',
 'data',
 'provided',
 'evaluation',
 'confirmation',
 'allegation',
 'probable',
 'could',
 'determined',
 'injury',
 'medical',
 'intervention',
 'reported']

## 4. Parts of Speech (POS) Tagging

In [14]:
# Apply the nltk.pos_tag() function to each row of the TOKENIZED_TEXT column
# pos_tag returns a Tuple for each word consisting of the word and its classification
# TODO: List classifications and their abbreviations
df['POS_TEXT'] = df['NOSTOPWORD_TEXT'].apply(nltk.pos_tag)
df['POS_TEXT'].head()

0    [(reported, VBN), (transmitter, NN), (lost, VB...
1    [(reported, VBN), (signal, JJ), (loss, NN), (o...
2    [(reported, VBN), (transmitter, NN), (failed, ...
3    [(reported, VBN), (signal, JJ), (loss, NN), (o...
4    [(reported, VBN), (signal, JJ), (loss, NN), (o...
Name: POS_TEXT, dtype: object

In [15]:
df['POS_TEXT'][verification_row]

[('reported', 'VBN'),
 ('signal', 'JJ'),
 ('loss', 'NN'),
 ('one', 'CD'),
 ('hour', 'NN'),
 ('occurred', 'VBD'),
 ('product', 'NN'),
 ('data', 'NNS'),
 ('provided', 'VBD'),
 ('evaluation', 'NN'),
 ('confirmation', 'NN'),
 ('allegation', 'NN'),
 ('probable', 'NN'),
 ('could', 'MD'),
 ('determined', 'VB'),
 ('injury', 'VB'),
 ('medical', 'JJ'),
 ('intervention', 'NN'),
 ('reported', 'VBD')]

## 5. Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer

# define a function to lemmatize each word in a text list based on its POS tag
def lemmatize_text(pos_tagged_text):
    # initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # map NLTK's POS tags to WordNet's POS tags
    # TODO: list the abbreviations for WordNet's parts of speech
    pos_map = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    
    # lemmatize each word in the text list based on its POS tag
    lemmatized_text = []
    
    for word, pos in pos_tagged_text:
        
        # get the first character of the POS tag to use as the WordNet POS tag
        # 
        # Set the WordNetLemmatizer default to Nouns ('n') or Verbs ('v')
        #
        wn_pos = pos_map.get(pos[0], 'n') 
        
        # lemmatize the word and append it to the lemmatized text list
        lemmatized_word = lemmatizer.lemmatize(word, pos=wn_pos)
        lemmatized_text.append(lemmatized_word)
    
    # return the lemmatized text list
    return lemmatized_text

# apply the lemmatize_text function to each row of the dataframe
df['LEMMATIZED_TEXT'] = df['POS_TEXT'].apply(lemmatize_text)
df['LEMMATIZED_TEXT'].head()

0    [report, transmitter, lose, connection, pump, ...
1    [report, signal, loss, one, hour, occur, produ...
2    [report, transmitter, fail, error, occur, data...
3    [report, signal, loss, one, hour, occur, revie...
4    [report, signal, loss, one, hour, occur, produ...
Name: LEMMATIZED_TEXT, dtype: object

## 6. Stemming

In [17]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# define a function to stem each word in a text list based on its POS tag
def stem_words(pos_tagged_text):
    stemmer = PorterStemmer()
    
    stemmed_text = []
    
    for word, pos in pos_tagged_text:
        # stem the word and append it to the stemmed text list
        stemmed_word = stemmer.stem(word)
        stemmed_text.append(stemmed_word)
    
    # return the stemmed text list
    return stemmed_text

df['STEMMED_TEXT'] = df['POS_TEXT'].apply(stem_words)
df['STEMMED_TEXT'].head()

0    [report, transmitt, lost, connect, pump, great...
1    [report, signal, loss, one, hour, occur, produ...
2    [report, transmitt, fail, error, occur, data, ...
3    [report, signal, loss, one, hour, occur, revie...
4    [report, signal, loss, one, hour, occur, produ...
Name: STEMMED_TEXT, dtype: object

## Compare the results of lemmatization and stemming

In [18]:
# For verification, print a table showing the word, POS tag, the lemmatization result, and the stemming result
# TODO: Is the lemmatizer correctly lemmatizing words that end in 'ion', ie: evalutation -> evaluate
word_header = "Word"
pos_header = "POS Tag"
lemma_header = "Lemma"
stem_header = "Stem"
print(f"{word_header:<{16}}{pos_header:<{8}}{lemma_header:<{16}}{stem_header:<{16}}")
print("-"*15, "-"*7, "-"*16, "-"*16)

for i in range(len(df['POS_TEXT'][verification_row])):
    original_word = df['POS_TEXT'][verification_row][i][0]
    pos_tag = df['POS_TEXT'][verification_row][i][1]
    lemma = df['LEMMATIZED_TEXT'][verification_row][i]
    stem = df['STEMMED_TEXT'][verification_row][i]
    
    print(f"{original_word:<{16}}{pos_tag:<{8}}{lemma:<{16}}{stem}")


Word            POS Tag Lemma           Stem            
--------------- ------- ---------------- ----------------
reported        VBN     report          report
signal          JJ      signal          signal
loss            NN      loss            loss
one             CD      one             one
hour            NN      hour            hour
occurred        VBD     occur           occur
product         NN      product         product
data            NNS     data            data
provided        VBD     provide         provid
evaluation      NN      evaluation      evalu
confirmation    NN      confirmation    confirm
allegation      NN      allegation      alleg
probable        NN      probable        probabl
could           MD      could           could
determined      VB      determine       determin
injury          VB      injury          injuri
medical         JJ      medical         medic
intervention    NN      intervention    intervent
reported        VBD     report          repor

## 7. Sentencize

In [19]:
import nltk
from nltk.tokenize import sent_tokenize

# Define a function to tokenize the sentences
def split_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

df['SENTENCIZED_TEXT'] = df['FOI_TEXT'].apply(split_sentences)
df['SENTENCIZED_TEXT'].head()

0    [IT WAS REPORTED THAT THE TRANSMITTER LOST CON...
1    [IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOU...
2    [IT WAS REPORTED THAT TRANSMITTER FAILED ERROR...
3    [IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOU...
4    [IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOU...
Name: SENTENCIZED_TEXT, dtype: object

In [20]:
df['SENTENCIZED_TEXT'][verification_row]

['IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR OCCURRED.',
 'NO PRODUCT OR DATA WAS PROVIDED FOR EVALUATION.',
 'CONFIRMATION OF THE ALLEGATION AND A PROBABLE CAUSE COULD NOT BE DETERMINED.',
 'NO INJURY OR MEDICAL INTERVENTION WAS REPORTED.']

## 8. Create Bag of Words (BOW)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object
count_vectorizer = CountVectorizer()

# fit the vectorizer to the text data
count_vectorizer.fit(df['LEMMATIZED_TEXT'].apply(lambda x: ' '.join(x)))

# create a bag of words matrix
bow_matrix = count_vectorizer.transform(df['LEMMATIZED_TEXT'].apply(lambda x: ' '.join(x)))

# convert the bag of words matrix to a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=count_vectorizer.get_feature_names())

bow_df.shape



(5736, 1197)

In [22]:
bow_df.head()

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# TODO: Plot the BOW results (?)

## 9. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create a CountVectorizer object and fit it to the text data
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(df['LEMMATIZED_TEXT'].apply(lambda x: ' '.join(x)))

# convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
tfidf_df.shape



(5736, 1197)

In [25]:
tfidf_df.head()

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Review the preprocessed data

In [26]:
print("Dataframe columns:")
for col in df.columns:
    print(f"\t{col}")

Dataframe columns:
	ROW_ID
	FOI_TEXT
	DEVICE_PROBLEM_CODE
	DEVICE_PROBLEM_TEXT
	GENERIC_NAME
	DEVICE_REPORT_PRODUCT_CODE
	UDI-DI
	UDI-PUBLIC
	DATE_OF_EVENT
	REPORTER_OCCUPATION_CODE
	REPORT_DATE
	EVENT_LOCATION
	SOURCE_TYPE
	TOKENIZED_TEXT
	NOPUNCT_TEXT
	NOSTOPWORD_TEXT
	POS_TEXT
	LEMMATIZED_TEXT
	STEMMED_TEXT
	SENTENCIZED_TEXT


## Save the preproecssed data

In [27]:
df.to_csv(f"{working_directory}/preprocessed_data.csv", index=False)
bow_df.to_csv(f"{working_directory}/bag_of_words_data.csv", index=False)
tfidf_df.to_csv(f"{working_directory}/tfidf_data.csv", index=False)

## Upload All Output to an S3 Bucket

In [28]:
import os
import subprocess

# Create the upload command using the AWS command line interface
command = ["aws", "s3", "sync", working_directory, f"s3://praxis-2023-html-output", "--exclude", f"*/.ipynb_checkpoints/*", "--no-progress"]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the command's output
print(output.stdout)

upload: 16-data_preprocessing/bag_of_words_data.csv to s3://praxis-2023-html-output/bag_of_words_data.csv
upload: 16-data_preprocessing/tfidf_data.csv to s3://praxis-2023-html-output/tfidf_data.csv
upload: 16-data_preprocessing/preprocessed_data.csv to s3://praxis-2023-html-output/preprocessed_data.csv

