In [1]:
import nltk
from nltk.corpus import stopwords

In [2]:
import pandas as pd

# Locations of the cleaned 2020 input data and of this notebook's own outputs
data_directory = "./06-Clean-2020-Data"
working_directory = "./08-NLP-2020"
data_file = f"{data_directory}/2020_data_clean.csv"

import os

# Make sure the output folder exists; an OS-level failure is reported but
# deliberately does not stop the notebook
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as makedirs_error:
    print(f"Error creating {working_directory}: {makedirs_error}")

In [3]:
# Load the CSV with every column typed as text, then replace missing values
# with empty strings so later string operations never meet NaN
df = pd.read_csv(
    data_file,  # path assembled in the configuration cell above
    on_bad_lines="warn",  # warn on malformed rows instead of raising an error
    dtype="str",  # read every value, including numbers, as a string
).fillna("")

In [4]:
# How big is the dataset
df.shape

(3487524, 13)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE
0,0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH"
1,1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH"
2,2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH"
3,3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH"
4,4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH"


# Pre-processing Text Data

1. Remove punctuation
2. Tokenization
3. Remove stopwords

## Remove Punctuation

In [6]:
# Remove Punctuation
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# Translation table that deletes every character in string.punctuation.
# Built once so it is not recreated for each row of the dataframe.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    The characters removed are exactly those in ``string.punctuation``.
    Nothing is inserted in their place, so words joined only by punctuation
    (for example ``"A-B"``) are merged (``"AB"``).

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` without punctuation; an empty string stays empty.
    """
    # str.translate does the deletion in one pass instead of testing each
    # character against string.punctuation in a Python-level loop.
    return text.translate(_PUNCTUATION_TABLE)


# Strip punctuation from every narrative and keep the result in a new column
df["text_clean"] = df["FOI_TEXT"].apply(remove_punctuation)

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE,text_clean
0,0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...
1,1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT EXPIRED THERE...
2,2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...
3,3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...
4,4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",COMMUNICATION FAILURE AND PREMATURE BATTERY DE...


## Expand Contractions and Tokenize

In [9]:
import contractions

# Build the token list for each narrative from FOI_TEXT in three steps:
#   1. expand contractions on the whole string -- this has to happen before
#      punctuation removal, which would strip the apostrophes it relies on;
#   2. strip punctuation with remove_punctuation (defined above) so tokens such
#      as "EXPIRED." become "EXPIRED" and can match stopwords and lemmas later;
#   3. split on whitespace to tokenize.
# Expanding the whole string rather than word by word keeps every token a
# single word: a multi-word expansion such as "do not" is split into two
# tokens instead of being stored as one token containing a space.
df["no_contractions"] = df["FOI_TEXT"].apply(
    lambda x: remove_punctuation(contractions.fix(x)).split()
)
df.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE,text_clean,no_contractions
0,0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,"[THE, RESULTS, OF, THE, INVESTIGATION, ARE, IN..."
1,1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT EXPIRED THERE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, EXPIRE..."
2,2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,"[INVESTIGATION, RESULTS, WILL, BE, PROVIDED, I..."
3,3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, CALLED..."
4,4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,"[COMMUNICATION, FAILURE, AND, PREMATURE, BATTE..."


In [10]:
# Define a function to split our sentences into a list of words
import re


def tokenize(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of word characters (regex ``\\w``); everything
    else acts as a separator. Unlike splitting on ``\\W+``, this never yields
    empty-string tokens when the text is empty or starts/ends with a
    separator.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in order of appearance; ``[]`` if there are none.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    return re.findall(r"\w+", text)


# Unused alternative: tokenize the punctuation-free text with the regex tokenizer above
# df["text_tokenized"] = df["text_clean"].apply(lambda x: tokenize(x.lower()))

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE,text_clean,no_contractions
0,0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,"[THE, RESULTS, OF, THE, INVESTIGATION, ARE, IN..."
1,1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS,5414734502085,5414734502085,12/12/2019,1,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT EXPIRED THERE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, EXPIRE..."
2,2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,"[INVESTIGATION, RESULTS, WILL, BE, PROVIDED, I..."
3,3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, CALLED..."
4,4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ,5414734504386,5414734504386,12/12/2019,0,,I,"COMPANY REPRESENTATIVE,HEALTH",COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,"[COMMUNICATION, FAILURE, AND, PREMATURE, BATTE..."


In [12]:
df.loc[:, ["FOI_TEXT", "text_clean", "no_contractions"]].head()

Unnamed: 0,FOI_TEXT,text_clean,no_contractions
0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,"[THE, RESULTS, OF, THE, INVESTIGATION, ARE, IN..."
1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,IT WAS REPORTED THAT THE PATIENT EXPIRED THERE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, EXPIRE..."
2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,"[INVESTIGATION, RESULTS, WILL, BE, PROVIDED, I..."
3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,"[IT, WAS, REPORTED, THAT, THE, PATIENT, CALLED..."
4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,"[COMMUNICATION, FAILURE, AND, PREMATURE, BATTE..."


## Remove Stopwords

In [13]:
# Remove Stopwords
# Load NLTK's English stopword list.
# NOTE(review): this rebinds the name `stopwords`, which the first cell imported
# from nltk.corpus; from here on `stopwords` is the plain list of words, and
# remove_stopwords below reads it as a global.
stopwords = nltk.corpus.stopwords.words("english")

stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text, stop_words=None):
    """Lowercase every token and drop the ones that are stopwords.

    Args:
        tokenized_text: Iterable of string tokens.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stopwords`` list is used, which is
            the original behaviour.

    Returns:
        list[str]: The lowercased tokens that are not stopwords, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership is constant-time; a list would be scanned once per token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    # Lowercase each token once rather than once for the test and again for
    # the output.
    lowered_tokens = (word.lower() for word in tokenized_text)
    return [token for token in lowered_tokens if token not in stop_words]


# Lowercase the tokens and filter out stopwords, one narrative at a time
df["text_nostop"] = df["no_contractions"].apply(remove_stopwords)

In [None]:
df.loc[:, ["FOI_TEXT", "text_nostop"]].head()

In [None]:
# Parts of Speech Tagging (POS)

# def pos_tag(tagged_text):
# return nltk.pos_tag(tagged_text)

# Tag each token list with NLTK's part-of-speech tagger
df["pos_text"] = df["text_nostop"].apply(nltk.pos_tag)

In [None]:
df.loc[:, ["FOI_TEXT", "pos_text"]].head()

# Lemmatization

In [None]:
# Map every remaining token to its WordNet lemma

nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# NOTE(review): lemmatize() is called with the word only, so no part of speech
# is supplied -- presumably every token is treated with the lemmatizer's
# default POS; confirm whether the tags in "pos_text" should be passed instead.
df["lemmatized"] = df["text_nostop"].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
df.loc[:, ["FOI_TEXT", "lemmatized"]].head()