In [1]:
import nltk
from nltk.corpus import stopwords

In [2]:
import pandas as pd

# Identify the data directory, working directory, and data files
data_directory = './2020_clean'
working_directory = './2020_NLP'
data_file = f"{data_directory}/2020_data_clean.csv"

import os

# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [3]:
# Read the data into a pandas dataframe
data = pd.read_csv(data_file, # The data file being read, from the variable assignment above
                   on_bad_lines='warn', # This tells Pandas to only warn on bad lines vs causing an error
                   dtype = 'str')       # This tells Pandas to treat all numbers as words

In [4]:
# How big is the dataset
data.shape

(276350, 6)

# Pre-processing Text Data

1. Remove punctuation
2. Tokenization
2. Remove stopwords

## Remove Punctuation

In [5]:
#Remove Punctuation
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Define a function to remove punctuation in the data
def remove_punctuation(text):
    text = "".join([character for character in text if character not in string.punctuation])
    return text
data['text_clean'] = data['FOI_TEXT'].apply(lambda x: remove_punctuation(x))

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,text_clean
0,734,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...
1,742,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3191,Appropriate Term/Code Not Available,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...
2,743,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...
3,751,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...
4,759,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...


## Expanding Contraction and Tokenize

In [8]:
import contractions

# This approach takes the FOI_TEXT as a string and creates a new column with tokens
# That is, it removes the contraction _and_ tokenizes at the same time
# no additional function is needed, x.split tokenizes the string (FOI text) at every space

data['no_contractions'] = data['FOI_TEXT'].apply(lambda x: [contractions.fix(word) for word in x.split()])
data.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,text_clean,no_contractions
0,734,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
1,742,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3191,Appropriate Term/Code Not Available,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
2,743,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
3,751,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
4,759,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[IT, WAS, REPORTED, THAT, A, WARM, UP, RESTART..."


In [9]:
# Define a function to split our sentences into a list of words
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

#data['text_tokenized'] = data['text_clean'].apply(lambda x: tokenize(x.lower()))

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,text_clean,no_contractions
0,734,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
1,742,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3191,Appropriate Term/Code Not Available,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
2,743,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
3,751,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
4,759,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[IT, WAS, REPORTED, THAT, A, WARM, UP, RESTART..."


In [11]:
data.loc[:,['FOI_TEXT', 'text_clean', 'no_contractions']].head()

Unnamed: 0,FOI_TEXT,text_clean,no_contractions
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[IT, WAS, REPORTED, THAT, A, FAILED, TRANSMITT..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[IT, WAS, REPORTED, THAT, A, LOSS, OF, CONNECT..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[IT, WAS, REPORTED, THAT, A, WARM, UP, RESTART..."


## Remove Stopwords

In [12]:
# Remove Stopwords
stopwords = nltk.corpus.stopwords.words('english')

stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text):
    text = [word.lower() for word in tokenized_text if word.lower() not in stopwords]
    return text

data['text_nostop'] = data['no_contractions'].apply(lambda x: remove_stopwords(x))

In [14]:
data.loc[:,['FOI_TEXT', 'text_nostop']].head()

Unnamed: 0,FOI_TEXT,text_nostop
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred., review..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported., rev..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported., rev..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred., determ..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[reported, warm, restarted, sensor, session., ..."


In [15]:
# Parts of Speech Tagging (POS)

#def pos_tag(tagged_text):
    #return nltk.pos_tag(tagged_text)

data['pos_text'] = data['text_nostop'].apply(lambda x: nltk.pos_tag(x))


In [16]:
data.loc[:,['FOI_TEXT', 'pos_text']].head()

Unnamed: 0,FOI_TEXT,pos_text
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[(reported, VBN), (loss, NN), (connection, NN)..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[(reported, VBN), (failed, VBD), (transmitter,..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[(reported, VBN), (failed, VBD), (transmitter,..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[(reported, VBN), (loss, NN), (connection, NN)..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[(reported, VBN), (warm, JJ), (restarted, VBN)..."


# lemmatization

In [17]:
#Use the wordnet Library to map words to their Lemmatization form

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['text_nostop'].apply(lambda x:[lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Michael.Jenkins/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
data.loc[:,['FOI_TEXT', 'lemmatized']].head()

Unnamed: 0,FOI_TEXT,lemmatized
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred., review..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported., rev..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported., rev..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred., determ..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[reported, warm, restarted, sensor, session., ..."
