In [1]:
import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
import pandas as pd

# Identify the data directory, working directory, and data files.
# data_directory holds the cleaned input CSV; working_directory is the
# folder this notebook creates below.
data_directory = './2020_clean'
working_directory = './2020_NLP'
data_file = f"{data_directory}/2020_data_clean.csv"

import os

# Create the working directory if needed.
# exist_ok=True makes an already-existing directory a no-op. Any other OSError
# is only printed, not re-raised, so execution continues past this cell even
# when the directory could not be created.
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [4]:
# Load the cleaned CSV (path held in data_file) into a pandas DataFrame
data = pd.read_csv(
    data_file,
    on_bad_lines='warn',  # malformed rows produce a warning and are skipped instead of raising
    dtype='str',          # read every column as text so numeric-looking codes are not converted to numbers
)

In [5]:
# How big is the dataset
data.shape

(276350, 6)

# Pre-processing Text Data

1. Remove punctuation
2. Tokenization
3. Remove stopwords

In [6]:
#Remove Punctuation
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Define a function to remove punctuation in the data
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single C-level pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        The raw text to clean. Non-string values (for example the NaN that
        pandas uses for a missing cell, or None) are treated as empty text.

    Returns
    -------
    str
        ``text`` without any character from ``string.punctuation``; an empty
        string when ``text`` is not a string.

    Notes
    -----
    Punctuation is deleted, not replaced by a space, so words joined only by
    punctuation (e.g. ``"warm-up"``) are merged (``"warmup"``).
    """
    # Missing values would otherwise raise TypeError when iterated.
    if not isinstance(text, str):
        return ""
    return text.translate(_PUNCTUATION_TABLE)
# Strip punctuation from every report narrative; the function is passed directly, no lambda wrapper needed
data['text_clean'] = data['FOI_TEXT'].apply(remove_punctuation)

In [13]:
data.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,text_clean
0,734,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...
1,742,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3191,Appropriate Term/Code Not Available,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...
2,743,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...
3,751,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...
4,759,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...


In [14]:
# Define a function to split our sentences into a list of words
import re

def tokenize(text):
    """Split ``text`` into a list of word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore).

    Parameters
    ----------
    text : str
        The text to split. Case is not changed here; callers lower-case first
        if they need case-insensitive tokens.

    Returns
    -------
    list of str
        The non-empty tokens in their original order. Empty or
        separator-only input yields an empty list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    pieces = re.split(r'\W+', text)
    # re.split returns '' for empty input and when the text starts or ends
    # with a separator; those are not words, so drop them.
    return [piece for piece in pieces if piece]

# Lower-case each cleaned narrative, then split it into word tokens
data['text_tokenized'] = data['text_clean'].map(lambda cleaned: tokenize(cleaned.lower()))

In [15]:
data.head()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,text_clean,text_tokenized
0,734,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[it, was, reported, that, a, loss, of, connect..."
1,742,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3191,Appropriate Term/Code Not Available,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[it, was, reported, that, a, failed, transmitt..."
2,743,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[it, was, reported, that, a, failed, transmitt..."
3,751,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[it, was, reported, that, a, loss, of, connect..."
4,759,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[it, was, reported, that, a, warm, up, restart..."


In [17]:
data.loc[:,['FOI_TEXT', 'text_clean', 'text_tokenized']].head()

Unnamed: 0,FOI_TEXT,text_clean,text_tokenized
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[it, was, reported, that, a, loss, of, connect..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[it, was, reported, that, a, failed, transmitt..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[it, was, reported, that, a, failed, transmitt..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[it, was, reported, that, a, loss, of, connect..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[it, was, reported, that, a, warm, up, restart..."


In [18]:
# Remove Stopwords
# NOTE: this assignment rebinds the name `stopwords` (imported near the top of
# the notebook via `from nltk.corpus import stopwords`) to the English
# stop-word collection returned by .words('english'). The right-hand side uses
# the fully qualified nltk.corpus.stopwords path, so it does not read that name.

stopwords = nltk.corpus.stopwords.words('english')

In [19]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text, stop_words=None):
    """Return the tokens of ``tokenized_text`` that are not stop words.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to filter. Matching is exact, so tokens should already be in
        the same case as the stop words.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords``
        collection is used, which preserves the original behaviour.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    if stop_words is None:
        # Default to the notebook-level collection.
        stop_words = stopwords
    # Hash-based membership instead of scanning a list once per token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [word for word in tokenized_text if word not in stop_words]

# Filter the stop words out of every token list; the function is passed directly, no lambda wrapper needed
data['text_nostop'] = data['text_tokenized'].apply(remove_stopwords)

In [20]:
data.loc[:,['FOI_TEXT', 'text_nostop']].head()

Unnamed: 0,FOI_TEXT,text_nostop
0,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred, review,..."
1,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported, revi..."
2,IT WAS REPORTED THAT A FAILED TRANSMITTER WAS ...,"[reported, failed, transmitter, reported, revi..."
3,IT WAS REPORTED THAT A LOSS OF CONNECTION OCCU...,"[reported, loss, connection, occurred, determi..."
4,IT WAS REPORTED THAT A WARM UP RESTARTED DURIN...,"[reported, warm, restarted, sensor, session, d..."


In [None]:
import contractions

def expand_contractons(text):
    text = 