# Deploy the Model and Process QBJ Records from Year 2019

1. Create the working directory
1. Unzip the following data files into the working directory:

    - `./01-Download-MAUDE-Data/device2019.zip`
    - `./01-Download-MAUDE-Data/foitext2019.zip`
    
1. Read the data into dataframes
1. Remove any rows from `device2019` that DO NOT have `DEVICE_REPORT_PRODUCT_CODE == QBJ` 
1. Combine the data using the `MDR_REPORT_KEY` column
1. Sample 1% of the data

In [1]:
# Set up the zip and data files to use the selected year
import csv
import os
import pickle
import re
from zipfile import ZipFile

import nltk
import pandas as pd

# Imports for working with the model
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Year of MAUDE data to process; every file name below is derived from it
data_year = "2019"

# Names of the zip archives for the selected year
foitext_zip_file = f"foitext{data_year}.zip"
device_zip_file = f"device{data_year}.zip"

# Names of the text files read once the archives have been extracted
foitext_data_file = f"foitext{data_year}.txt"
device_data_file = f"device{data_year}.txt"

In [2]:
# Folder that holds the downloaded zip archives
data_directory = "./01-Download-MAUDE-Data"
# Folder the archives are extracted into and the predictions are written to
working_directory = f"./44-Deploy-XGB-Report-Classification-Model-{data_year}"


# Create the working directory if needed (exist_ok=True makes a rerun harmless)
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    # Report the directory that actually failed to be created
    print(f"Error creating {working_directory}: {error}")

In [3]:
zip_files = [foitext_zip_file, device_zip_file]

# Unzip the data files into the working directory
for zip_name in zip_files:
    print(f"Unzipping {zip_name}")
    # The handle is called "archive" so the zip() builtin is not rebound at
    # notebook scope, where it would break any later cell that calls zip(...)
    with ZipFile(f"{data_directory}/{zip_name}", "r") as archive:
        archive.extractall(working_directory)

print("Unzip complete.")

# Change file names in working directory to lower case so they match the
# lower-case data file names configured for the selected year
for file_name in os.listdir(working_directory):
    lower_name = file_name.lower()
    # Nothing to do when the name is already lower case
    if file_name != lower_name:
        os.rename(
            f"{working_directory}/{file_name}",
            f"{working_directory}/{lower_name}",
        )

Unzipping foitext2019.zip
Unzipping device2019.zip
Unzip complete.


In [4]:
# Load the pipe-delimited foitext file with every column read as a string
foitext_path = f"{working_directory}/{foitext_data_file}"
foitext_df = (
    pd.read_csv(
        foitext_path,
        sep="|",
        encoding="ISO-8859-1",
        on_bad_lines="warn",
        quoting=csv.QUOTE_NONE,
        dtype="str",
    )
    # Index the rows by report key
    .set_index("MDR_REPORT_KEY")
    # Missing values become empty strings, which later processing relies on
    .fillna("")
)

print(f"foitext data frame creation complete: {foitext_df.shape}")

foitext data frame creation complete: (3252822, 5)


In [5]:
# Columns of the device file that are dropped right after loading
unwanted_columns = [
    "DEVICE_EVENT_KEY",
    "IMPLANT_FLAG",
    "DATE_REMOVED_FLAG",
    "DATE_RECEIVED",
    "MANUFACTURER_D_ADDRESS_1",
    "MANUFACTURER_D_ADDRESS_2",
    "MANUFACTURER_D_CITY",
    "MANUFACTURER_D_STATE_CODE",
    "MANUFACTURER_D_ZIP_CODE",
    "MANUFACTURER_D_ZIP_CODE_EXT",
    "MANUFACTURER_D_COUNTRY_CODE",
    "MANUFACTURER_D_POSTAL_CODE",
    "DEVICE_OPERATOR",
    "EXPIRATION_DATE_OF_DEVICE",
    "CATALOG_NUMBER",
    "LOT_NUMBER",
    "OTHER_ID_NUMBER",
    "DATE_RETURNED_TO_MANUFACTURER",
    "DEVICE_AGE_TEXT",
    "DEVICE_EVALUATED_BY_MANUFACTUR",
    "COMBINATION_PRODUCT_FLAG",
]

# Load the pipe-delimited device file with every column read as a string
device_path = f"{working_directory}/{device_data_file}"
device_df = (
    pd.read_csv(
        device_path,
        sep="|",
        encoding="ISO-8859-1",
        on_bad_lines="warn",
        quoting=csv.QUOTE_NONE,
        dtype="str",
    )
    # Index the rows by report key
    .set_index("MDR_REPORT_KEY")
    # Discard the columns listed above
    .drop(columns=unwanted_columns)
    # Missing values become empty strings, which later processing relies on
    .fillna("")
)

print(f"device data frame creation complete (rows, columns): {device_df.shape}")

device data frame creation complete (rows, columns): (1333496, 9)


In [6]:
# Keep only the device rows reported under product code "QBJ"
is_qbj = device_df["DEVICE_REPORT_PRODUCT_CODE"] == "QBJ"
device_df = device_df.loc[is_qbj]

print(f"device data frame QBJ filter complete (rows, columns): {device_df.shape}")

device data frame QBJ filter complete (rows, columns): (80807, 9)


In [7]:
# Build the primary dataframe by joining the device and foitext dataframes on
# MDR_REPORT_KEY. The join is an inner join: only report keys present in both
# frames are kept, so every resulting row has FOI_TEXT available.
df = device_df.merge(foitext_df, on="MDR_REPORT_KEY", how="inner")

print(f"device and foitext merge complete (rows, columns): {df.shape}")

device and foitext merge complete (rows, columns): (192942, 14)


In [8]:
# Remove rows where FOI_TEXT contains the literal marker "(B)(4)".
# regex=False matches the text as-is, so no backslash escaping is needed; the
# previous non-raw pattern "\(B\)\(4\)" relied on invalid string escape
# sequences, which newer Python versions warn about.
has_b4_marker = df["FOI_TEXT"].str.contains("(B)(4)", regex=False)
df = df[~has_b4_marker]

print(f"data clean complete (rows, columns): {df.shape}")

data clean complete (rows, columns): (91217, 14)


In [9]:
# Columns that are no longer needed once the merge and filtering are done
columns_to_remove = [
    "BRAND_NAME",
    "DATE_REPORT",
    "DEVICE_AVAILABILITY",
    "DEVICE_REPORT_PRODUCT_CODE",
    "DEVICE_SEQUENCE_NO",
    "GENERIC_NAME",
    "MANUFACTURER_D_NAME",
    "MDR_TEXT_KEY",
    "MODEL_NUMBER",
    "PATIENT_SEQUENCE_NUMBER",
    "TEXT_TYPE_CODE",
    "UDI-DI",
    "UDI-PUBLIC",
]

# Drop them along the column axis
df = df.drop(columns_to_remove, axis=1)

In [10]:
# Size of a 1% sample, truncated to a whole number of rows
sample_fraction = 0.01
n = int(len(df) * sample_fraction)

# Draw the sample with a fixed seed so a rerun selects the same rows
sample_df = df.sample(n=n, random_state=42)

In [11]:
# Show the sample's dimensions as (rows, columns)
sample_df.shape

(912, 1)

In [None]:
# Porter stemmer instance used by preprocess_text
ps = PorterStemmer()

# Fetch the NLTK stopwords corpus needed for stopword filtering
nltk.download("stopwords")


# English stopwords, built once as a set. The stopwords corpus must already be
# available (nltk.download("stopwords") runs earlier in this cell). Looking the
# list up once, instead of once per token, avoids repeating the same work for
# every word and gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess_text(text):
    """
    Normalize a free-text value into a string of lower-case word stems.

    Steps applied, in order:
    - every run of non-letter characters (digits, punctuation, special
      characters) is replaced by a single space
    - the text is tokenized with nltk.word_tokenize
    - English stopwords are dropped (compared case-insensitively)
    - each remaining token is stemmed with the Porter stemmer `ps`
    - the stems are joined with single spaces, lower-cased and stripped

    Args:
        text: value to clean; it is converted with str() first.

    Returns:
        str: the cleaned text (empty when nothing survives the filtering).
    """
    # Only ASCII letters and single spaces remain after this substitution, so
    # the separate punctuation-stripping pass that used to follow could never
    # match anything and has been dropped.
    letters_only = re.sub("[^A-Za-z]+", " ", str(text))

    # NOTE(review): nltk.word_tokenize needs NLTK's tokenizer models; this
    # notebook only downloads the stopwords corpus - confirm they are installed.
    tokens = nltk.word_tokenize(letters_only)

    # Drop stopwords, then stem what is left
    stems = [
        ps.stem(token)
        for token in tokens
        if token.lower() not in _ENGLISH_STOPWORDS
    ]

    # Join back together, lower-cased and without surrounding whitespace
    return " ".join(stems).lower().strip()

In [13]:
# Clean every sampled narrative and store the result next to the raw text
sample_df["CLEAN_FOI_TEXT"] = sample_df["FOI_TEXT"].apply(preprocess_text)

In [14]:
# Restore the saved TfidfVectorizer.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a pickle that comes from a trusted source.
tfidf_path = "./labeled_data/xgb_tfidf.pickle"
with open(tfidf_path, "rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# Restore the saved classifier into a fresh XGBClassifier instance
model_path = "./labeled_data/xgb_model.xgb"
model = xgb.XGBClassifier()
model.load_model(model_path)

In [15]:
# Classes to be predicted (must be in this order): each label returned by
# model.predict is used as a position in this list to look up the class name.
# NOTE(review): presumably this order matches the label encoding used when the
# model was trained - confirm against the training notebook.
classes = [
    "Design Engineer",
    "Management",
    "Manufacturing Engineer",
    "Quality Engineering",
    "Stakeholder Unassigned",
]

In [16]:
# Vectorize the cleaned text with the loaded TF-IDF vectorizer
X_tfidff = tfidf.transform(sample_df["CLEAN_FOI_TEXT"].values)

# The vectorizer output is sparse; convert it to a dense array for the model
X_dense = X_tfidff.toarray()

# Predict a label per row and translate each label into its class name
predicted_labels = model.predict(X_dense)
predictions = []
for label in predicted_labels:
    predictions.append(classes[label])

In [17]:
# Pair each sampled narrative with its predicted class name
prediction_columns = {
    "FOI_TEXT": sample_df["FOI_TEXT"],
    "PREDICTION": predictions,
}
predictions_df = pd.DataFrame(prediction_columns)

In [18]:
# Write the predictions into the working directory, then display them
predictions_path = f"{working_directory}/predictions_{data_year}.csv"
predictions_df.to_csv(predictions_path)
predictions_df

Unnamed: 0_level_0,FOI_TEXT,PREDICTION
MDR_REPORT_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1
8219546,NO PRODUCT WAS RETURNED FOR EVALUATION. SHOULD...,Stakeholder Unassigned
9470368,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,Stakeholder Unassigned
8966264,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,Stakeholder Unassigned
9446129,IT WAS REPORTED THAT A BROKEN SENSOR WIRE OCCU...,Management
9269744,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,Management
...,...,...
8782709,IT WAS REPORTED THAT TRANSMITTER FAILED ERROR ...,Quality Engineering
9420589,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,Manufacturing Engineer
8579887,IT WAS REPORTED THAT MISSING SENSOR WIRE OCCUR...,Management
9151465,THE PRODUCT WAS EVALUATED. AN EXTERNAL VISUAL ...,Quality Engineering
