# Deploy the Model and Process QBJ Records from Year 2019

1. Create the working directory
1. Unzip the following data files into the working 

    - `./01-Download-MAUDE-Data/device2019.zip`
    - `./01-Download-MAUDE-Data/foitext2019.zip`
    
1. Read the data into dataframes
1. Remove any rows from `device2019` that DO NOT have `DEVICE_REPORT_PRODUCT_CODE == QBJ` 
1. Combine the data using the `MDR_REPORT_KEY` column
1. Sample 5-10% of the data

In [1]:
# Set up the zip and data files to use the selected year
data_year = "2019"

foitext_zip_file = f"foitext{data_year}.zip"
device_zip_file = f"device{data_year}.zip"

foitext_data_file = f"foitext{data_year}.txt"
device_data_file = f"device{data_year}.txt"

In [2]:
from os.path import exists

import os
import urllib.request

data_directory = "./01-Download-MAUDE-Data"
working_directory = "./42-Report-Classification-Deployment"


# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {data_directory}: {error}")

In [3]:
from zipfile import ZipFile

zip_files = [foitext_zip_file, device_zip_file]

# Unzip the data files into the working directory
for i in zip_files:
    print(f"Unzipping {i}")
    with ZipFile(f"{data_directory}/{i}", "r") as zip:
        zip.extractall(f"{working_directory}")

print("Unzip complete.")

# Change file names in working directory to lower case
for file in os.listdir(working_directory):
    os.rename(f"{working_directory}/{file}", f"{working_directory}/{file.lower()}")

Unzipping foitext2019.zip
Unzipping device2019.zip
Unzip complete.


In [4]:
import pandas as pd
import csv

# Create a dataframe for the foitext file
foitext_df = pd.read_csv(
    f"{working_directory}/{foitext_data_file}",
    sep="|",
    encoding="ISO-8859-1",
    on_bad_lines="warn",
    quoting=csv.QUOTE_NONE,
    dtype="str",
)

# Use the MDR_REPORT_KEY column as the index for the dataframe without creating a new dataframe
foitext_df.set_index("MDR_REPORT_KEY", inplace=True)

# Replace any records that Pandas converted to 'N/A' with an empty string.  This is needed for further processing
foitext_df.fillna("", inplace=True)

print(f"foitext data frame creation complete: {foitext_df.shape}")

foitext data frame creation complete: (3252822, 5)


In [5]:
import pandas as pd
import csv

# Create a dataframe for the device file
device_df = pd.read_csv(
    f"{working_directory}/{device_data_file}",
    sep="|",
    encoding="ISO-8859-1",
    on_bad_lines="warn",
    quoting=csv.QUOTE_NONE,
    dtype="str",
)

# Use the MDR_REPORT_KEY column as the index for the dataframe without creating a new dataframe
device_df.set_index("MDR_REPORT_KEY", inplace=True)

# Identify the unwanted columns
unwanted_columns = [
    "DEVICE_EVENT_KEY",
    "IMPLANT_FLAG",
    "DATE_REMOVED_FLAG",
    "DATE_RECEIVED",
    "MANUFACTURER_D_ADDRESS_1",
    "MANUFACTURER_D_ADDRESS_2",
    "MANUFACTURER_D_CITY",
    "MANUFACTURER_D_STATE_CODE",
    "MANUFACTURER_D_ZIP_CODE",
    "MANUFACTURER_D_ZIP_CODE_EXT",
    "MANUFACTURER_D_COUNTRY_CODE",
    "MANUFACTURER_D_POSTAL_CODE",
    "DEVICE_OPERATOR",
    "EXPIRATION_DATE_OF_DEVICE",
    "CATALOG_NUMBER",
    "LOT_NUMBER",
    "OTHER_ID_NUMBER",
    "DATE_RETURNED_TO_MANUFACTURER",
    "DEVICE_AGE_TEXT",
    "DEVICE_EVALUATED_BY_MANUFACTUR",
    "COMBINATION_PRODUCT_FLAG",
]

# Remove the unwanted columns from the device dataframe
device_df.drop(unwanted_columns, axis=1, inplace=True)

# Replace any records that Pandas converted to 'N/A' with an empty string.  This is needed for further processing
device_df.fillna("", inplace=True)

print(f"device data frame creation complete (rows, columns): {device_df.shape}")

device data frame creation complete (rows, columns): (1333496, 9)


In [6]:
# Filter rows where DEVICE_REPORT_PRODUCT_CODE is "QBJ"
device_df = device_df[device_df["DEVICE_REPORT_PRODUCT_CODE"] == "QBJ"]

print(f"device data frame QBJ filter complete (rows, columns): {device_df.shape}")

device data frame QBJ filter complete (rows, columns): (80807, 9)


In [7]:
# Create the primary dataframe by merging the device and foitext and device dataframes
# Merge with device_df on the left so that only the associated rows from foitext are retained
df = pd.merge(device_df, foitext_df, on="MDR_REPORT_KEY", how="inner")

print(f"device and foitext merge complete (rows, columns): {df.shape}")

device and foitext merge complete (rows, columns): (192942, 14)


In [8]:
# Remove rows where FOI_TEXT contains "(B)(4)"
df = df[~df["FOI_TEXT"].str.contains("\(B\)\(4\)")]

print(f"data clean complete (rows, columns): {df.shape}")

data clean complete (rows, columns): (91217, 14)


In [9]:
# Get rid of some columns that are not needed
columns_to_remove = [
    "BRAND_NAME",
    "DATE_REPORT",
    "DEVICE_AVAILABILITY",
    "DEVICE_REPORT_PRODUCT_CODE",
    "DEVICE_SEQUENCE_NO",
    "GENERIC_NAME",
    "MANUFACTURER_D_NAME",
    "MDR_TEXT_KEY",
    "MODEL_NUMBER",
    "PATIENT_SEQUENCE_NUMBER",
    "TEXT_TYPE_CODE",
    "UDI-DI",
    "UDI-PUBLIC",
]

# Remove the specified columns
df = df.drop(columns=columns_to_remove)

In [10]:
# Calculate the number of rows representing 10% of the DataFrame
n = int(len(df) * 0.01)

# Create a new DataFrame with the first 10% of rows
sample_df = df.sample(n=n, random_state=42)

In [11]:
sample_df.shape

(912, 1)

In [12]:
sample_df.head()

Unnamed: 0_level_0,FOI_TEXT
MDR_REPORT_KEY,Unnamed: 1_level_1
8219546,NO PRODUCT WAS RETURNED FOR EVALUATION. SHOULD...
9470368,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...
8966264,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...
9446129,IT WAS REPORTED THAT A BROKEN SENSOR WIRE OCCU...
9269744,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...


In [13]:
# Imports for working with the model
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate
from wordcloud import WordCloud
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
import time
import xgboost as xgb

ps = PorterStemmer() #creating an instance of the class
tfidf = TfidfVectorizer(min_df=2)

In [14]:
# Load the model
model_pickle = f"{working_directory}/XGB_model.pickle"


# Load the variable using a "with" clause
with open(model_pickle, 'rb') as file:
    model = pickle.load(file)

In [15]:
def preprocess_text(text):
    """In preprocess_text function we will apply all the things that given below:
    - removing links
    - removing special characters
    - removing punctuations
    - removing numbers
    - removing stopwords
    - doing stemming
    - transforming in lowercase
    - removing excessive whitespaces
    """
    # remove links
    text = re.sub(r"http\S+", "", str(text))
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", str(text))
    # remove punctuations in string
    text = re.sub(r'[^\w\s]', "", str(text)) 
    # 1. tokenize
    tokens = nltk.word_tokenize(text)
    # 2. check if stopword and stemming the word
    tokens = [ps.stem(w) for w in tokens if not w.lower() in stopwords.words("english")]
    # 3. join back together
    text = " ".join(tokens)
    # return text in lower case and stripped of whitespaces
    text = text.lower().strip()
    return text

In [16]:
def prediction(text):
  predict_text = pd.DataFrame([text])
  p = predict_text[0].apply(lambda x: preprocess_text(x)).values
  classes = ['Design Engineer', 'Management', 'Manufacturing Engineer',
       'Quality Engineering', 'Stakeholder Unassigned']
  predict = classes[model.predict(tfidf.transform(p).toarray())[0]]
  print("Predicted Text: ", predict)

In [17]:
# Loop through each row of the "FOI_TEXT" column
for foi_text in df['FOI_TEXT']:
    prediction(foi_text)

NotFittedError: The TF-IDF vectorizer is not fitted