# Clean 2020 Data
In previous processing, some columns were removed to minimize the number of columns in the data.

However, the data set contained more than 3M rows of data.  

To shrink it, this notebook keeps the following columns, because they are needed to identify and remove the unneeded rows:
- 'DEVICE_REPORT_PRODUCT_CODE'
- 'FOI_TEXT'

In [54]:
import os

# Identify the data directory, working directory, and data files
data_directory = './2020_reprocessed'
working_directory = './2020_clean'
# Built from data_directory so the input path follows the directory if it is ever changed.
data_file_2020 = f"{data_directory}/2020_data_complete.csv"

# Create the working directory if needed.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely; any other OS-level failure is reported below.
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")



In [55]:
import pandas as pd

# Load the full 2020 data set into a pandas DataFrame.
data_2020 = pd.read_csv(
    data_file_2020,       # path assigned in the configuration cell above
    on_bad_lines='warn',  # report malformed lines as warnings instead of raising an error
    dtype='str',          # read every column as text; nothing is parsed as a number
)

In [56]:
# Replace missing values (NaN) with an empty string.
# pandas reads empty fields, and markers such as 'N/A', as NaN; filling them with ''
# lets the later `== ''` comparisons and `.str` filters work on every row.
data_2020.fillna('', inplace=True)

In [57]:
# Report the size of the frame as loaded, before any columns or rows are removed.
print(f"Number of: (Rows, Columns) = {data_2020.shape}")

Number of: (Rows, Columns) = (3856740, 29)


## Remove Unwanted Columns

In [58]:
# Columns that are not needed for the row-level cleaning below.
unwanted_columns = [
    'MDR_REPORT_KEY',
    'MDR_TEXT_KEY',
    'TEXT_TYPE_CODE',
    'PATIENT_SEQUENCE_NUMBER',
    'DATE_REPORT',
    'DEVICE_SEQUENCE_NO',
    'BRAND_NAME',
    'MANUFACTURER_D_NAME',
    'MODEL_NUMBER',
    'DEVICE_AVAILABILITY',
    'REPORT_NUMBER',
    'REPORT_SOURCE_CODE',
    'NUMBER_DEVICES_IN_EVENT',
    'DATE_RECEIVED',
    'INITIAL_REPORT_TO_FDA',
    'MANUFACTURER_G1_NAME',
    'REMEDIAL_ACTION',
    'EVENT_TYPE',
    'MANUFACTURER_NAME',
    'TYPE_OF_REPORT',
    'SUMMARY_REPORT',
    'NOE_SUMMARIZED',
    'UDI-DI',
    'UDI-PUBLIC',
]

# Remove them from data_2020 in place.
# Note: this raises KeyError if the cell is run a second time, because the
# columns are already gone from the frame.
data_2020.drop(columns=unwanted_columns, inplace=True)

In [59]:
# Report the size of the frame after the unwanted columns were dropped.
print(f"Number of: (Rows, Columns) = {data_2020.shape}")

Number of: (Rows, Columns) = (3856740, 5)


In [60]:
# Preview the remaining columns; the notebook shows a truncated view of the frame.
data_2020

Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE
0,THE RESULTS OF THE INVESTIGATION ARE INCONCLUS...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS
1,IT WAS REPORTED THAT THE PATIENT EXPIRED. THER...,2993,Adverse Event Without Identified Device or Use...,DEFIBRILLATION LEAD,LWS
2,INVESTIGATION RESULTS WILL BE PROVIDED IN THE ...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ
3,IT WAS REPORTED THAT THE PATIENT CALLED EMERGE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ
4,COMMUNICATION FAILURE AND PREMATURE BATTERY DE...,1332,Failure to Interrogate,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,NVZ
...,...,...,...,...,...
3856735,A REVIEW OF THE SUBJECT DEVICE DHR CONFIRMED T...,2937,Failure of Device to Self-Test,HOLMIUM (HO:YAG) SURGICAL LASERS AND DELIVERY ...,GEX
3856736,THE CUSTOMER STATED THAT THE PREFENSE MONITORI...,3010,Power Problem,CENTRAL MONITORING STATION,DRG
3856737,THE CUSTOMER STATED THAT THE PREFENSE MONITORI...,4032,Unintended Application Program Shut Down,CENTRAL MONITORING STATION,DRG
3856738,THE CUSTOMER STATED THAT THE PREFENSE MONITORI...,3010,Power Problem,CENTRAL MONITORING STATION,DRG


## Cleaning Data by Dropping Rows Matching Specific Criteria

Use [this answer on Stack Overflow](https://stackoverflow.com/questions/13851535/how-to-delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression) as a reference for dropping rows from a dataframe based on a condition, including regular-expression matches.

In summary:
```
new_data_frame = previous_data_frame.drop(previous_data_frame[CONDITION GOES HERE; ie, previous_data_frame.COLUMN_NAME == "Some Text"].index)
```

### Drop rows where GENERIC_NAME starts with "UNK" ("UNKNOWN", "UNKOWN", or "UNK")

In [61]:
# Drop rows where GENERIC_NAME starts with "UNK" ("UNKNOWN", "UNKOWN", or "UNK")
# The pattern is anchored with ^, so only names that begin with UNK are flagged.
generic_name_starts_with_unk = data_2020["GENERIC_NAME"].str.contains(r'^UNK')
remove_generic_name_starts_with_unk = data_2020.drop(
    index=data_2020.index[generic_name_starts_with_unk]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(data_2020)}")
print(f"New row count      = {len(remove_generic_name_starts_with_unk)}")
print(f"Rows removed       = {len(data_2020) - len(remove_generic_name_starts_with_unk)}")

Previous row count = 3856740
New row count      = 3853567
Rows removed       = 3173


### Drop rows where DEVICE_PROBLEM_TEXT == "Insufficient Information"

In [62]:
# Drop rows where DEVICE_PROBLEM_TEXT == "Insufficient Information"
# This is an exact, case-sensitive comparison against the whole field.
problem_text_is_insufficient = (
    remove_generic_name_starts_with_unk["DEVICE_PROBLEM_TEXT"] == "Insufficient Information"
)
remove_device_problem_text_insufficient_information = remove_generic_name_starts_with_unk.drop(
    index=remove_generic_name_starts_with_unk.index[problem_text_is_insufficient]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_generic_name_starts_with_unk)}")
print(f"New row count      = {len(remove_device_problem_text_insufficient_information)}")
print(f"Rows removed       = {len(remove_generic_name_starts_with_unk) - len(remove_device_problem_text_insufficient_information)}")

Previous row count = 3853567
New row count      = 3773732
Rows removed       = 79835


### Drop rows where GENERIC_NAME is a number

In [83]:
# Drop rows where GENERIC_NAME is a number
# The pattern only flags names made up entirely of digits (one or more).
generic_name_is_all_digits = (
    remove_device_problem_text_insufficient_information["GENERIC_NAME"].str.match(r'^\d+$')
)
remove_generic_name_is_number = remove_device_problem_text_insufficient_information.drop(
    index=remove_device_problem_text_insufficient_information.index[generic_name_is_all_digits]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_device_problem_text_insufficient_information)}")
print(f"New row count      = {len(remove_generic_name_is_number)}")
print(f"Rows removed       = {len(remove_device_problem_text_insufficient_information) - len(remove_generic_name_is_number)}")

Previous row count = 3773732
New row count      = 3773714
Rows removed       = 18


### Drop rows where GENERIC_NAME is blank


In [84]:
# Drop rows where GENERIC_NAME is blank
# "Blank" means exactly the empty string; missing values were filled with '' after loading.
generic_name_is_empty = remove_generic_name_is_number["GENERIC_NAME"] == ''
remove_generic_name_is_blank = remove_generic_name_is_number.drop(
    index=remove_generic_name_is_number.index[generic_name_is_empty]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_generic_name_is_number)}")
print(f"New row count      = {len(remove_generic_name_is_blank)}")
print(f"Rows removed       = {len(remove_generic_name_is_number) - len(remove_generic_name_is_blank)}")

Previous row count = 3773714
New row count      = 3761023
Rows removed       = 12691


### Drop Rows Where FOI_TEXT == '(B)(4).'
Reference: [USING TEXT MINING OF FDA REPORTS TO INFORM EARLY SIGNAL DETECTION OF CARDIOVASCULAR LEAD RECALLS](https://dashboard.digitalcollections.cuanschutz.edu/downloads/326bf216-7e24-40b3-80b5-2c9afda1da55)

In [150]:
# Drop Rows Where FOI_TEXT == '(B)(4).'
# The pattern also tolerates one optional whitespace character after "(B)" and after "(4)".
# Because ^ and $ sit inside the group, the trailing + cannot match a second repetition,
# so only a single "(B)(4)." is matched here.
foi_text_is_only_b4 = remove_generic_name_is_blank["FOI_TEXT"].str.match(
    r'(^\(B\)\s?\(4\)\s?\.$)+'
)
remove_foitext_equals_b4_1 = remove_generic_name_is_blank.drop(
    index=remove_generic_name_is_blank.index[foi_text_is_only_b4]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_generic_name_is_blank)}")
print(f"New row count      = {len(remove_foitext_equals_b4_1)}")
print(f"Rows removed       = {len(remove_generic_name_is_blank) - len(remove_foitext_equals_b4_1)}")

Previous row count = 3761023
New row count      = 3487640
Rows removed       = 273383


In [151]:
# Drop Rows Where FOI_TEXT == '(B)(4). (B)(4).'
# Exact string comparison: only this doubled form, with a single space, is removed.
foi_text_is_double_b4 = remove_foitext_equals_b4_1["FOI_TEXT"] == '(B)(4). (B)(4).'
remove_foitext_equals_b4_2 = remove_foitext_equals_b4_1.drop(
    index=remove_foitext_equals_b4_1.index[foi_text_is_double_b4]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_foitext_equals_b4_1)}")
print(f"New row count      = {len(remove_foitext_equals_b4_2)}")
print(f"Rows removed       = {len(remove_foitext_equals_b4_1) - len(remove_foitext_equals_b4_2)}")

Previous row count = 3487640
New row count      = 3487524
Rows removed       = 116


## Count the Product Code Occurrences

In [152]:
# Count how many of the remaining rows carry each product code.
product_code_occurrences = (
    remove_foitext_equals_b4_2
    .groupby(['DEVICE_REPORT_PRODUCT_CODE'])
    .size()
    .to_frame('COUNT')
)

# Show the ten most frequent product codes.
product_code_occurrences.sort_values(by=['COUNT'], ascending=False).head(10)

Unnamed: 0_level_0,COUNT
DEVICE_REPORT_PRODUCT_CODE,Unnamed: 1_level_1
DZE,354972
QBJ,276350
OZP,269978
FRN,256585
OZO,236061
OYC,235809
LGW,82639
QFG,71301
LZG,70815
LWS,59780


In [153]:
# Save the full product-code tally, most frequent first, to the working directory.
# The index (DEVICE_REPORT_PRODUCT_CODE) is written as the first CSV column.
(
    product_code_occurrences
    .sort_values(by=['COUNT'], ascending=False)
    .to_csv(f"{working_directory}/product_code_occurrences.csv")
)

### Identify Rows to Keep Based on Count of Product Code Occurrences
- QBJ

In [155]:
# Drop rows where DEVICE_REPORT_PRODUCT_CODE is not QBJ
# Only rows whose product code is exactly 'QBJ' survive this step.
product_code_is_not_qbj = remove_foitext_equals_b4_2["DEVICE_REPORT_PRODUCT_CODE"] != 'QBJ'
remove_device_product_code_not_qbj = remove_foitext_equals_b4_2.drop(
    index=remove_foitext_equals_b4_2.index[product_code_is_not_qbj]
)

# Summarize how many rows this step removed.
print(f"Previous row count = {len(remove_foitext_equals_b4_2)}")
print(f"New row count      = {len(remove_device_product_code_not_qbj)}")
print(f"Rows removed       = {len(remove_foitext_equals_b4_2) - len(remove_device_product_code_not_qbj)}")

Previous row count = 3487524
New row count      = 276350
Rows removed       = 3211174


In [156]:
# Write the cleaned QBJ-only rows to the working directory.
# NOTE(review): index=False is not passed, so the original row labels are written as an
# unnamed first column -- confirm that downstream readers expect that column.
remove_device_product_code_not_qbj.to_csv(
    path_or_buf=f"{working_directory}/2020_data_clean.csv",
)

## Download Product Codes
Download the [Product Code Zip file](https://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip) from [the FDA website](https://www.fda.gov/medical-devices/classify-your-medical-device/download-product-code-classification-files).

In [157]:
from os.path import exists

import os
import urllib.request

# Source URL of the FDA product-code classification archive.
foiclass_zip = "https://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip"

# Local destination; an existing file here is treated as "already downloaded".
file_path = f"{data_directory}/foiclass.zip"

if exists(file_path):
    print(f"Already downloaded {file_path}; Skipping!")
else:
    print(f"Downloading {file_path}")
    # Download to a temporary name and rename only after the transfer succeeds.
    # Otherwise an interrupted download would leave a truncated zip at file_path,
    # and the exists() check above would skip the download on every later run.
    partial_path = f"{file_path}.part"
    try:
        urllib.request.urlretrieve(foiclass_zip, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename the partial file is gone and this is a no-op;
        # after a failed download it removes the incomplete file.
        if exists(partial_path):
            os.remove(partial_path)

Already downloaded ./2020_reprocessed/foiclass.zip; Skipping!


In [158]:
from zipfile import ZipFile

# Unzip the foiclass_zip files into the working directory
print(f"Unzipping {file_path}")
# The handle is deliberately not named `zip`: at notebook top level that would
# rebind the builtin zip() for every cell executed afterwards.
with ZipFile(file_path, "r") as foiclass_archive:
    foiclass_archive.extractall(working_directory)

print("Unzip complete.")


Unzipping ./2020_reprocessed/foiclass.zip
Unzip complete.


In [159]:
import csv

# Parse the extracted product-code classification file, then replace missing
# values with empty strings.
foiclass = pd.read_csv(
    f"{working_directory}/foiclass.txt",
    sep="|",                 # fields are separated by the '|' character
    encoding="ISO-8859-1",   # the file is decoded with this encoding
    on_bad_lines='warn',     # report malformed lines as warnings instead of raising an error
    quoting=csv.QUOTE_NONE,  # treat quote characters as ordinary text
    dtype='str',             # read every column as text
).fillna('')

In [160]:
# Preview the product-code classification table; the notebook shows a truncated view.
foiclass

Unnamed: 0,REVIEW_PANEL,MEDICALSPECIALTY,PRODUCTCODE,DEVICENAME,DEVICECLASS,UNCLASSIFIED_REASON,GMPEXEMPTFLAG,THIRDPARTYFLAG,REVIEWCODE,REGULATIONNUMBER,SUBMISSION_TYPE_ID,DEFINITION,PHYSICALSTATE,TECHNICALMETHOD,TARGETAREA,Implant_Flag,Life_Sustain_support_flag,SummaryMalfunctionReporting
0,AN,AN,BRW,"Protector, Dental",1,,N,N,,868.5820,4,,,,,N,N,Eligible
1,AN,AN,BRX,"Stool, Anesthesia",1,,N,N,,868.6700,4,,,,,N,N,Eligible
2,AN,AN,BRY,"Cabinet, Table And Tray, Anesthesia",1,,N,N,,868.6100,4,,,,,N,N,Eligible
3,AN,AN,BSE,"Analyzer, Gas, Helium, Gaseous-Phase",2,,N,Y,,868.1640,1,,,,,N,N,Eligible
4,AN,AN,BSF,"Absorber, Carbon-Dioxide",1,,N,N,,868.5310,4,,,,,N,N,Eligible
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6766,,,RIM,"Laser Marker, Cutter, Or Engraver, Tabletop, C...",N,,Y,,,,,"A laser intended for consumer, non-industrial,...",,,,,,Ineligible
6767,,,RIN,Laser Illuminated Lighting Instrument,N,,Y,,,,,A laser illuminated spotlight used for entert...,,,,,,Ineligible
6768,,,RIO,Led Toys And Entertainment Products,N,,Y,,,,,Products that incorporate light emitting diode...,,,,,,Ineligible
6769,,,RIP,Led Diodes Sold As Components To Be Installed ...,N,,Y,,,,,LEDs sold as components to be installed in fin...,,,,,,Ineligible
