# Combine the 2020 and 2021 QBJ Data
Using the stratified data as input, combine the QBJ data from 2020 and 2021 into a single dataframe.

Output the combined data in CSV form for further processing.


In [1]:
import pandas as pd
import os
import csv

# Directory that holds this notebook's output (the combined CSV is written here)
working_directory = './15-Combine-2020-2021-Stratified-Data'

# Create the working directory if needed. exist_ok=True makes this a no-op when the
# directory already exists; any other OS-level failure is printed rather than raised,
# so later cells that write into the directory will fail if creation did not succeed.
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [2]:
# Input files: the per-year stratified QBJ data
qbj_2020_strata = "./09-Data-Stratification/qbj_2020_strata.csv"
qbj_2021_strata = "./09-Data-Stratification/qbj_2021_strata.csv"

# Read both CSV files into separate dataframes, loading every column as text
df_2020 = pd.read_csv(qbj_2020_strata, dtype=str)
df_2021 = pd.read_csv(qbj_2021_strata, dtype=str)

# Concatenate both years into a single dataframe with a fresh 0..n-1 row index
df_combined = pd.concat([df_2020, df_2021], ignore_index=True)

# Rename the first column to "ROW_ID" by its position. A complete new label list is
# assigned instead of writing into `columns.values`: a pandas Index is immutable, and
# mutating its backing array can leave label lookups (e.g. df['ROW_ID']) out of sync.
df_combined.columns = ['ROW_ID', *df_combined.columns[1:]]

# Drop the second column (its label is looked up by position)
df_combined = df_combined.drop(columns=df_combined.columns[1])

# Convert any NaN values to empty strings
df_combined = df_combined.fillna('')

In [3]:
# show the dimensions (rows, columns) of the combined dataframe
df_combined.shape

(5736, 13)

In [4]:
# preview the first two rows of the combined dataframe
df_combined.head(2)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,06/05/2020,0,,I,CONSUMER


## Write the combined data to a CSV file in the working directory

In [5]:
# Destination of the combined data inside the working directory
combined_csv_path = f"{working_directory}/qbj_data_combined.csv"

# Write the dataframe without its row index so only the data columns are saved
df_combined.to_csv(combined_csv_path, index=False)

## Upload All Output to an S3 Bucket

In [6]:
import os
import subprocess

# Build the upload command for the AWS command line interface: sync the working
# directory to the bucket, skipping Jupyter checkpoint files. The command is passed
# as an argument list, so no shell is involved.
command = ["aws", "s3", "sync", working_directory,
           "s3://praxis-2023-html-output", "--exclude", "*/.ipynb_checkpoints/*", "--no-progress"]

# Run the command and wait for it to complete, capturing stdout and stderr as text
output = subprocess.run(command, capture_output=True, text=True)

# Print the command's output
print(output.stdout)

# A failed sync writes its diagnostics to stderr and leaves stdout empty, so report
# a non-zero exit code explicitly instead of letting the failure pass unnoticed.
if output.returncode != 0:
    print(f"aws s3 sync failed with exit code {output.returncode}:\n{output.stderr}")

upload: 15-Combine-2020-2021-Stratified-Data/qbj_data_combined.csv to s3://praxis-2023-html-output/qbj_data_combined.csv

