In [3]:
import requests
import pandas as pd

# Fetch first 500 recall records
url = "https://api.fda.gov/device/recall.json?limit=500"
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status here, rather than failing later
# when the payload is indexed with "results".
response.raise_for_status()
data = response.json()

# Convert JSON → DataFrame (nested objects are flattened into dotted columns)
df_raw = pd.json_normalize(data["results"])
print("Raw columns:", df_raw.columns.tolist())
print(df_raw.head(3))

# Persist the untouched download so later stages can be re-run offline
df_raw.to_csv("raw_data.csv", index=False)

Raw columns: ['cfres_id', 'product_res_number', 'event_date_initiated', 'event_date_posted', 'recall_status', 'event_date_terminated', 'res_event_number', 'product_code', 'k_numbers', 'product_description', 'code_info', 'recalling_firm', 'address_1', 'city', 'state', 'postal_code', 'additional_info_contact', 'reason_for_recall', 'root_cause_description', 'action', 'product_quantity', 'distribution_pattern', 'openfda.k_number', 'openfda.registration_number', 'openfda.fei_number', 'openfda.device_name', 'openfda.medical_specialty_description', 'openfda.regulation_number', 'openfda.device_class', 'firm_fei_number', 'other_submission_description', 'address_2', 'event_date_created', 'pma_numbers', 'openfda.pma_number']
  cfres_id product_res_number event_date_initiated event_date_posted  \
0    30043          Z-0001-04           2003-10-27        2003-10-29   
1    34578          Z-0001-05           2004-07-08        2004-10-05   
2    41815          Z-0001-06           2005-09-13        20

In [5]:

import numpy as np

# Select useful columns from your dataset and rename them to match the
# earlier schema. Chaining .rename() (instead of inplace=True) builds the
# frame in one expression and leaves df_raw untouched.
df = df_raw[[
    "cfres_id",
    "reason_for_recall",
    "recall_status",
    "event_date_initiated"
]].rename(columns={
    "cfres_id": "Document_ID",
    "reason_for_recall": "Design_Input",
    "recall_status": "Risk_Assessment",   # originally classification
    "event_date_initiated": "Last_Reviewed"
})

# Map risk levels (based on recall_status instead of classification)
risk_map = {
    "Ongoing": "High",
    "Completed": "Medium",
    "Terminated": "Low"
}

# Series.map() turns every value that is not a key of risk_map into NaN.
# Report such statuses so the gap is visible instead of silent.
# NOTE(review): confirm the exact recall_status vocabulary returned by the API.
recall_status = df["Risk_Assessment"]
unmapped_mask = recall_status.notna() & ~recall_status.isin(list(risk_map))
if unmapped_mask.any():
    print(
        "Warning: recall_status values without a risk mapping (left as NaN):",
        recall_status[unmapped_mask].unique().tolist(),
    )
df["Risk_Assessment"] = recall_status.map(risk_map)

# Add compliance score (simulate since not available in dataset).
# The fixed seed makes the simulated values reproducible across runs.
np.random.seed(42)
df["Compliance_Score"] = np.random.uniform(40, 100, size=len(df)).round(2)

# Extract critical issues (simple keyword search in reason_for_recall)
def count_issues(text, keywords=("injury", "death", "contamination", "failure", "defect")):
    """Count how many distinct critical-issue keywords occur in ``text``.

    Matching is a case-insensitive substring test, so each keyword contributes
    at most 1 regardless of how often it appears, and inflected forms that do
    not contain the keyword (e.g. "failed" vs. "failure") are not counted.

    Parameters
    ----------
    text : object
        The recall reason. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, lists, ...) is treated as "no text" and yields 0.
    keywords : iterable of str, optional
        Keywords to look for; defaults to the original five-term list.

    Returns
    -------
    int
        Number of keywords found in ``text``.
    """
    # Guarding on type covers missing values and also prevents an
    # AttributeError on non-string cells that have no .lower().
    if not isinstance(text, str):
        return 0
    lowered = text.lower()  # lowercase once instead of once per keyword
    return sum(word.lower() in lowered for word in keywords)

# Run the keyword counter over every recall reason and keep the tally per row
issue_counts = df["Design_Input"].map(count_issues)
df["Critical_Issues"] = issue_counts

# Show the first few rows as a quick sanity check
preview = df.head()
print(preview)

# Save as CSV
df.to_csv("extracted_data.csv", index=False)


  Document_ID                                       Design_Input  \
0       30043  A minimum source-skin-distance of less than 30...   
1       34578  Wound dressing failed to meet sterility specif...   
2       41815  The packaging may be compromised such that ste...   
3       46524  Weld defect- the weld between the post body an...   
4       53825  Bio-logic System Corp Camera Pole may weaken a...   

  Risk_Assessment Last_Reviewed  Compliance_Score  Critical_Issues  
0             Low    2003-10-27             62.47                0  
1             Low    2004-07-08             97.04                0  
2             Low    2005-09-13             83.92                0  
3             Low    2006-05-31             75.92                1  
4             Low    2007-07-26             49.36                0  


In [10]:

# Clean and preprocess data

# Start from an independent copy so the extracted frame is never modified
df_clean = df.copy()

# Impute gaps: text -> "Unknown", risk -> "Medium", score -> column median,
# issue count -> 0 (then forced to integer dtype)
median_score = df_clean["Compliance_Score"].median()
df_clean = df_clean.assign(
    Design_Input=lambda frame: frame["Design_Input"].fillna("Unknown"),
    Risk_Assessment=lambda frame: frame["Risk_Assessment"].fillna("Medium"),
    Compliance_Score=lambda frame: frame["Compliance_Score"].fillna(median_score),
    Critical_Issues=lambda frame: frame["Critical_Issues"].fillna(0).astype(int),
)

# Numeric encoding of the risk label, used only inside the composite formula
# (kept as a standalone Series so no temporary column is added to the frame)
risk_map = {"Low": 1, "Medium": 2, "High": 3}
risk_score = df_clean["Risk_Assessment"].map(risk_map)

# Composite Compliance Score:
#   60% simulated compliance score
# + 30% of a risk term (100 minus 30 points per risk level)
# - 10% of an issue term (10 points per critical issue)
# bounded to the 0-100 range
weighted_compliance = 0.6 * df_clean["Compliance_Score"]
weighted_risk = 0.3 * (100 - (risk_score * 30))
weighted_issues = 0.1 * (df_clean["Critical_Issues"] * 10)
composite = weighted_compliance + weighted_risk - weighted_issues
df_clean["Composite_Compliance_Score"] = composite.clip(0, 100)

# Save preprocessed dataset
df_clean.to_csv("preprocessed_recall_data.csv", index=False)
print("Preprocessed dataset saved successfully!")



Preprocessed dataset saved successfully!
