## 1. Extract Data

In [12]:
import pandas as pd

# Extract: read the patient roster from the raw CSV hosted on GitHub
patientsdf = pd.read_csv(
    'https://raw.githubusercontent.com/Christelleelkhoury/Data-Engineering/refs/heads/main/Week%203/patients.csv'
)
print("Extracted Patient Data:", patientsdf, sep="\n")

Extracted Patient Data:
    patient_id             name  age  gender
0         P001      James Smith   45    Male
1         P002     Mary Johnson   32  Female
2         P003  Robert Williams   56    Male
3         P004   Patricia Brown   29  Female
4         P005       John Jones   67    Male
..         ...              ...  ...     ...
195       P196     Emily Brooks   41  Female
196       P197      Jack Fisher   29    Male
197       P198       Judith Lee   50  Female
198       P199       Sean Kelly   38    Male
199       P200  Rebecca Sanders   57  Female

[200 rows x 4 columns]


In [13]:
# Simulated API: a hard-coded stand-in for the payload a diagnostics service
# would return, one dict per test result.

_DIAGNOSTIC_FIELDS = ("diagnostic_id", "patient_id", "test", "result")
_DIAGNOSTIC_ROWS = (
    ("D001", "P001", "Blood Test", "Normal"),
    ("D002", "P002", "X-Ray", "Fracture"),
    ("D003", "P003", "MRI", "Normal"),
)

# Pair every row with the field names, preserving the key order shown above.
diagnostic_data = [dict(zip(_DIAGNOSTIC_FIELDS, row)) for row in _DIAGNOSTIC_ROWS]

print("Extracted Diagnostic Data:", diagnostic_data, sep="\n")

Extracted Diagnostic Data:
[{'diagnostic_id': 'D001', 'patient_id': 'P001', 'test': 'Blood Test', 'result': 'Normal'}, {'diagnostic_id': 'D002', 'patient_id': 'P002', 'test': 'X-Ray', 'result': 'Fracture'}, {'diagnostic_id': 'D003', 'patient_id': 'P003', 'test': 'MRI', 'result': 'Normal'}]


## 2. Transform Data


*   **Clean patient data**: Let’s assume you need to filter out patients who are younger than 40 years old for a specific study.
*   **Enrich diagnostic data** with patient information: Join the diagnostics data with patient details (name, age, gender) to provide context for the test results.


In [14]:
# Study cohort: keep patients aged 40 or older, i.e. drop everyone under 40.
# The original row labels are preserved (no index reset).
filtered_patientsdf = patientsdf.loc[patientsdf['age'].ge(40)]
print("Filtered Patient Data:", filtered_patientsdf, sep="\n")

Filtered Patient Data:
    patient_id               name  age  gender
0         P001        James Smith   45    Male
2         P003    Robert Williams   56    Male
4         P005         John Jones   67    Male
5         P006       Linda Garcia   40  Female
7         P008      Barbara Davis   55  Female
..         ...                ...  ...     ...
193       P194  Dorothy Patterson   48  Female
194       P195      Benjamin Ward   55    Male
195       P196       Emily Brooks   41  Female
197       P198         Judith Lee   50  Female
199       P200    Rebecca Sanders   57  Female

[127 rows x 4 columns]


In [15]:
# Enrich each test result with its patient's name, age and gender.
# A left join keeps every diagnostic row even if no patient record matches.

diagnostic_datadf = pd.DataFrame(diagnostic_data).merge(
    patientsdf[['patient_id', 'name', 'age', 'gender']],
    on='patient_id',
    how='left',
)
print("Enriched Diagnostic Data:", diagnostic_datadf, sep="\n")

Enriched Diagnostic Data:
  diagnostic_id patient_id        test    result             name  age  gender
0          D001       P001  Blood Test    Normal      James Smith   45    Male
1          D002       P002       X-Ray  Fracture     Mary Johnson   32  Female
2          D003       P003         MRI    Normal  Robert Williams   56    Male


## 3. Load Data into MongoDB

In [8]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.11


In [9]:
import os
from getpass import getpass

from pymongo import MongoClient

# Connect to MongoDB Atlas. The connection string embeds the username and
# password, so it must never be stored in the notebook: take it from the
# MONGODB_URI environment variable, or prompt for it (input hidden) when unset.
# NOTE(review): the credentials that were hardcoded here have been exposed and
# should be rotated in Atlas.
mongodb_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB connection URI: ")
client = MongoClient(mongodb_uri)


In [10]:
# Handles used by the load cells: the target database and its patients collection
db = client.get_database('patient_database')
collection = db.get_collection('patients')

In [16]:
# Turn each filtered patient row into its own document, then bulk-insert them
# into the patients collection
patients_dict = filtered_patientsdf.to_dict('records')
collection.insert_many(patients_dict)
print("Loaded Patient Data into MongoDB")


Loaded Patient Data into MongoDB


In [17]:
# Convert the enriched diagnostics to one document per row and insert them into
# their own `diagnostics` collection. `collection` is bound to `patients`, so
# writing through it would mix test results in with the patient documents.
diagnostic_data_dict = diagnostic_datadf.to_dict(orient='records')
db['diagnostics'].insert_many(diagnostic_data_dict)
print("Loaded Diagnostic Data into MongoDB")

Loaded Diagnostic Data into MongoDB


## 4. Automate the ETL Process

In [18]:
#automate ETL process

def extract_patients(source='https://raw.githubusercontent.com/Christelleelkhoury/Data-Engineering/refs/heads/main/Week%203/patients.csv'):
    """Extract step: read the patient roster into a DataFrame.

    Args:
        source: Anything ``pd.read_csv`` accepts (URL, local path, or
            file-like object). Defaults to the CSV hosted on GitHub, so
            calling with no arguments behaves as before.

    Returns:
        pandas.DataFrame parsed from ``source``.
    """
    return pd.read_csv(source)

def extract_diagnostics(records=None):
    """Extract step: build the diagnostics DataFrame from API-style records.

    Args:
        records: Optional list of dicts, one per diagnostic result. When
            omitted, the notebook-level ``diagnostic_data`` list is used, so
            calling with no arguments behaves as before.

    Returns:
        pandas.DataFrame with one row per record.
    """
    if records is None:
        # Fall back to the simulated API payload defined earlier in the notebook.
        records = diagnostic_data
    return pd.DataFrame(records)

def transform_patients(patients_df, min_age=40):
    """Transform step: keep only patients old enough for the study.

    Args:
        patients_df: DataFrame with an ``age`` column.
        min_age: Inclusive lower age bound. Defaults to 40, the study's
            cut-off, so existing one-argument calls are unchanged.

    Returns:
        The rows of ``patients_df`` whose ``age`` is at least ``min_age``,
        with their original index labels.
    """
    return patients_df[patients_df['age'] >= min_age]

def transform_diagnostics(diagnostics_df, patients_df):
    """Transform step: attach patient name, age and gender to each diagnostic.

    Performs a left join on ``patient_id``, so every diagnostic row is kept
    and rows without a matching patient get missing values in the patient
    columns.
    """
    patient_context = patients_df[['patient_id', 'name', 'age', 'gender']]
    return diagnostics_df.merge(patient_context, on='patient_id', how='left')

def load_data(patients_df, diagnostics_df, database=None):
    """Load step: insert patients and diagnostics into their collections.

    Args:
        patients_df: DataFrame written to the ``patients`` collection.
        diagnostics_df: DataFrame written to the ``diagnostics`` collection.
        database: Optional database handle exposing ``patients`` and
            ``diagnostics`` collections. When omitted, the notebook-level
            ``db`` is used, so existing two-argument calls are unchanged.
    """
    target_db = db if database is None else database

    patient_records = patients_df.to_dict(orient='records')
    diagnostic_records = diagnostics_df.to_dict(orient='records')

    # insert_many rejects an empty document list, so skip frames with no rows
    # instead of failing the whole load.
    if patient_records:
        target_db.patients.insert_many(patient_records)
    if diagnostic_records:
        target_db.diagnostics.insert_many(diagnostic_records)

# Run the pipeline end to end: extract -> transform -> load.
patients_df = extract_patients()
diagnostics_df = extract_diagnostics()
transformed_patients_df = transform_patients(patients_df)
# Diagnostics are enriched from the unfiltered patient frame.
transformed_diagnostics_df = transform_diagnostics(diagnostics_df, patients_df)

# Load step: without this call nothing is written to MongoDB.
# NOTE(review): insert_many appends documents, so re-running this cell inserts
# the same records again.
load_data(transformed_patients_df, transformed_diagnostics_df)

print("ETL Process Completed!")

ETL Process Completed!
