In [5]:
import pandas as pd
from pymongo import MongoClient
import os
from pathlib import Path

In [7]:

# Treat the parent of the current working directory as the project root and
# build the path to the CSV that lives in its 'data' folder.
script_dir = Path.cwd()
project_root = script_dir.parent
data_path = project_root.joinpath('data', 'healthcare_dataset.csv')

# Load the raw dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)


In [8]:

# Boolean mask over the rows: True when the whole row repeats an earlier one
# (the first occurrence of each row is not flagged).
duplicates = data.duplicated(keep='first')

# Only report when at least one repeated row exists.
if duplicates.any():
    print("Ces lignes sont en double :")
    print(data[duplicates])


Ces lignes sont en double :
                     Name  Age  Gender Blood Type Medical Condition  \
50023        Samuel joYCe   56    Male         O+         Arthritis   
50040    KIMBErly vasqueZ   26    Male         A-           Obesity   
50055      SCoTT tHOrNton   55  Female         B-           Obesity   
50070         wiLliam LEe   27  Female         O+         Arthritis   
50078  chRistOPheR NoRrIs   38    Male        AB-         Arthritis   
...                   ...  ...     ...        ...               ...   
55461      connOR coMPTon   63    Male         A+            Asthma   
55462       alYSsA mIlLER   35  Female         A-          Diabetes   
55464        ChRIs huGHeS   35  Female        AB-           Obesity   
55484     keNNEtH alvarez   80    Male         O+            Cancer   
55491       lInDA grIFfin   47  Female         O+           Obesity   

      Date of Admission             Doctor                    Hospital  \
50023        2022-11-03     Krista Hartman   

In [4]:
# Flag rows whose (Name, Date of Admission) pair was already seen in an
# earlier row; only these two columns take part in the comparison.
duplicates = data.duplicated(subset=['Name', 'Date of Admission'])

if duplicates.any():
    print("Les combinaisons suivantes (Name, Date_of_Admission) sont en double :")
    print(data[['Name', 'Date of Admission']].loc[duplicates])

Les combinaisons suivantes (Name, Date_of_Admission) sont en double :
                    Name Date of Admission
50000      KRISTa GiBsoN        2022-05-05
50001  BRookE mCcUlLough        2022-01-18
50002    DaNiEL THoMPSon        2023-12-14
50003      nichOLas hunT        2021-10-09
50004   MElIssa mArTInEz        2022-01-05
...                  ...               ...
55495  eLIZABeTH jaCkSOn        2020-08-16
55496         KYle pEREz        2020-01-23
55497       HEATher WaNG        2020-07-13
55498     JENniFER JOneS        2019-05-25
55499       jAMES GARCiA        2024-04-02

[5500 rows x 2 columns]


In [5]:

# Flag rows whose Name (exact, case-sensitive match) already appeared in an
# earlier row.
duplicates = data.duplicated(subset=['Name'])

if duplicates.any():
    print("Les combinaisons suivantes (Name) sont en double :")
    print(data[['Name']].loc[duplicates])

Les combinaisons suivantes (Name) sont en double :
                    Name
22017         TInA white
27042          LOrI coOK
40470          aMy smiTh
40524           ADAM OrR
40666         PAUL SMith
...                  ...
55495  eLIZABeTH jaCkSOn
55496         KYle pEREz
55497       HEATher WaNG
55498     JENniFER JOneS
55499       jAMES GARCiA

[5508 rows x 1 columns]


In [6]:
# keep=False marks every member of a repeated (Name, Date of Admission) group,
# not just the later occurrences, so both copies of each pair are selected.
duplicates = data.loc[
    data.duplicated(subset=['Name', 'Date of Admission'], keep=False)
]

# Order by name so the members of a group appear next to each other.
duplicates_sorted = duplicates.sort_values('Name')

print(duplicates_sorted)


                   Name  Age  Gender Blood Type Medical Condition  \
54393       AARon smITh   81    Male         A-            Cancer   
17083       AARon smITh   79    Male         A-            Cancer   
54112      AAron ArCHER   49  Female         B-            Cancer   
3040       AAron ArCHER   47  Female         B-            Cancer   
52111    ABIGAiL wateRS   35  Female         O+            Asthma   
...                 ...  ...     ...        ...               ...   
53940    zacHary fLOrEs   63    Male        AB-            Cancer   
15565     zachAry Brown   69  Female         A+           Obesity   
50503     zachAry Brown   72  Female         A+           Obesity   
50727  zacharY BauTista   46  Female        AB+            Cancer   
1902   zacharY BauTista   43  Female        AB+            Cancer   

      Date of Admission              Doctor                   Hospital  \
54393        2019-11-21         Gina Jacobs               Weber-Warren   
17083        2019-11-21

In [7]:
# Select the rows whose 'Name' or 'Date of Admission' is missing or blank.
# pandas.read_csv converts empty fields to NaN by default, so a plain == ''
# comparison never matches after loading; stripping first also catches
# whitespace-only values such as "   ", which are not converted to NaN.
# astype(str) keeps the check usable even if a column is not string-typed.
null_values = data[
    data['Name'].isna()
    | data['Date of Admission'].isna()
    | data['Name'].astype(str).str.strip().eq('')
    | data['Date of Admission'].astype(str).str.strip().eq('')
]
print(null_values)


Empty DataFrame
Columns: [Name, Age, Gender, Blood Type, Medical Condition, Date of Admission, Doctor, Hospital, Insurance Provider, Billing Amount, Room Number, Admission Type, Discharge Date, Medication, Test Results]
Index: []


In [8]:
# Extract the rows where 'Name' or 'Date of Admission' is NaN, empty, or made
# up only of whitespace. Values are stripped before the comparison because a
# plain == '' test misses strings such as "   ", and truly empty CSV fields
# are already loaded as NaN by pandas.read_csv's default settings.
null_or_empty_values = data[
    data['Name'].isna()
    | data['Name'].astype(str).str.strip().eq('')
    | data['Date of Admission'].isna()
    | data['Date of Admission'].astype(str).str.strip().eq('')
]

# Show the results, restricted to the two columns that were checked.
print(null_or_empty_values[['Name', 'Date of Admission']])


Empty DataFrame
Columns: [Name, Date of Admission]
Index: []


In [9]:
# Normalise the column labels in place: trim surrounding whitespace and
# replace spaces with underscores so the labels contain no spaces.
data.columns = data.columns.str.strip().str.replace(" ", "_", regex=False)

# Parse the two date columns (read from the CSV as text) into datetimes.
data['Date_of_Admission'] = data['Date_of_Admission'].pipe(pd.to_datetime)
data['Discharge_Date'] = data['Discharge_Date'].pipe(pd.to_datetime)

In [10]:
# Preview the first five rows to confirm the renamed columns and parsed dates.
data.head(n=5)

Unnamed: 0,Name,Age,Gender,Blood_Type,Medical_Condition,Date_of_Admission,Doctor,Hospital,Insurance_Provider,Billing_Amount,Room_Number,Admission_Type,Discharge_Date,Medication,Test_Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [11]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Name                55500 non-null  object        
 1   Age                 55500 non-null  int64         
 2   Gender              55500 non-null  object        
 3   Blood_Type          55500 non-null  object        
 4   Medical_Condition   55500 non-null  object        
 5   Date_of_Admission   55500 non-null  datetime64[ns]
 6   Doctor              55500 non-null  object        
 7   Hospital            55500 non-null  object        
 8   Insurance_Provider  55500 non-null  object        
 9   Billing_Amount      55500 non-null  float64       
 10  Room_Number         55500 non-null  int64         
 11  Admission_Type      55500 non-null  object        
 12  Discharge_Date      55500 non-null  datetime64[ns]
 13  Medication          55500 non-null  object    