In [1]:
import pandas as pd

In [3]:
# Read the raw healthcare records from the CSV file in the working directory
df = pd.read_csv("healthcare_dataset.csv")

In [5]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [7]:
# Count missing values per column (isna() is the canonical name for isnull())
df.isna().sum()

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

In [9]:
# Tally how many records carry each value of "Medical Condition"
df["Medical Condition"].value_counts()

Medical Condition
Arthritis       9308
Diabetes        9304
Hypertension    9245
Obesity         9231
Cancer          9227
Asthma          9185
Name: count, dtype: int64

In [11]:
#Clean the "Name" column
def clean_name(name):
    """Normalize a name to capitalized words separated by single spaces.

    Non-string input yields an empty string. Otherwise the value is
    lower-cased and split on whitespace; every word is capitalized, and the
    bare honorifics "mr", "mrs", "ms" and "dr" additionally receive a
    trailing period.
    """
    if not isinstance(name, str):
        return ""

    honorifics = ("mr", "mrs", "ms", "dr")
    # split() with no separator never yields empty tokens, so no filtering is needed
    words = name.strip().lower().split()
    return " ".join(
        word.capitalize() + "." if word in honorifics else word.capitalize()
        for word in words
    )

# Normalize every value of the "Name" column element-wise, then print the result
df["Name"] = df["Name"].map(clean_name)
print(df["Name"])

0            Bobby Jackson
1             Leslie Terry
2              Danny Smith
3             Andrew Watts
4            Adrienne Bell
               ...        
55495    Elizabeth Jackson
55496           Kyle Perez
55497         Heather Wang
55498       Jennifer Jones
55499         James Garcia
Name: Name, Length: 55500, dtype: object


In [13]:
# Restrict the frame to the patient-record columns, in a fixed order, before
# the natural-language text is generated from it.
required_cols = [
    "Name", "Age", "Gender", "Blood Type", "Medical Condition",
    "Date of Admission", "Doctor", "Hospital", "Insurance Provider",
    "Billing Amount", "Room Number", "Admission Type", "Discharge Date",
    "Medication", "Test Results"
]
# .copy() makes the selection an independent frame: later cells add columns to
# `df`, and assigning into a column-selected slice can otherwise trigger
# pandas' SettingWithCopyWarning.
df = df[required_cols].copy()


In [15]:
#Generate natural-language text for each row

def make_sentence(row):
    """Render one patient record as a short natural-language description.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys "Name", "Age", "Gender", "Medical Condition", "Medication"
            and "Test Results". The condition and test-result values are
            lower-cased, so they must be strings.

    Returns:
        The two-sentence description, or "" when the record cannot be
        rendered (missing key, non-string condition/test result, or a row
        that is not subscriptable). Callers that must not keep blank text
        should filter out empty strings afterwards.
    """
    try:
        return (
            f"Patient {row['Name']} is a {row['Age']}-year-old {row['Gender']} "
            f"with {row['Medical Condition'].lower()}. "
            f"The patient was prescribed {row['Medication']} and "
            f"discharged with {row['Test Results'].lower()} test results."
        )
    except (KeyError, AttributeError, TypeError):
        # Best-effort for malformed records only: KeyError = missing field,
        # AttributeError = value without .lower() (e.g. NaN), TypeError = row
        # that cannot be indexed. Any other exception signals a real bug and
        # is allowed to propagate instead of being hidden as an empty string.
        return ""

# Build the "text" column by rendering each row as a short description
df["text"] = df.apply(make_sentence, axis="columns")
df["text"].head(10)

0    Patient Bobby Jackson is a 30-year-old Male wi...
1    Patient Leslie Terry is a 62-year-old Male wit...
2    Patient Danny Smith is a 76-year-old Female wi...
3    Patient Andrew Watts is a 28-year-old Female w...
4    Patient Adrienne Bell is a 43-year-old Female ...
5    Patient Emily Johnson is a 36-year-old Male wi...
6    Patient Edward Edwards is a 21-year-old Female...
7    Patient Christina Martinez is a 20-year-old Fe...
8    Patient Jasmine Aguilar is a 82-year-old Male ...
9    Patient Christopher Berg is a 58-year-old Fema...
Name: text, dtype: object

In [17]:
# Label column: a copy of the "Medical Condition" column
df["label"] = df["Medical Condition"]
# Preview the first ten labels
df["label"].head(10)

0      Cancer
1     Obesity
2     Obesity
3    Diabetes
4      Cancer
5      Asthma
6    Diabetes
7      Cancer
8      Asthma
9      Cancer
Name: label, dtype: object

In [19]:
# Keep only the label/text pair, in that column order
df2 = df.loc[:, ["label", "text"]]
df2

Unnamed: 0,label,text
0,Cancer,Patient Bobby Jackson is a 30-year-old Male wi...
1,Obesity,Patient Leslie Terry is a 62-year-old Male wit...
2,Obesity,Patient Danny Smith is a 76-year-old Female wi...
3,Diabetes,Patient Andrew Watts is a 28-year-old Female w...
4,Cancer,Patient Adrienne Bell is a 43-year-old Female ...
...,...,...
55495,Asthma,Patient Elizabeth Jackson is a 42-year-old Fem...
55496,Obesity,Patient Kyle Perez is a 61-year-old Female wit...
55497,Hypertension,Patient Heather Wang is a 38-year-old Female w...
55498,Arthritis,Patient Jennifer Jones is a 43-year-old Male w...


In [21]:
# Draw a reproducible random subset of 1600 rows and renumber the index from 0
df2_final = (
    df2
    .sample(n=1600, random_state=42)
    .reset_index(drop=True)
)
# Label distribution of the sampled subset
df2_final["label"].value_counts()

label
Cancer          287
Obesity         274
Asthma          267
Arthritis       264
Diabetes        263
Hypertension    245
Name: count, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

# 80/10/10 train/validation/test split. Stratifying on "label" keeps the class
# proportions of every split close to those of the sampled data, which matters
# for the small validation and test sets; the fixed random_state keeps the
# split reproducible.
train, temp = train_test_split(
    df2_final, test_size=0.2, random_state=42, stratify=df2_final["label"]
)
valid, test = train_test_split(
    temp, test_size=0.5, random_state=42, stratify=temp["label"]
)

# Write each split to its own CSV file, omitting the index column
train.to_csv("healthcare_train.csv", index=False)
valid.to_csv("healthcare_valid.csv", index=False)
test.to_csv("healthcare_test.csv", index=False)
