In [1]:
import pandas as pd
import random

In [3]:
# Configuration for generating the synthetic data used to train our model.
# Fixed seed so the sequence of random draws (and therefore the generated notes) is repeatable.
random.seed(40)
# Symptoms that may be sampled into a patient note.
# Entries are inserted verbatim into the note text, so they must not carry
# stray whitespace or misspellings ("loss of appetite " and "vomitting" were fixed).
symptoms=["chest pain","high blood pressure","shortness of breath","dizziness","fatigue","persistent cough",
          "fever","weight loss","irregular heartbeat","headache","nausea", "back pain","swelling in legs",
          "loss of appetite","night sweats","blurred vision","cold hands and feet","abdominal pain","vomiting","joint pain",
          "skin rash","sleep disturbance","anxiety","depression","confusion","low oxygen levels",
          ]
# Risk factors that may be sampled into a patient note.
risk_factors= [
    "smoker", "diabetic", "family history of heart disease",
    "obese", "alcohol use", "high cholesterol", "sedentary lifestyle",
    "chronic kidney disease", "asthma", "previous stroke",
    "genetic predisposition", "chronic stress", "unhealthy diet",
    "lack of exercise", "drug use", "autoimmune disorder"
]

# Function that generates one synthetic doctor's note together with its risk label and risk score

def generate_note():
    """Generate one synthetic doctor's note with a rule-based risk label and score.

    Draws a random age (20-90), a gender, 2-5 distinct entries from the
    module-level ``symptoms`` list and 0-3 distinct entries from the
    module-level ``risk_factors`` list, renders them as a single sentence and
    scores the case with fixed additive weights.

    Returns:
        tuple[str, int, int]: ``(note, label, score)``. ``score`` is the
        additive risk score capped at 100; ``label`` is 1 when
        ``score >= 50`` and 0 otherwise.
    """
    # The order of these draws determines what a given seed produces, so keep it stable.
    age = random.randint(20, 90)
    gender = random.choice(["Male", "Female"])
    picked_symptoms = random.sample(symptoms, random.randint(2, 5))
    picked_risk_factor = random.sample(risk_factors, random.randint(0, 3))

    note = (
        f"Patient is a {age}-year old {gender} presenting with "
        f"{', '.join(picked_symptoms)}"
    )
    if picked_risk_factor:
        note += f" with {', '.join(picked_risk_factor)}."
    else:
        # Terminate the sentence even when no risk factor was drawn.
        note += "."

    # Additive weights used to derive the risk score and label.
    score = 0
    if "chest pain" in picked_symptoms or "shortness of breath" in picked_symptoms:
        score += 40
    if "high blood pressure" in picked_symptoms or "high cholesterol" in picked_risk_factor:
        score += 10
    major_risk_factors = ("smoker", "diabetic", "family history of heart disease", "previous stroke")
    if any(factor in picked_risk_factor for factor in major_risk_factors):
        score += 20
    # At most 3 risk factors are sampled above, so a `> 3` test could never be
    # true; the bonus for an accumulation of risk factors applies at 3.
    if len(picked_risk_factor) >= 3:
        score += 20

    # Cap the score at 100.
    score = min(score, 100)
    label = 1 if score >= 50 else 0

    return note, label, score

In [4]:
# Function that generates the full dataset and saves it as a CSV file
def generate_dataset(num_patients,filename="doctor_notes.csv"):
    """Build a table of synthetic doctor notes and save it as CSV.

    Args:
        num_patients: Number of notes (rows) to generate.
        filename: Path of the CSV file to write; the index is not written.

    Returns:
        pandas.DataFrame: One row per call to ``generate_note`` with the
        columns ``doctor_note``, ``risk_label`` and ``risk_score``.
    """
    column_names = ["doctor_note", "risk_label", "risk_score"]

    # Collect all (note, label, score) records first, then build the frame once.
    records = []
    for _ in range(num_patients):
        records.append(generate_note())

    notes_frame = pd.DataFrame(records, columns=column_names)
    notes_frame.to_csv(filename, index=False)
    return notes_frame

# Entry point: generate 5000 synthetic notes and print the first five rows.
if __name__ == "__main__":
  df = generate_dataset(num_patients=5000)
  print(df.head(n=5))


                                         doctor_note  risk_label  risk_score
0  Patient is a 78-year old Male presenting with ...           0           0
1  Patient is a 64-year old Female presenting wit...           1          50
2  Patient is a 26-year old Male presenting with ...           0           0
3  Patient is a 72-year old Male presenting with ...           0          20
4  Patient is a 40-year old Female presenting wit...           0           0


In [4]:
# Only the last expression of a cell is echoed, so the 20-row preview has to be
# printed explicitly; as a bare expression it was computed and then discarded.
print(df.head(20))
df.shape

(5000, 3)