In [235]:
# Standard Libraries
import pandas as pd
from datetime import datetime
import warnings
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

# Data Resampling
from sklearn.utils import resample

# Suppress warnings to avoid clutter in the output
warnings.filterwarnings('ignore')

# OpenAI API (if required)
import openai

In [236]:
# Mount Google Drive (only needed in Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [237]:
# Load datasets into a pandas DataFrame,
patients = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AIH/LLM/patients.csv")
conditions = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AIH/LLM/conditions.csv")
observations = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AIH/LLM/observations.csv")

In [238]:
patients.head(1)

Unnamed: 0,patient,birthdate,deathdate,ssn,drivers,passport,prefix,first,last,suffix,maiden,marital,race,ethnicity,gender,birthplace,address
0,4ee2c837-e60f-4c54-9fdf-8686bc70760b,1929-04-08,2029-11-11,999-78-5976,,,,Rosamaria,Pfannerstill,,,,black,dominican,F,Pittsfield MA US,18797 Karson Burgs Suite 444 Palmer Town MA 01...


In [239]:
conditions.head(1)

Unnamed: 0,start,stop,patient,encounter,code,description
0,2009-01-08,2009-01-21,71949668-1c2e-43ae-ab0a-64654608defb,4d451e22-a354-40c9-8b33-b6126158666d,10509002,Acute bronchitis (disorder)


In [240]:
observations.head(1)

Unnamed: 0,date,patient,encounter,code,description,value,units
0,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,8302-2,Body Height,166.03,cm


In [241]:
"""
Data Preprocessing
"""

# Convert birthdate to age
patients['birthdate'] = pd.to_datetime(patients['birthdate'])
patients['age'] = (datetime.today() - patients['birthdate']).dt.days // 365  # Calculate age in years based on today's date

# Extract diabetes diagnosis from conditions.csv
conditions['Diabetes'] = conditions['description'] == "Diabetes"
diabetes_patients = conditions[conditions['Diabetes']]['patient'].unique()

# Label patients with diabetes
patients['Diabetes'] = patients['patient'].isin(diabetes_patients).astype(int)

# Convert 'value' to numeric, forcing errors to NaN (to handle any non-numeric values)
observations['value'] = pd.to_numeric(observations['value'], errors='coerce')

# Extract BMI, glucose, and hypertension from observations.csv
# For BMI, filter observations that contain "Body Mass Index" and compute the mean BMI per patient
bmi_df = observations[observations['description'].str.contains("Body Mass Index", na=False)]\
    .groupby('patient')['value'].mean().reset_index()
bmi_df.rename(columns={'value': 'BMI'}, inplace=True)  # Rename the 'value' column to 'BMI'

# For Glucose, filter observations that contain "Glucose" and compute the mean glucose level per patient
glucose_df = observations[observations['description'].str.contains("Glucose", na=False)]\
    .groupby('patient')['value'].mean().reset_index()
glucose_df.rename(columns={'value': 'Glucose_Level'}, inplace=True)  # Rename the 'value' column to 'Glucose_Level'

# For Hypertension, filter conditions that contain "Hypertension", then mark patients with Hypertension
hypertension_df = conditions[conditions['description'].str.contains("Hypertension", na=False)]\
    .groupby('patient').size().reset_index(name='Hypertension')
hypertension_df['Hypertension'] = 1  # If patient has hypertension, mark as 1

# Merge dataframes: Merge BMI, Glucose, and Hypertension data with the main patient dataframe
df = patients.merge(bmi_df, on='patient', how='left')  # Merge BMI data
df = df.merge(glucose_df, on='patient', how='left')  # Merge Glucose data
df = df.merge(hypertension_df, on='patient', how='left')  # Merge Hypertension data

# Rename columns for better readability
df.rename(columns={'age': 'Age'}, inplace=True)
df.rename(columns={'gender': 'Gender'}, inplace=True)

# Filter required columns
# Keep only necessary columns in the DataFrame
df = df[['Age', 'Gender', 'BMI', 'Glucose_Level', 'Hypertension', 'Diabetes']]

# Convert categorical variables
# Map 'M' to 1 and 'F' to 0 in the Gender column
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})

# Fill missing values with the mean of the respective columns
# This will replace NaNs in numeric columns with the column's mean value
df.fillna(df.mean(), inplace=True)

# Display the length of the resulting DataFrame
print(len(df))

# Show the first few rows of the resulting DataFrame
print(df.head())


1462
   Age  Gender        BMI  Glucose_Level  Hypertension  Diabetes
0   96       0  28.188516      88.271360           1.0         0
1    8       0  28.188516      88.271360           1.0         0
2   81       1  30.395000      84.400000           1.0         0
3   70       0  38.984444      76.777778           1.0         0
4   90       1  23.060000      87.000000           1.0         0


In [242]:
# Diabetes patient counts
print(df['Diabetes'].value_counts())

Diabetes
0    1365
1      97
Name: count, dtype: int64


In [243]:
# Data is inbalance so lets downsample data

# Separate the majority and minority classes based on the 'Diabetes' column
df_majority = df[df['Diabetes'] == 0]
df_minority = df[df['Diabetes'] == 1]

# Downsample the majority class to match the minority class size
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),  # Match minority class size
                                   random_state=42)

# Combine the downsampled majority class with the minority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Display the new class distribution
print(f'New class distribution:\n{df_balanced["Diabetes"].value_counts()}')

New class distribution:
Diabetes
0    97
1    97
Name: count, dtype: int64


In [244]:
# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the shuffled balanced dataset
print(df_balanced.head())

   Age  Gender        BMI  Glucose_Level  Hypertension  Diabetes
0   87       1  28.188516      88.271360           1.0         1
1   45       0  31.036000      88.271360           1.0         0
2   54       0  32.925000     118.666667           1.0         1
3   72       0  42.784000      88.271360           1.0         0
4   98       1  23.506667      77.666667           1.0         0


In [245]:
# Train Test split

X = df_balanced.drop(columns=["Diabetes"])
y = df_balanced["Diabetes"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting datasets
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (155, 5)
X_test shape: (39, 5)
y_train shape: (155,)
y_test shape: (39,)


In [246]:
openClient = openai.OpenAI(api_key="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")


In [247]:
# One shot prompt
messages = [
    {"role": "system", "content": "You are a doctor or an expert trained to predict Diabetes based on patient data. Respond with 'Yes' or 'No' only."},
    {"role": "user", "content": "Predict diabetes risk for a patient with age: 60, gender: Male, BMI: 28, glucose: 100, hypertension: Yes."}
]

response = openClient.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content)

Yes


In [248]:
messages = [
    {"role": "system", "content": "You are a doctor or an expert trained to predict Diabetes risk based on patient data. Respond with 'Yes' or 'No' only, and explain your reasoning."},
    {"role": "user", "content": """Here are a few patient cases. Predict if the patient has diabetes and explain your reasoning:

    Case 1: Age: 60, Gender: Male, BMI: 35, Glucose: 180, Hypertension: Yes → Diabetes: Yes

    Case 2: Age: 45, Gender: Female, BMI: 24, Glucose: 90, Hypertension: No → Diabetes: No

    Case 3: Age: 50, Gender: Male, BMI: 31, Glucose: 160, Hypertension: Yes → Diabetes: Yes

    Predict diabetes risk for a new patient: Age: 60, Gender: Male, BMI: 28, Glucose: 100, Hypertension: Yes. Prediction: Diabetes: """}
]

response = openClient.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content)


No

Reasoning: The patient is 60 years old and male, which are risk factors for diabetes, and has hypertension, which is also a contributing factor. However, the BMI is 28, which is on the lower end of overweight but not in the obese range, and the glucose level is 100 mg/dL, which is within the normal range for fasting glucose (less than 100 mg/dL is considered normal). Therefore, despite the presence of some risk factors, this patient's glucose level does not indicate diabetes.


In [249]:
#Chain of Thought
messages = [
    {"role": "system", "content": "You are a doctor or an expert trained to predict Diabetes risk based on patient data. Respond with 'Yes' or 'No' only, and provide a detailed explanation for the reasoning behind the prediction."},
    {"role": "user", "content": """Here is a patient case. Predict diabetes risk and explain your reasoning:

    Age: 55, Gender: Male, BMI: 33, Glucose: 170, Hypertension: Yes."""},
    {"role": "assistant", "content": """Based on the provided data:

    - Age: 55 is above 45, which increases the risk of diabetes.
    - BMI: 33 is in the obese range (greater than 30), which is a significant risk factor.
    - Glucose level of 170 mg/dL is above 160, suggesting insulin resistance or diabetes.

    Therefore, the prediction is: Diabetes: Yes."""}
]

response = openClient.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content)


Yes. 

Reasoning: The patient's age of 55 increases the risk for diabetes as the risk generally escalates with age, particularly for individuals over 45. The Body Mass Index (BMI) of 33 categorizes him as obese, which is a well-established risk factor for type 2 diabetes. The glucose level of 170 mg/dL indicates elevated blood sugar, which is above the normal fasting range and suggests either impaired glucose tolerance or diabetes. Additionally, the presence of hypertension further compounds his risk, as it often correlates with metabolic syndrome, which includes insulin resistance and an increased likelihood of diabetes. Given these factors, the patient is at high risk for developing diabetes.


In [250]:
# Decision Tree Approach
messages = [
    {"role": "system", "content": "You are a doctor or an expert trained to use a decision tree for predicting Diabetes based on patient data."},
    {"role": "user", "content": """Use a decision tree to predict diabetes risk for the following patient:

    Case: Age: 60, Gender: Male, BMI: 28, Glucose: 100, Hypertension: Yes.

    Here is a decision tree:
    1. Age >= 45? → Yes → Proceed to step 2.
    2. BMI >= 30? → Yes → Proceed to step 3.
    3. Glucose >= 160? → Yes → Diabetes: Yes.
    4. Glucose < 160? → No → Diabetes: No.

    Please use this tree to make the prediction."""},
    {"role": "assistant", "content": """Let's go through the decision tree:

    - **Step 1**: The patient is 60 years old, which is >= 45. Proceed to step 2.
    - **Step 2**: The patient's BMI is 28, which is less than 30. Skip to step 4.
    - **Step 4**: The patient's glucose level is 100, which is less than 160.

    Therefore, according to the decision tree, the prediction is: Diabetes: No."""}
]

response = openClient.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content)

Based on the provided decision tree and the patient's data, let's go through the steps again for clarity:

1. **Age >= 45?**  
   - The patient is 60 years old. This is **Yes**. Proceed to Step 2.

2. **BMI >= 30?**  
   - The patient has a BMI of 28. This is **No**. We skip Step 3.

3. **Proceed to Step 4** since the previous answer is No. 

4. **Glucose < 160?**  
   - The patient's glucose level is 100. This is **Yes**.

Given that the patient does not meet the criteria for diabetes as indicated in the tree:

**Final Prediction: Diabetes: No.**


In [251]:
from torch.utils.data import Dataset

class DiabetesDataset(Dataset):
    def __init__(self, df):
        """
        Initializes the dataset with a DataFrame containing patient information.
        """
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        Returns a formatted string for diabetes prediction.
        """
        column_names = [
            ("Age", "The patient's age is "),
            ("Gender", ". Gender is "),
            ("BMI", ". BMI is "),
            ("Glucose_Level", ". Glucose level is "),
            ("Hypertension", ". Hypertension status is ")
        ]

        # Convert row data into text description
        x_strs = [f"{col_desc}{self.df.iloc[index][col]}" for col, col_desc in column_names]
        x_str = ''.join(x_strs)

        # Replace numerical values with meaningful labels
        x_str = x_str.replace("Gender is 1", "Gender is Male")
        x_str = x_str.replace("Gender is 0", "Gender is Female")
        x_str = x_str.replace("Hypertension status is 1", "Hypertension status is Yes")
        x_str = x_str.replace("Hypertension status is 0", "Hypertension status is No")

        # Instruction for LLM
        x_str = ("Decide in a single word if the patient has diabetes based on the given details. "
                 "If the diabetes label is 1, the answer is 'Yes'. If 0, the answer is 'No'. " + x_str)
        x_str += ". Please decide whether the patient has diabetes: Yes or No."

        return x_str


In [252]:
test_ds = DiabetesDataset(X_test)

In [253]:
from tqdm import tqdm
import time

results = []
for prompt in tqdm(test_ds):
  response = openClient.chat.completions.create(
    model="gpt-4o-mini",  # Make sure this model is available
    messages=[
          {"role": "user", "content": prompt},
      ]
  )
  results.append(response.choices[0].message.content)
  time.sleep(3)

100%|██████████| 39/39 [02:12<00:00,  3.40s/it]


In [254]:
results

['Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No']

In [259]:
# Evaluating Diabetes Risk Prediction with OpenAPI: AUROC and AUPRC Metrics

# Convert predictions to binary (1 for "Yes", 0 for "No")
results_binary = list(map(lambda x: 1 if x == "Yes" else 0, results))

test_labels = y_test
test_pred = results_binary

auroc = roc_auc_score(test_labels, test_pred)
auprc = average_precision_score(test_labels, test_pred)
print('\nAUROC:', auroc, '\nAUPRC', auprc)


AUROC: 0.5912698412698413 
AUPRC 0.5155982905982907


In [260]:
def generate_embeddings(texts, model="text-embedding-ada-002"):
    embeddings = []

    for text in tqdm(texts):
        text = text.replace("\n", " ")  # Remove newline characters
        response = openClient.embeddings.create(
            input=[text], model=model
        )

        embeddings.append(response.data[0].embedding)

    return np.array(embeddings)

In [261]:
# Using OpenAPI Embedding and Logistic Regression to Classify Diabetes Risk: Model Evaluation with AUROC and AUPRC

# Train and test the performance
train_ds = DiabetesDataset(X_train)  # Example of a dataset, adjust as needed
embeddings = generate_embeddings(train_ds)  # Generate embeddings using OpenAPI

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(embeddings, y_train)  # Fit the model to the training data

# Generate embeddings for the test dataset
test_embeddings = generate_embeddings(test_ds)  # Embeddings for the test dataset
test_labels = y_test  # Actual labels for the test set

# Predict probabilities for the positive class (diabetes risk = 1)
test_pred = model.predict_proba(test_embeddings)[:, 1]

# Evaluate the model using AUROC and AUPRC
auroc = roc_auc_score(test_labels, test_pred)
auprc = average_precision_score(test_labels, test_pred)

# Output the results
print('\nAUROC:', auroc, '\nAUPRC', auprc)

100%|██████████| 155/155 [00:56<00:00,  2.72it/s]
100%|██████████| 39/39 [00:10<00:00,  3.76it/s]


AUROC: 0.7566137566137566 
AUPRC 0.7261890379763187



