In [1]:
# Mount Google Drive at /content/drive; the CSV paths read and written by the
# cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Read the filtered training split from Drive.
df = pd.read_csv(
    "/content/drive/MyDrive/P2/T1/Dataset/overlapped/ML_o_train_filtered.csv"
)

# Distinct values found in the "disease_label" column.
# NOTE(review): the saved output of this cell lists 'malignant neoplasms ' with
# a trailing space - check whether labels need whitespace cleanup upstream.
unique_labels = df["disease_label"].unique()

# Show the labels themselves, followed by how many there are.
print(unique_labels)

print(len(unique_labels))

['osteomyelitis' 'infection' 'schizophrenia' 'dependence'
 'hypertensive disease' 'peripheral vascular disease' 'hyperlipidemia'
 'obesity morbid' 'stenosis aortic valve' 'coronary arteriosclerosis'
 'failure heart' 'benign prostatic hypertrophy' 'hepatitis' 'thrombus'
 'neutropenia' 'sickle cell anemia' 'biliary calculus' 'pancreatitis'
 'hyperglycemia' 'malignant neoplasms ' 'gout' 'infection urinary tract'
 'arthritis' 'kidney failure acute' 'colitis'
 'tricuspid valve insufficiency' 'adhesion' 'carcinoma breast' 'glaucoma'
 'effusion pericardial' 'deep vein thrombosis' 'tachycardia sinus'
 'pneumothorax' 'chronic obstructive airway disease' 'asthma'
 'adenocarcinoma' 'paroxysmal dyspnea' 'gastroenteritis' 'gastritis'
 'pyelonephritis' 'melanoma' 'cirrhosis' 'epilepsy' 'overload fluid'
 'delirium' 'obesity' 'manic disorder' 'affect labile']
48


# Keep only the symptoms whose value is 1 and ignore those whose value is 0 (use only this version)

In [3]:
# training_LLM
import pandas as pd

# Source CSV for the training split.
file_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/ML_o_train_filtered.csv"
data = pd.read_csv(file_path)

# Column roles are chosen by position, not by name:
#   label_column    -> third column counted from the end
#   feature_columns -> every column except the final one
# NOTE(review): with the label at position -3, feature_columns also contains
# the label column and the column right after it; those only reach the
# generated text when a cell equals 1. Presumably columns[-3] is
# 'disease_label' - confirm against the CSV header.
label_column = data.columns[-3]
feature_columns = data.columns[:-1]

# Dummy-row detection
def is_dummy_row(row):
    """Return True when the row's 'disease_label', rendered as text, begins with "Dummy".

    `row` only needs to support `row['disease_label']`; the value is passed
    through `str()` first, so non-string labels are compared by their text form.
    """
    label_text = str(row['disease_label'])
    return label_text.startswith("Dummy")

# Row -> sentence conversion for the training split
def convert_row_to_text(row):
    """Render one row as a sentence listing the columns whose value equals 1.

    Rows flagged by `is_dummy_row` short-circuit to a fixed
    "Dummy symptoms for <label>" string. Otherwise every name in the
    module-level `feature_columns` is checked against the row; names whose
    cell equals 1 are emitted as "have <name>" with underscores turned into
    spaces, joined by "; ". A row with no such column yields
    "Patient shows no symptoms.".
    """
    if is_dummy_row(row):
        return f"Dummy symptoms for {row['disease_label']}"

    # Keep only the columns set to 1; everything else is left out of the text.
    present = [
        f"have {name.replace('_', ' ')}"
        for name in feature_columns
        if row[name] == 1
    ]

    if not present:
        return "Patient shows no symptoms."

    joined = '; '.join(present)
    return f"Patient shows symptoms as follows: {joined}."

# Destination for the text-based training split.
output_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_train.csv"

# Build the sentence for every row first, while 'disease_label' still holds the
# values as loaded (convert_row_to_text reads it for dummy rows), then store
# the label column's values as strings under 'disease_label'.
data['symptoms'] = data.apply(convert_row_to_text, axis='columns')
data['disease_label'] = data[label_column].astype(str)

# Write just the two text columns, without the index.
data.to_csv(output_path, columns=['symptoms', 'disease_label'], index=False)

print(f"Text-based dataset saved to {output_path}")


Text-based dataset saved to /content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_train.csv


In [4]:
# validation_LLM
import pandas as pd

# Source CSV for the validation split.
file_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/ML_o_val_filtered.csv"
data = pd.read_csv(file_path)

# Column roles are chosen by position, not by name:
#   label_column    -> third column counted from the end
#   feature_columns -> every column except the final one
# NOTE(review): with the label at position -3, feature_columns also contains
# the label column and the column right after it; those only reach the
# generated text when a cell equals 1. Presumably columns[-3] is
# 'disease_label' - confirm against the CSV header.
label_column = data.columns[-3]
feature_columns = data.columns[:-1]

# Row -> sentence conversion for the validation split
def convert_row_to_text(row):
    """Render one row as a sentence listing the columns whose value equals 1.

    Every name in the module-level `feature_columns` is checked against the
    row; names whose cell equals 1 are emitted as "have <name>" with
    underscores turned into spaces, joined by "; ". A row with no such column
    yields "Patient shows no symptoms.".
    """
    # Keep only the columns set to 1; everything else is left out of the text.
    present = [
        f"have {name.replace('_', ' ')}"
        for name in feature_columns
        if row[name] == 1
    ]

    if not present:
        return "Patient shows no symptoms."

    joined = '; '.join(present)
    return f"Patient shows symptoms as follows: {joined}."

# Destination for the text-based validation split.
output_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_val.csv"

# One sentence per row, then the label column's values stored as strings under
# 'disease_label'.
data['symptoms'] = data.apply(convert_row_to_text, axis='columns')
data['disease_label'] = data[label_column].astype(str)

# Write just the two text columns, without the index.
data.to_csv(output_path, columns=['symptoms', 'disease_label'], index=False)

print(f"Text-based dataset saved to {output_path}")


Text-based dataset saved to /content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_val.csv


In [5]:
# testing_LLM
import pandas as pd

# Source CSV for the test split.
file_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/ML_o_test_filtered.csv"
data = pd.read_csv(file_path)

# Column roles are chosen by position, not by name:
#   label_column    -> third column counted from the end
#   feature_columns -> every column except the final one
# NOTE(review): with the label at position -3, feature_columns also contains
# the label column and the column right after it; those only reach the
# generated text when a cell equals 1. Presumably columns[-3] is
# 'disease_label' - confirm against the CSV header.
label_column = data.columns[-3]
feature_columns = data.columns[:-1]

# Row -> sentence conversion for the test split
def convert_row_to_text(row):
    """Render one row as a sentence listing the columns whose value equals 1.

    Every name in the module-level `feature_columns` is checked against the
    row; names whose cell equals 1 are emitted as "have <name>" with
    underscores turned into spaces, joined by "; ". A row with no such column
    yields "Patient shows no symptoms.".
    """
    # Keep only the columns set to 1; everything else is left out of the text.
    present = [
        f"have {name.replace('_', ' ')}"
        for name in feature_columns
        if row[name] == 1
    ]

    if not present:
        return "Patient shows no symptoms."

    joined = '; '.join(present)
    return f"Patient shows symptoms as follows: {joined}."

# Destination for the text-based test split.
output_path = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_test.csv"

# One sentence per row, then the label column's values stored as strings under
# 'disease_label'.
data['symptoms'] = data.apply(convert_row_to_text, axis='columns')
data['disease_label'] = data[label_column].astype(str)

# Write just the two text columns, without the index.
data.to_csv(output_path, columns=['symptoms', 'disease_label'], index=False)

print(f"Text-based dataset saved to {output_path}")


Text-based dataset saved to /content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_test.csv
