In [None]:
import pandas as pd
import re


def clean_name(name):
    """Return a human-readable label with UMLS code prefixes stripped.

    For string input, every ``UMLS:<CODE>_`` prefix (code made of uppercase
    letters and digits) is removed, remaining underscores become spaces, and
    surrounding whitespace is trimmed. Any non-string value (e.g. ``NaN`` or
    ``None`` from an empty spreadsheet cell) is returned unchanged.
    """
    # Guard clause: leave missing / non-text cells untouched.
    if not isinstance(name, str):
        return name

    without_codes = re.sub(r'UMLS:[A-Z0-9]+_', '', name)
    return without_codes.replace('_', ' ').strip()


# Load the input data from an Excel file
input_file_path = "/content/dataset.xlsx"  # Replace this with your actual file path
raw_data = pd.read_excel(input_file_path)


def _parse_symptoms(cell):
    """Return the set of cleaned symptom names found in one 'Symptom' cell.

    The cell is split on ';' and each token is passed through clean_name.
    Non-string cells (e.g. NaN) and tokens that are empty after cleaning
    yield nothing, so no blank-named symptom column is ever created.
    """
    if not isinstance(cell, str):
        return set()
    cleaned_tokens = (clean_name(token.strip()) for token in cell.split(';'))
    return {token for token in cleaned_tokens if token}


# Parse every row's symptoms exactly once; the per-row sets are reused both to
# build the symptom vocabulary and to fill in the binary indicators.
row_symptom_sets = [_parse_symptoms(cell) for cell in raw_data['Symptom']]

# Unique symptoms across the whole sheet. Sorted so that the column order of
# the output is reproducible from run to run (plain set order is not).
all_symptoms = sorted(set().union(*row_symptom_sets))

# Build one output record per input row: the occurrence count plus a 0/1 flag
# for every known symptom. Membership is tested against the *cleaned* names of
# the row, which is what the column names in all_symptoms are.
cleaned_data = []
for frequency, present in zip(raw_data['Count of Disease Occurrence'], row_symptom_sets):
    disease_row = {'Frequency': frequency}
    disease_row.update({symptom: int(symptom in present) for symptom in all_symptoms})
    cleaned_data.append(disease_row)

# Convert cleaned data to a DataFrame. Passing the columns explicitly keeps
# the expected schema even when the input sheet has no rows.
cleaned_df = pd.DataFrame(cleaned_data, columns=['Frequency'] + all_symptoms)

# Clean and map disease names to a readable format for the target variable.
# NOTE(review): rows whose 'Disease' / 'Count of Disease Occurrence' cells are
# empty in the spreadsheet stay NaN here. If those blanks come from merged
# cells (one disease spanning several symptom rows), a forward fill of both
# columns is probably wanted - confirm against the source sheet.
cleaned_df['Disease'] = raw_data['Disease'].apply(clean_name)

# Rearrange DataFrame columns so that 'Disease' and 'Frequency' are first
columns = ['Disease', 'Frequency'] + all_symptoms
cleaned_df = cleaned_df[columns]

# Output the cleaned DataFrame
print(cleaned_df.head())

# Optionally save the cleaned data to CSV
output_file_path = "cleaned_disease_data.csv"
cleaned_df.to_csv(output_file_path, index=False)
print(f"Cleaned data saved to {output_file_path}")


                Disease  Frequency     hematochezia  choke  spasm  \
0  hypertensive disease     3363.0  0             0      0      0   
1                   NaN        NaN  0             0      0      0   
2                   NaN        NaN  0             0      0      0   
3                   NaN        NaN  0             0      0      0   
4                   NaN        NaN  0             0      0      0   

   lightheadedness  hypesthesia  pin-point pupils  tachypnea  ...  \
0                0            0                 0          0  ...   
1                0            0                 0          0  ...   
2                0            0                 0          0  ...   
3                0            0                 0          0  ...   
4                0            0                 0          0  ...   

   breakthrough pain  welt  gravida 0  malaise  rhonchus  pulsus paradoxus  \
0                  0     0          0        0         0                 0   
1             