In [1]:
# !pip install tensorflow

In [2]:
# !pip install xgboost

In [3]:
# !pip install numpy==1.24.3

In [4]:
# Import Libraries and Dependencies
import pandas as pd
import numpy as np
import re
# import spacy
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns  
from sklearn.model_selection import train_test_split    # To split the dataset into train and test sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder   # For scaling and encoding categorical variables
from sklearn.compose import ColumnTransformer           # To apply transformers to specific columns
from sklearn.impute import SimpleImputer                # For handling missing data
from sklearn.ensemble import RandomForestClassifier     # Random Forest Classifier
from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.svm import SVC                            # Support Vector Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score # For evaluating the model
import joblib
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model


In [5]:
!pip show tensorflow

Name: tensorflow
Version: 2.18.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, requests, setuptools, six, tensorboard, termcolor, typing-extensions, wrapt
Required-by: 


In [6]:
# Read in the CSV file
disease_df = pd.read_csv('Resources/disease_symptoms.csv')

In [7]:
# Rename Disease column to Prognosis
disease_df.rename(columns={'Disease': 'Prognosis'}, inplace=True)

In [8]:

# Expand Symptoms into separate columns
disease_df_expanded = disease_df["Symptoms"].str.split(", ", expand=True)

In [9]:

# Rename columns dynamically
disease_df_expanded.columns = [f"Symptom_{i+1}" for i in range(disease_df_expanded.shape[1])]

In [10]:

# Concatenate with Prognosis column
disease_df_final = pd.concat([disease_df["Prognosis"], disease_df_expanded], axis=1)

In [11]:

# Reshape from wide to long format
disease_df_long = disease_df_final.melt(id_vars="Prognosis", value_name="Symptom").drop(columns=["variable"])

In [12]:
# Drop NaN values before encoding**
disease_df_long = disease_df_long.dropna(subset=["Symptom"])

In [13]:
# **Fix: Trim whitespace from symptoms**
disease_df_long["Symptom"] = disease_df_long["Symptom"].str.strip()

In [14]:
# One-hot encode symptoms properly
disease_df_encoded = pd.get_dummies(disease_df_long, columns=["Symptom"])

In [15]:
# **Fix: Ensure index is reset**
disease_df_encoded = disease_df_encoded.reset_index(drop=True)

In [16]:
# sort alphabetically by Prognosis
disease_df_encoded = disease_df_encoded.sort_values("Prognosis")

In [17]:
# disease_df_encoded.set_index('Prognosis', inplace=True)


In [18]:
#reset the index 
disease_df_encoded = disease_df_encoded.reset_index(drop=True)
#Display the result
print(disease_df_encoded.shape)
disease_df_encoded.head(50)

(511, 263)


Unnamed: 0,Prognosis,Symptom_abdominal cramps,Symptom_abdominal pain,Symptom_abdominal swelling,Symptom_abnormal fatigue,Symptom_abnormal fluid accumulation in the abdomen,Symptom_absence of bowel movements for days,Symptom_absence of menstruation,Symptom_absent speech,Symptom_acid reflux,...,Symptom_waking up too early,Symptom_walking difficulty,Symptom_weakness,Symptom_weight gain,Symptom_weight loss,Symptom_wheezing,Symptom_white patches on the skin,Symptom_white pupil,Symptom_widespread itching,Symptom_yellowing of the skin and eyes
0,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,Alopecia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# remove Symptom_ in the column names
disease_df_encoded.columns = disease_df_encoded.columns.str.replace('Symptom_', '')

# Display the result
disease_df_encoded.head(50)

Unnamed: 0,Prognosis,abdominal cramps,abdominal pain,abdominal swelling,abnormal fatigue,abnormal fluid accumulation in the abdomen,absence of bowel movements for days,absence of menstruation,absent speech,acid reflux,...,waking up too early,walking difficulty,weakness,weight gain,weight loss,wheezing,white patches on the skin,white pupil,widespread itching,yellowing of the skin and eyes
0,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Acetonemia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,Acute Bronchitis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,Alopecia,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
disease_df_encoded.shape

(511, 263)

In [21]:
# read in prognosis csv file
prognosis_df = pd.read_csv('Resources/symbipredict_2022.csv')

# Display the result
prognosis_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [22]:
# Prognosis column to first column in the dataset
prognosis_df = prognosis_df[["prognosis"] + [col for col in prognosis_df.columns if col != "prognosis"]]
#sort the dataset alphabetically by prognosis
prognosis_df = prognosis_df.sort_values("prognosis")
#reset the index
# prognosis_df = prognosis_df.reset_index(drop=True)
prognosis_df.tail()

Unnamed: 0,prognosis,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
2537,Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3316,Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3890,Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2417,Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364,Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# prognosis_df.set_index('prognosis', inplace=True)


In [24]:
prognosis_df.head()

Unnamed: 0,prognosis,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
2113,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
885,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2712,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# rename prognosis column to Prognosis
prognosis_df.rename(columns={'prognosis': 'Prognosis'}, inplace=True)
# reset the index
prognosis_df = prognosis_df.reset_index(drop=True)

In [26]:
prognosis_df.head()

Unnamed: 0,Prognosis,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
prognosis_df.shape

(4961, 133)

In [28]:
prognosis_df.describe()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
count,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,...,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0,4961.0
mean,0.137875,0.159847,0.021971,0.045152,0.021971,0.162266,0.139085,0.045152,0.045152,0.021971,...,0.021971,0.021971,0.021971,0.023181,0.023181,0.023181,0.023181,0.023181,0.023181,0.023181
std,0.344804,0.366501,0.146605,0.207659,0.146605,0.368732,0.34607,0.207659,0.207659,0.146605,...,0.146605,0.146605,0.146605,0.150493,0.150493,0.150493,0.150493,0.150493,0.150493,0.150493
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
prognosis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4961 entries, 0 to 4960
Columns: 133 entries, Prognosis to yellow_crust_ooze
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


In [30]:
prognosis_df.dtypes

Prognosis               object
itching                  int64
skin_rash                int64
nodal_skin_eruptions     int64
continuous_sneezing      int64
                         ...  
small_dents_in_nails     int64
inflammatory_nails       int64
blister                  int64
red_sore_around_nose     int64
yellow_crust_ooze        int64
Length: 133, dtype: object

In [31]:
# read in prognosis csv file
final_train_df = pd.read_csv('Resources/FInal_Train_Data.csv')

# Display the result
final_train_df.head()

Unnamed: 0.1,Unnamed: 0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,...,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts,prognosis
0,0,0,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,hypertensive disease
1,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,diabetes
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"depression mental , depressive disorder"
3,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"coronary arteriosclerosis ,coronary heart disease"
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pneumonia


In [32]:
final_train_df.shape

(2564, 402)

In [33]:
#drop 'Unnamed: 0' column from the final_train_df dataframe
final_train_df.drop('Unnamed: 0', axis=1, inplace=True)

In [34]:
# rename prognosis column to Prognosis
final_train_df.rename(columns={'prognosis': 'Prognosis'}, inplace=True)

# make prognosis column the first column in the dataset
final_train_df = final_train_df[["Prognosis"] + [col for col in final_train_df.columns if col != "Prognosis"]]

In [35]:
final_train_df.sort_values('Prognosis', inplace=True)

In [36]:
# reset the index
final_train_df = final_train_df.reset_index(drop=True)

In [37]:
#display the first 5 rows of the dataframe
final_train_df.head()

Unnamed: 0,Prognosis,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
final_train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2564 entries, 0 to 2563
Columns: 401 entries, Prognosis to homicidal thoughts
dtypes: int64(400), object(1)
memory usage: 7.8+ MB


In [39]:
final_train_df.describe()

Unnamed: 0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
count,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,...,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0
mean,0.064353,0.124415,0.022621,0.074103,0.026911,0.050312,0.015991,0.040952,0.035101,0.060062,...,0.00351,0.00273,0.00195,0.00312,0.00273,0.00156,0.00195,0.00156,0.00117,0.00234
std,0.245428,0.330119,0.148721,0.261989,0.161855,0.218631,0.125463,0.198217,0.184072,0.237649,...,0.059154,0.052189,0.044125,0.055782,0.052189,0.039475,0.044125,0.039475,0.034193,0.048327
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
final_train_df.dtypes

Prognosis                object
pain chest                int64
shortness of breath       int64
dizziness                 int64
asthenia                  int64
                          ...  
air fluid level           int64
catching breath           int64
large-for-dates fetus     int64
immobile                  int64
homicidal thoughts        int64
Length: 401, dtype: object

In [54]:
# Example generator function handling all three datasets separately
def my_generator():
    for i in range(max(len(final_train_df), len(prognosis_df), len(disease_df_encoded))):
        yield (
            np.array(final_train_df.iloc[i % len(final_train_df)], dtype=np.float32),  # Features from final_train_df
            np.array(prognosis_df.iloc[i % len(prognosis_df)], dtype=np.int32),  # Labels from prognosis_df
            np.array(disease_df_encoded.iloc[i % len(disease_df_encoded)], dtype=np.float32)  # Encoded disease data
        )

# Define output_signature for all three datasets
output_signature = (
    tf.TensorSpec(shape=(final_train_df.shape[1],), dtype=tf.float32),  # Features vector
    tf.TensorSpec(shape=(), dtype=tf.int32),  # Prognosis label (assuming a single value)
    tf.TensorSpec(shape=(disease_df_encoded.shape[1],), dtype=tf.float32)  # Encoded disease vector
)

# Create a TensorFlow dataset
dataset = tf.data.Dataset.from_generator(my_generator, output_signature=output_signature)

# Iterate and print elements to verify
for element in dataset.take(5):  # Print first 5 elements for verification
    print(element)


2025-02-28 14:44:08.189718: W tensorflow/core/framework/op_kernel.cc:1829] INVALID_ARGUMENT: ValueError: could not convert string to float: "Alzheimer's disease"
Traceback (most recent call last):

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/var/folders/5h/clwzvcpx0w33zvszv119nd7m0000gn/T/ipykernel_9938/2826991477.py", line 5, in my_generator
    np.array(final_train_df.iloc[i % len(final_train_df)], dtype=np.float32),  # Features from 

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} ValueError: could not convert string to float: "Alzheimer's disease"
Traceback (most recent call last):

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/var/folders/5h/clwzvcpx0w33zvszv119nd7m0000gn/T/ipykernel_9938/2826991477.py", line 5, in my_generator
    np.array(final_train_df.iloc[i % len(final_train_df)], dtype=np.float32),  # Features from final_train_df
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/series.py", line 1031, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

ValueError: could not convert string to float: "Alzheimer's disease"


	 [[{{node PyFunc}}]] [Op:IteratorGetNext] name: 

In [43]:
# Separate features and target labels
y = prognosis_df["Prognosis"].to_numpy()

X1 = prognosis_df.drop(columns=["Prognosis"]).to_numpy()
X2 = disease_df_encoded.drop(columns=["Prognosis"]).to_numpy()
X3 = final_train_df.drop(columns=["Prognosis"]).to_numpy()

In [44]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [45]:
# Train-test split for each input
X1_train, X1_test, y_train, y_test = train_test_split(X1, y_encoded, test_size=0.2, random_state=42)
X2_train, X2_test = train_test_split(X2, test_size=0.2, random_state=42)
X3_train, X3_test = train_test_split(X3, test_size=0.2, random_state=42)

In [46]:
# Define input layers
input1 = Input(shape=(X1_train.shape[1],), name="prognosis_input")
input2 = Input(shape=(X2_train.shape[1],), name="disease_input")
input3 = Input(shape=(X3_train.shape[1],), name="final_train_input")

In [47]:
# Define separate processing layers
x1 = Dense(64, activation="relu")(input1)
x2 = Dense(64, activation="relu")(input2)
x3 = Dense(64, activation="relu")(input3)

In [48]:
# Concatenate processed inputs
merged = Concatenate()([x1, x2, x3])

In [49]:
# Fully connected layers after merging
x = Dense(128, activation="relu")(merged)
x = Dense(64, activation="relu")(x)
output = Dense(len(np.unique(y)), activation="softmax")(x)  # Multi-class classification

In [50]:
# Create model
model = tf.keras.Model(inputs=[input1, input2, input3], outputs=output)

In [51]:
# Compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [55]:
# # Define a data generator for different batch sizes
def data_generator(X1, X2, X3, y, batch_size):
    while True:
        for i in range(0, min(len(X1), len(X2), len(X3)), batch_size):
            # Ensure batches are correctly sized and handled
            X1_batch = np.array(X1[i:i+batch_size], dtype=np.float32)
            X2_batch = np.array(X2[i:i+batch_size], dtype=np.float32) if i < len(X2) else np.zeros((batch_size, X2.shape[1]))
            X3_batch = np.array(X3[i:i+batch_size], dtype=np.float32) if i < len(X3) else np.zeros((batch_size, X3.shape[1]))
            y_batch = np.array(y[i:i+batch_size], dtype=np.int32)  # Ensure `y` is an integer array

            # Yield as tuple of tensors
            yield (X1_batch, X2_batch, X3_batch), y_batch

In [56]:
# Set batch size
batch_size = 32

# Initialize generator
train_generator = data_generator(X1_train, X2_train, X3_train, y_train, batch_size)

# Convert to a TensorFlow dataset (optional)
train_dataset = tf.data.Dataset.from_generator(
    lambda: train_generator,
    output_signature=(
        (
            tf.TensorSpec(shape=(None, X1_train.shape[1]), dtype=tf.float32),
            tf.TensorSpec(shape=(None, X2_train.shape[1]), dtype=tf.float32),
            tf.TensorSpec(shape=(None, X3_train.shape[1]), dtype=tf.float32),
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
    )
)

In [57]:
# Train model using dataset
model.fit(train_dataset, epochs=20, steps_per_epoch=len(y_train) // batch_size)

Epoch 1/20
[1m  1/124[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:33[0m 2s/step - accuracy: 0.0000e+00 - loss: 3.7478

2025-02-28 14:52:22.625336: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: All dimensions except 1 must match. Input 1 has shape [24 64] and doesn't match input 0 with shape [32 64].
	 [[{{function_node __inference_one_step_on_data_2140}}{{node gradient_tape/functional_1/concatenate_1/ConcatOffset}}]]


InvalidArgumentError: Graph execution error:

Detected at node gradient_tape/functional_1/concatenate_1/ConcatOffset defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/opt/anaconda3/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/opt/anaconda3/lib/python3.12/asyncio/base_events.py", line 641, in run_forever

  File "/opt/anaconda3/lib/python3.12/asyncio/base_events.py", line 1987, in _run_once

  File "/opt/anaconda3/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/5h/clwzvcpx0w33zvszv119nd7m0000gn/T/ipykernel_9938/3622209879.py", line 2, in <module>

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data

  File "/opt/anaconda3/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 77, in train_step

All dimensions except 1 must match. Input 1 has shape [24 64] and doesn't match input 0 with shape [32 64].
	 [[{{node gradient_tape/functional_1/concatenate_1/ConcatOffset}}]] [Op:__inference_multi_step_on_iterator_2229]

In [52]:
# # Define a data generator for different batch sizes
# def data_generator(X1, X2, X3, y, batch_size):
#     while True:
#         for i in range(0, min(len(X1), len(X2), len(X3)), batch_size):
#             X1_batch = X1[i:i+batch_size]
#             X2_batch = X2[i:i+batch_size] if i < len(X2) else np.zeros((batch_size, X2.shape[1]))
#             X3_batch = X3[i:i+batch_size] if i < len(X3) else np.zeros((batch_size, X3.shape[1]))
#             y_batch = y[i:i+batch_size]
#             yield [X1_batch, X2_batch, X3_batch], y_batch

# batch_size = 32
# train_generator = data_generator(X1_train, X2_train, X3_train, y_train, batch_size)

In [53]:
# Train model using generator
# model.fit(train_generator, steps_per_epoch=len(y_train) // batch_size, epochs=20)

TypeError: `output_signature` must contain objects that are subclass of `tf.TypeSpec` but found <class 'list'> which is not.