# Data preprocessing for Fusion and Model

Importing Libraries

In [19]:
# Essentials
import pandas as pd
import numpy as np

# Processing
from sklearn.preprocessing import StandardScaler

# Model

# Visualization

# Warnings
import warnings
warnings.filterwarnings('ignore')

Loading Datasets

In [20]:
# Clinical Datasets
African = pd.read_csv(r"C:\Users\dahab\OneDrive\Desktop\T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making\processed_datasets\clinical\African_pro.csv")
Bangladesh = pd.read_csv(r"C:\Users\dahab\OneDrive\Desktop\T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making\processed_datasets\clinical\Bangladesh_pro.csv")
Iraq = pd.read_csv(r"C:\Users\dahab\OneDrive\Desktop\T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making\processed_datasets\clinical\Iraq_pro.csv")

# Genetic Datasets
inter_genetic = pd.read_csv(r"C:\Users\dahab\OneDrive\Desktop\T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making\processed_datasets\genetic\inter_genetic_dataset.csv")
normal_genetic = pd.read_csv(r"C:\Users\dahab\OneDrive\Desktop\T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making\processed_datasets\genetic\normal_genetic_dataset.csv")

Checking Target Columns

In [21]:
print(African.columns, "\n")
print(Bangladesh.columns, "\n")
print(Iraq.columns, "\n")
print(inter_genetic.columns, "\n")
print(normal_genetic.columns)

Index(['Patient number', 'Cholesterol', 'Glucose', 'HDL Chol',
       'Chol/HDL ratio', 'Age', 'Gender', 'Height', 'Weight', 'BMI',
       'Systolic BP', 'Diastolic BP', 'waist', 'hip', 'Waist/hip ratio',
       'Diabetes', 'BMI Category'],
      dtype='object') 

Index(['age', 'pulse_rate', 'systolic_bp', 'diastolic_bp', 'glucose', 'height',
       'weight', 'bmi', 'family_diabetes', 'hypertensive',
       'family_hypertension', 'cardiovascular_disease', 'stroke',
       'gender_Encoded', 'diabetic_Encoded'],
      dtype='object') 

Index(['Age', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI',
       'Gender_Encoded', 'Class_Encoded'],
      dtype='object') 

Index(['STUDY', 'DISEASE_DESCRIPTION', 'REGION', 'CHR_ID', 'CHR_POS',
       'MAPPED_GENE', 'UPSTREAM_GENE_ID', 'DOWNSTREAM_GENE_ID',
       'UPSTREAM_GENE_DISTANCE', 'DOWNSTREAM_GENE_DISTANCE', 'SNPS', 'MERGED',
       'GENOMIC_CONTEXT', 'INTERGENIC', 'RISK_ALLELE_FREQUENCY', 'PVALUE',
       'PVALUE_MLOG', 'EF

Uniting Target Columns' name

In [22]:
African.rename(columns={
    'Diabetes': 'T2D',
}, inplace=True)
 
Bangladesh.rename(columns={
    'diabetic_Encoded': 'T2D',
}, inplace=True)
 
Iraq.rename(columns={
    'Class_Encoded': 'T2D',
}, inplace=True)
 
# Add a new column 'T2D' with all values set to 1
#inter_genetic['T2D'] = 1
 
# Add a new column 'T2D' with all values set to 1
#normal_genetic['T2D'] = 1

Checking Data Types 

In [23]:
print(African.dtypes, '\n')
print(Bangladesh.dtypes, '\n')
print(Iraq.dtypes, '\n')
print(inter_genetic.dtypes, '\n')
print(normal_genetic.dtypes, '\n')

Patient number       int64
Cholesterol          int64
Glucose              int64
HDL Chol             int64
Chol/HDL ratio     float64
Age                  int64
Gender               int64
Height               int64
Weight               int64
BMI                float64
Systolic BP          int64
Diastolic BP         int64
waist                int64
hip                  int64
Waist/hip ratio    float64
T2D                  int64
BMI Category         int64
dtype: object 

age                         int64
pulse_rate                  int64
systolic_bp                 int64
diastolic_bp                int64
glucose                   float64
height                    float64
weight                    float64
bmi                       float64
family_diabetes             int64
hypertensive                int64
family_hypertension         int64
cardiovascular_disease      int64
stroke                      int64
gender_Encoded              int64
T2D                         int64
dtype: object 


Define GAN Architecture

In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [28]:
# Define Generator model
def build_generator(input_dim, output_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(256),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(output_dim, activation='tanh')
    ])
    return model

# Define Discriminator model
def build_discriminator(input_dim):
    model = Sequential([
        Dense(256, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid')  # Output probability (Real vs Fake)
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Build and Compile the GAN

In [29]:
input_dim = X_train_inter_genetic_scaled.shape[1]  # Number of features
generator = build_generator(100, input_dim)  # Generator takes 100 noise dimensions
discriminator = build_discriminator(input_dim)

# Create GAN model (combining Generator & Discriminator)
discriminator.trainable = False  # Freeze discriminator during GAN training

gan = Sequential([generator, discriminator])
gan.compile(loss='binary_crossentropy', optimizer='adam')

Train the GAN

In [34]:
def train_gan(epochs, batch_size):
    real_labels = np.ones((batch_size, 1))  # Label 1 for real data (Yes)
    fake_labels = np.zeros((batch_size, 1))  # Label 0 for fake data (No)

    for epoch in range(epochs):
        # Select a random batch of real samples
        idx = np.random.randint(0, X_train_inter_genetic_scaled.shape[0], batch_size)
        real_samples = X_train_inter_genetic_scaled[idx]

        # Generate fake samples
        noise = np.random.normal(0, 1, (batch_size, 100))  # Noise input
        fake_samples = generator.predict(noise)

        # Train the Discriminator
        d_loss_real = discriminator.train_on_batch(real_samples, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_samples, fake_labels)

        # Ensure we extract the actual loss value (first element of returned list/tuple)
        d_loss_real_value = d_loss_real[0] if isinstance(d_loss_real, (list, tuple, np.ndarray)) else d_loss_real
        d_loss_fake_value = d_loss_fake[0] if isinstance(d_loss_fake, (list, tuple, np.ndarray)) else d_loss_fake
        d_loss = 0.5 * (d_loss_real_value + d_loss_fake_value)  # Compute average loss

        # Train the Generator
        noise = np.random.normal(0, 1, (batch_size, 100))
        g_loss_data = gan.train_on_batch(noise, real_labels)

        # Extract loss value from g_loss_data if it's a list/tuple
        g_loss = g_loss_data[0] if isinstance(g_loss_data, (list, tuple, np.ndarray)) else g_loss_data

        # Print loss every 500 epochs  
        print(f"Epoch {epoch}: D Loss: {d_loss:.4f}, G Loss: {g_loss:.4f}")

# Train the GAN
train_gan(epochs=25, batch_size=16)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Epoch 0: D Loss: 0.9932, G Loss: 0.9932
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Epoch 1: D Loss: 0.9932, G Loss: 0.9932
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Epoch 2: D Loss: 0.9932, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Epoch 3: D Loss: 0.9932, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Epoch 4: D Loss: 0.9933, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Epoch 5: D Loss: 0.9933, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Epoch 6: D Loss: 0.9933, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Epoch 7: D Loss: 0.9933, G Loss: 0.9933
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Epoch 8: D Loss: 0.9933, G Loss:

Generate Synthetic "No" Samples

In [None]:
# Step 1: Ensure Generator Inputs Match Expected Noise Dim
num_samples = 1110
noise_dim = generator.input_shape[1]  # Get correct input size
noise = np.random.normal(0, 1, (num_samples, noise_dim))  # Generate noise

# Step 2: Generate Fake "No" Samples
synthetic_no_samples = generator.predict(noise)

# Step 3: Apply inverse_transform only on numerical columns
synthetic_no_samples_scaled = scaler.inverse_transform(synthetic_no_samples[:, :scaler.scale_.shape[0]])

# Step 4: Convert to DataFrame with Correct Column Names
synthetic_df = pd.DataFrame(synthetic_no_samples_scaled, columns=inter_genetic.columns[:scaler.scale_.shape[0]])

# Step 5: Add Missing Categorical Columns Randomly
for col in inter_genetic.columns[scaler.scale_.shape[0]:-1]:  # Exclude target column
    synthetic_df[col] = np.random.choice(inter_genetic[col].values, size=len(synthetic_df))

# Step 6: Label Synthetic Samples as "No" (0)
synthetic_df['Diabetes'] = 0

# Step 7: Label Original Samples as "Yes" (1)
inter_genetic['Diabetes'] = 1

# Step 8: Combine Real and Synthetic Data
balanced_df = pd.concat([inter_genetic, synthetic_df], ignore_index=True)

# Step 9: Save to CSV
balanced_df.to_csv("diabetes_balanced_data_gan.csv", index=False)
print("Generated dataset saved successfully!")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Generated dataset saved successfully!


In [43]:
balanced_df['Diabetes'].value_counts()

Diabetes
1    1110
0    1000
Name: count, dtype: int64

Splitting Data

In [None]:
# Splitting datasets into training and testing sets
from sklearn.model_selection import train_test_split

# Splitting African dataset
X_train_african, X_test_african, y_train_african, y_test_african = train_test_split(
    African.drop('T2D', axis=1), African['T2D'], test_size=0.2, random_state=42
)

# Splitting Bangladesh dataset
X_train_bangladesh, X_test_bangladesh, y_train_bangladesh, y_test_bangladesh = train_test_split(
    Bangladesh.drop('T2D', axis=1), Bangladesh['T2D'], test_size=0.2, random_state=42
)

# Splitting Iraq dataset
X_train_iraq, X_test_iraq, y_train_iraq, y_test_iraq = train_test_split(
    Iraq.drop('T2D', axis=1), Iraq['T2D'], test_size=0.2, random_state=42
)

# Splitting Inter Genetic dataset
X_train_inter_genetic, X_test_inter_genetic, y_train_inter_genetic, y_test_inter_genetic = train_test_split(
    inter_genetic.drop('T2D', axis=1), inter_genetic['T2D'], test_size=0.2, random_state=42
)

# Splitting Normal Genetic dataset
X_train_normal_genetic, X_test_normal_genetic, y_train_normal_genetic, y_test_normal_genetic = train_test_split(
    normal_genetic.drop('T2D', axis=1), normal_genetic['T2D'], test_size=0.2, random_state=42
)

Feature Scalling

In [27]:
# Importing necessary libraries
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Initializing the scaler
scaler = StandardScaler()

# Handling non-numeric data by selecting numeric columns only
def preprocess_and_scale(X_train, X_test):
    # Selecting only numeric columns
    X_train_numeric = X_train.select_dtypes(include=[float, int])
    X_test_numeric = X_test.select_dtypes(include=[float, int])
    
    # Scaling numeric features
    X_train_scaled = scaler.fit_transform(X_train_numeric)
    X_test_scaled = scaler.transform(X_test_numeric)
    
    return X_train_scaled, X_test_scaled

# Preprocessing and scaling for African dataset
X_train_african_scaled, X_test_african_scaled = preprocess_and_scale(
    X_train_african, X_test_african
)

# Preprocessing and scaling for Bangladesh dataset
X_train_bangladesh_scaled, X_test_bangladesh_scaled = preprocess_and_scale(
    X_train_bangladesh, X_test_bangladesh
)

# Preprocessing and scaling for Iraq dataset
X_train_iraq_scaled, X_test_iraq_scaled = preprocess_and_scale(
    X_train_iraq, X_test_iraq
)

# Preprocessing and scaling for Inter Genetic dataset
X_train_inter_genetic_scaled, X_test_inter_genetic_scaled = preprocess_and_scale(
    X_train_inter_genetic, X_test_inter_genetic
)

# Preprocessing and scaling for Normal Genetic dataset
X_train_normal_genetic_scaled, X_test_normal_genetic_scaled = preprocess_and_scale(
    X_train_normal_genetic, X_test_normal_genetic
)

# Model Selection 

In [None]:
# Importing necessary libraries for model selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Defining datasets and corresponding scaled data
datasets = {
    "African": (X_train_african_scaled, X_test_african_scaled, y_train_african, y_test_african),
    "Bangladesh": (X_train_bangladesh_scaled, X_test_bangladesh_scaled, y_train_bangladesh, y_test_bangladesh),
    "Iraq": (X_train_iraq_scaled, X_test_iraq_scaled, y_train_iraq, y_test_iraq),
    "Inter Genetic": (X_train_inter_genetic_scaled, X_test_inter_genetic_scaled, y_train_inter_genetic, y_test_inter_genetic),
    "Normal Genetic": (X_train_normal_genetic_scaled, X_test_normal_genetic_scaled, y_train_normal_genetic, y_test_normal_genetic)
}

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
}

# Looping through each dataset and evaluating models
for dataset_name, (X_train, X_test, y_train, y_test) in datasets.items():
    print(f"\n{dataset_name} Dataset Model Selection Results:")
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)  # Training the model
        y_pred = model.predict(X_test)  # Predicting on the test set
        accuracy = accuracy_score(y_test, y_pred)  # Calculating accuracy
        results[model_name] = accuracy

    # Displaying results for the current dataset
    for model_name, accuracy in results.items():
        print(f"{model_name}: Accuracy = {accuracy:.4f}")

ModuleNotFoundError: No module named 'xgboost'

Training Model

Fusion

Model Evaluation