# Data preprocessing for Fusion and Model

Importing Libraries

In [121]:
# Essentials
import pandas as pd
import numpy as np

# Processing
from sklearn.preprocessing import StandardScaler

# Model

# Visualization

# Warnings
import warnings
warnings.filterwarnings('ignore')

Loading Datasets

In [122]:
# Clinical Datasets
African = pd.read_csv(r"/Users/mohammedbasem/Desktop/T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making/processed_datasets/clinical/African_pro.csv")
Bangladesh = pd.read_csv(r"/Users/mohammedbasem/Desktop/T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making/processed_datasets/clinical/Bangladesh_pro.csv")
Iraq = pd.read_csv(r"/Users/mohammedbasem/Desktop/T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making/processed_datasets/clinical/Iraq_pro.csv")

# Genetic Datasets
inter_genetic = pd.read_pickle(r"/Users/mohammedbasem/Desktop/T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making/processed_datasets/genetic/inter_genetic_dataset.csv")
normal_genetic = pd.read_pickle(r"/Users/mohammedbasem/Desktop/T2D-Prediction-System--Data-Fusion-for-Enhanced-Decision-Making/processed_datasets/genetic/normal_genetic_dataset.csv")


Checking Target Columns

In [123]:
print(African.columns, "\n")
print(Bangladesh.columns, "\n")
print(Iraq.columns, "\n")
print(inter_genetic.columns, "\n")
print(normal_genetic.columns)

Index(['Cholesterol', 'Glucose', 'HDL Chol', 'Chol/HDL ratio', 'Age', 'Gender',
       'Height', 'Weight', 'BMI', 'Systolic BP', 'Diastolic BP', 'waist',
       'hip', 'Waist/hip ratio', 'Diabetes', 'BMI Category'],
      dtype='object') 

Index(['age', 'pulse_rate', 'systolic_bp', 'diastolic_bp', 'glucose', 'height',
       'weight', 'bmi', 'family_diabetes', 'hypertensive',
       'family_hypertension', 'cardiovascular_disease', 'stroke',
       'gender_Encoded', 'diabetic_Encoded'],
      dtype='object') 

Index(['Age', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI',
       'Gender_Encoded', 'Class_Encoded'],
      dtype='object') 

Index(['STUDY', 'DISEASE_DESCRIPTION', 'REGION', 'CHR_ID', 'CHR_POS',
       'MAPPED_GENE', 'UPSTREAM_GENE_ID', 'DOWNSTREAM_GENE_ID',
       'UPSTREAM_GENE_DISTANCE', 'DOWNSTREAM_GENE_DISTANCE', 'SNPS', 'MERGED',
       'GENOMIC_CONTEXT', 'INTERGENIC', 'RISK_ALLELE_FREQUENCY', 'PVALUE',
       'PVALUE_MLOG', 'EFFECT_SIZE', 'CASE_PERCENT

Uniting Target Columns' name

In [124]:
African.rename(columns={
    'Diabetes': 'T2D',
}, inplace=True)
 
Bangladesh.rename(columns={
    'diabetic_Encoded': 'T2D',
}, inplace=True)
 
Iraq.rename(columns={
    'Class_Encoded': 'T2D',
}, inplace=True)
 
# Add a new column 'T2D' with all values set to 1
inter_genetic['T2D'] = 1
 
# Add a new column 'T2D' with all values set to 1
normal_genetic['T2D'] = 1


Checking Data Types 

In [125]:
print(African.dtypes, '\n')
print(Bangladesh.dtypes, '\n')
print(Iraq.dtypes, '\n')
print(inter_genetic.dtypes, '\n')
print(normal_genetic.dtypes, '\n')


Cholesterol          int64
Glucose              int64
HDL Chol             int64
Chol/HDL ratio     float64
Age                  int64
Gender               int64
Height               int64
Weight               int64
BMI                float64
Systolic BP          int64
Diastolic BP         int64
waist                int64
hip                  int64
Waist/hip ratio    float64
T2D                  int64
BMI Category         int64
dtype: object 

age                         int64
pulse_rate                  int64
systolic_bp                 int64
diastolic_bp                int64
glucose                   float64
height                    float64
weight                    float64
bmi                       float64
family_diabetes             int64
hypertensive                int64
family_hypertension         int64
cardiovascular_disease      int64
stroke                      int64
gender_Encoded              int64
T2D                         int64
dtype: object 

Age                 int64


Splitting Data

In [126]:
# Splitting datasets into training and testing sets
from sklearn.model_selection import train_test_split

# Splitting African dataset
X_train_african, X_test_african, y_train_african, y_test_african = train_test_split(
    African.drop('T2D', axis=1), African['T2D'], test_size=0.2, random_state=42
)

# Splitting Bangladesh dataset
X_train_bangladesh, X_test_bangladesh, y_train_bangladesh, y_test_bangladesh = train_test_split(
    Bangladesh.drop('T2D', axis=1), Bangladesh['T2D'], test_size=0.2, random_state=42
)

# Splitting Iraq dataset
X_train_iraq, X_test_iraq, y_train_iraq, y_test_iraq = train_test_split(
    Iraq.drop('T2D', axis=1), Iraq['T2D'], test_size=0.2, random_state=42
)

# Splitting Inter Genetic dataset
X_train_inter_genetic, X_test_inter_genetic, y_train_inter_genetic, y_test_inter_genetic = train_test_split(
    inter_genetic.drop('T2D', axis=1), inter_genetic['T2D'], test_size=0.2, random_state=42
)

# Splitting Normal Genetic dataset
X_train_normal_genetic, X_test_normal_genetic, y_train_normal_genetic, y_test_normal_genetic = train_test_split(
    normal_genetic.drop('T2D', axis=1), normal_genetic['T2D'], test_size=0.2, random_state=42
)


Feature Scalling

In [127]:
# Importing necessary libraries
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Initializing the scaler
scaler = StandardScaler()

# Handling non-numeric data by selecting numeric columns only
def preprocess_and_scale(X_train, X_test):
    # Selecting only numeric columns
    X_train_numeric = X_train.select_dtypes(include=[float, int])
    X_test_numeric = X_test.select_dtypes(include=[float, int])
    
    # Scaling numeric features
    X_train_scaled = scaler.fit_transform(X_train_numeric)
    X_test_scaled = scaler.transform(X_test_numeric)
    
    return X_train_scaled, X_test_scaled

# Preprocessing and scaling for African dataset
X_train_african_scaled, X_test_african_scaled = preprocess_and_scale(
    X_train_african, X_test_african
)

# Preprocessing and scaling for Bangladesh dataset
X_train_bangladesh_scaled, X_test_bangladesh_scaled = preprocess_and_scale(
    X_train_bangladesh, X_test_bangladesh
)

# Preprocessing and scaling for Iraq dataset
X_train_iraq_scaled, X_test_iraq_scaled = preprocess_and_scale(
    X_train_iraq, X_test_iraq
)

# Preprocessing and scaling for Inter Genetic dataset
X_train_inter_genetic_scaled, X_test_inter_genetic_scaled = preprocess_and_scale(
    X_train_inter_genetic, X_test_inter_genetic
)

# Preprocessing and scaling for Normal Genetic dataset
X_train_normal_genetic_scaled, X_test_normal_genetic_scaled = preprocess_and_scale(
    X_train_normal_genetic, X_test_normal_genetic
)



# Model Selection 

In [128]:
# Importing necessary libraries for model selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Defining datasets and corresponding scaled data
datasets = {
    "African": (X_train_african_scaled, X_test_african_scaled, y_train_african, y_test_african),
    "Bangladesh": (X_train_bangladesh_scaled, X_test_bangladesh_scaled, y_train_bangladesh, y_test_bangladesh),
    "Iraq": (X_train_iraq_scaled, X_test_iraq_scaled, y_train_iraq, y_test_iraq),
    "Inter Genetic": (X_train_inter_genetic_scaled, X_test_inter_genetic_scaled, y_train_inter_genetic, y_test_inter_genetic),
    "Normal Genetic": (X_train_normal_genetic_scaled, X_test_normal_genetic_scaled, y_train_normal_genetic, y_test_normal_genetic)
}

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
}

# Looping through each dataset and evaluating models
for dataset_name, (X_train, X_test, y_train, y_test) in datasets.items():
    print(f"\n{dataset_name} Dataset Model Selection Results:")
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)  # Training the model
        y_pred = model.predict(X_test)  # Predicting on the test set
        accuracy = accuracy_score(y_test, y_pred)  # Calculating accuracy
        results[model_name] = accuracy

    # Displaying results for the current dataset
    for model_name, accuracy in results.items():
        print(f"{model_name}: Accuracy = {accuracy:.4f}")



African Dataset Model Selection Results:
Logistic Regression: Accuracy = 0.9103
Random Forest: Accuracy = 0.8974
SVM: Accuracy = 0.8974
KNN: Accuracy = 0.8462
Gradient Boosting: Accuracy = 0.8846
Naive Bayes: Accuracy = 0.8974
Decision Tree: Accuracy = 0.8718
XGBoost: Accuracy = 0.9231

Bangladesh Dataset Model Selection Results:
Logistic Regression: Accuracy = 0.9384
Random Forest: Accuracy = 0.9430
SVM: Accuracy = 0.9403
KNN: Accuracy = 0.9347
Gradient Boosting: Accuracy = 0.9412
Naive Bayes: Accuracy = 0.9053
Decision Tree: Accuracy = 0.9127
XGBoost: Accuracy = 0.9393

Iraq Dataset Model Selection Results:
Logistic Regression: Accuracy = 0.9810
Random Forest: Accuracy = 0.9684
SVM: Accuracy = 0.9684
KNN: Accuracy = 0.9494
Gradient Boosting: Accuracy = 0.9747
Naive Bayes: Accuracy = 0.9620
Decision Tree: Accuracy = 0.9747
XGBoost: Accuracy = 0.9747

Inter Genetic Dataset Model Selection Results:


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

Training Model

Fusion

Model Evaluation