<a href="https://colab.research.google.com/github/Bilal-Hijazi/Colab-Code/blob/main/Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Experiment: how do precision / recall / F1 of a Random Forest on the untouched
# test split change when only a fraction r of the positive training rows is kept?

# One seed drives the train/test split, the positive-class subsampling and the
# classifier, so a fresh "Restart & Run All" reproduces the same table.
SEED = 42

data = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Per-feature cut-off: values strictly greater than the threshold become 1, the rest 0.
thresholds = {
    'BMI': 25,
    'GenHlth': 3,
    'MentHlth': 3,
    'PhysHlth': 5,
    'Age': 7
}

# Create binary columns based on thresholds
for column, threshold in thresholds.items():
    data[column + '_binary'] = (data[column] > threshold).astype(int)

# Remove the raw versions of the binarised features; Income and Education are
# dropped as well (they get no binary replacement).
drop_cols = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Income', 'Education', 'BMI']

data = data.drop(drop_cols, axis=1)

X = data.drop("Diabetes_binary", axis=1)
y = data["Diabetes_binary"]
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)
# Results dictionary: {r: {'Precision': ..., 'Recall': ..., 'F1 Score': ...}}
results = {}

# Positional (iloc) indices of each class inside the training split. They do not
# depend on r, so they are computed once instead of on every iteration.
positive_indices = np.flatnonzero((y_train == 1).to_numpy())
negative_indices = np.flatnonzero((y_train == 0).to_numpy())

# Dedicated, seeded generator: the original unseeded np.random.shuffle made the
# selected positives (and therefore every reported metric) differ between runs.
rng = np.random.default_rng(SEED)

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: random forest
# r runs 0.9, 0.8, ..., 0.1. Iterating over integer tenths avoids the float-step
# pitfalls of np.arange and keeps the dictionary keys clean.
for tenths in range(9, 0, -1):
    r = tenths / 10

    # Selecting fraction r of positive class instances (floor, in exact integer arithmetic)
    n_selected = (tenths * len(positive_indices)) // 10
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Creating imbalanced training dataset: the sampled positives plus every negative
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Training the Random Forest Classifier
    clf = RandomForestClassifier(random_state=SEED)
    clf.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score on the unchanged test split
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[r] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Show the results (one column per r, one row per metric)
pd.DataFrame(results)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Experiment: how do precision / recall / F1 of XGBoost on the untouched test
# split change when only a fraction r of the positive training rows is kept?

# One seed drives the train/test split, the positive-class subsampling and the
# classifier, so a fresh "Restart & Run All" reproduces the same table.
SEED = 42

data = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Per-feature cut-off: values strictly greater than the threshold become 1, the rest 0.
thresholds = {
    'BMI': 25,
    'GenHlth': 3,
    'MentHlth': 3,
    'PhysHlth': 5,
    'Age': 7
}

# Create binary columns based on thresholds
for column, threshold in thresholds.items():
    data[column + '_binary'] = (data[column] > threshold).astype(int)

# Remove the raw versions of the binarised features; Income and Education are
# dropped as well (they get no binary replacement).
drop_cols = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Income', 'Education', 'BMI']

data = data.drop(drop_cols, axis=1)

X = data.drop("Diabetes_binary", axis=1)
y = data["Diabetes_binary"]
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Results dictionary: {r: {'Precision': ..., 'Recall': ..., 'F1 Score': ...}}
results = {}

# Positional (iloc) indices of each class inside the training split. They do not
# depend on r, so they are computed once instead of on every iteration.
positive_indices = np.flatnonzero((y_train == 1).to_numpy())
negative_indices = np.flatnonzero((y_train == 0).to_numpy())

# Dedicated, seeded generator: the original unseeded np.random.shuffle made the
# selected positives (and therefore every reported metric) differ between runs.
rng = np.random.default_rng(SEED)

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: XGBoost
# r runs 0.9, 0.8, ..., 0.1. Iterating over integer tenths avoids the float-step
# pitfalls of np.arange and keeps the dictionary keys clean.
for tenths in range(9, 0, -1):
    r = tenths / 10

    # Selecting fraction r of positive class instances (floor, in exact integer arithmetic)
    n_selected = (tenths * len(positive_indices)) // 10
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Creating imbalanced training dataset: the sampled positives plus every negative
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Training The XGB Classifier (seed pinned explicitly, like the other models in this notebook)
    xgb = XGBClassifier(random_state=SEED)
    xgb.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score on the unchanged test split
    y_pred = xgb.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[r] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Show the results (one column per r, one row per metric)
pd.DataFrame(results)


In [None]:
from sklearn import svm
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Experiment: how do precision / recall / F1 of a linear SVM on the untouched
# test split change when only a fraction r of the positive training rows is kept?

# One seed drives the train/test split, the positive-class subsampling and the
# classifier, so a fresh "Restart & Run All" reproduces the same table.
SEED = 42

data = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Per-feature cut-off: values strictly greater than the threshold become 1, the rest 0.
thresholds = {
    'BMI': 25,
    'GenHlth': 3,
    'MentHlth': 3,
    'PhysHlth': 5,
    'Age': 7
}

# Create binary columns based on thresholds
for column, threshold in thresholds.items():
    data[column + '_binary'] = (data[column] > threshold).astype(int)

# Remove the raw versions of the binarised features; Income and Education are
# dropped as well (they get no binary replacement).
drop_cols = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Income', 'Education', 'BMI']

data = data.drop(drop_cols, axis=1)

X = data.drop("Diabetes_binary", axis=1)
y = data["Diabetes_binary"]
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Results dictionary: {r: {'Precision': ..., 'Recall': ..., 'F1 Score': ...}}
results = {}

# Positional (iloc) indices of each class inside the training split. They do not
# depend on r, so they are computed once instead of on every iteration.
positive_indices = np.flatnonzero((y_train == 1).to_numpy())
negative_indices = np.flatnonzero((y_train == 0).to_numpy())

# Dedicated, seeded generator: the original unseeded np.random.shuffle made the
# selected positives (and therefore every reported metric) differ between runs.
rng = np.random.default_rng(SEED)

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: Support Vector Machine (SVM)
# r runs 0.9, 0.8, ..., 0.1. Iterating over integer tenths avoids the float-step
# pitfalls of np.arange and keeps the dictionary keys clean.
for tenths in range(9, 0, -1):
    r = tenths / 10

    # Selecting fraction r of positive class instances (floor, in exact integer arithmetic)
    n_selected = (tenths * len(positive_indices)) // 10
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Creating imbalanced training dataset: the sampled positives plus every negative
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Training The SVM Classifier
    svmClassifier = svm.LinearSVC(random_state=SEED)  # Instantiate SVM classifier
    svmClassifier.fit(X_train_imbalanced, y_train_imbalanced)  # Train the SVM classifier

    # Predictions and recording precision, recall, and F1 score on the unchanged test split
    y_pred = svmClassifier.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[r] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Show the results (one column per r, one row per metric). A bare last expression
# gives the notebook's rich table rendering, matching the Random Forest and XGBoost cells.
pd.DataFrame(results)


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Experiment: how do precision / recall / F1 of an MLP on the untouched test
# split change when only a fraction r of the positive training rows is kept?

# One seed drives the train/test split, the positive-class subsampling and the
# classifier, so a fresh "Restart & Run All" reproduces the same table.
SEED = 42

# Load the dataset
data = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Define thresholds for creating binary columns:
# values strictly greater than the threshold become 1, the rest 0.
thresholds = {
    'BMI': 25,
    'GenHlth': 3,
    'MentHlth': 3,
    'PhysHlth': 5,
    'Age': 7
}

# Create binary columns based on thresholds
for column, threshold in thresholds.items():
    data[column + '_binary'] = (data[column] > threshold).astype(int)

# Drop the raw versions of the binarised features; Income and Education are
# dropped as well (they get no binary replacement).
drop_cols = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Income', 'Education', 'BMI']
data = data.drop(drop_cols, axis=1)

# Split the dataset into features (X) and target variable (y)
X = data.drop("Diabetes_binary", axis=1)
y = data["Diabetes_binary"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Results dictionary: {r: {'Precision': ..., 'Recall': ..., 'F1 Score': ...}}
results = {}

# Positional (iloc) indices of each class inside the training split. They do not
# depend on r, so they are computed once instead of on every iteration.
positive_indices = np.flatnonzero((y_train == 1).to_numpy())
negative_indices = np.flatnonzero((y_train == 0).to_numpy())

# Dedicated, seeded generator: the original unseeded np.random.shuffle made the
# selected positives (and therefore every reported metric) differ between runs.
rng = np.random.default_rng(SEED)

# For each fraction r of positive class instances
# r runs 0.9, 0.8, ..., 0.1. Iterating over integer tenths avoids the float-step
# pitfalls of np.arange and keeps the dictionary keys clean.
for tenths in range(9, 0, -1):
    r = tenths / 10

    # Select fraction r of positive class instances (floor, in exact integer arithmetic)
    n_selected = (tenths * len(positive_indices)) // 10
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Create imbalanced training dataset: the sampled positives plus every negative
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Initialize and train the MLPClassifier
    mlp = MLPClassifier(random_state=SEED, max_iter=500)
    mlp.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score on the unchanged test split
    y_pred = mlp.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[r] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Show the results (one column per r, one row per metric). A bare last expression
# gives the notebook's rich table rendering, matching the Random Forest and XGBoost cells.
pd.DataFrame(results)


In [None]:
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Experiment: how do precision / recall / F1 of a small dense Keras network on the
# untouched test split change when only a fraction r of the positive training rows is kept?

# One seed drives the train/test split, the positive-class subsampling and the
# network, so a fresh "Restart & Run All" is as repeatable as the backend allows.
SEED = 42

# Load the dataset
data = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Define thresholds for creating binary columns:
# values strictly greater than the threshold become 1, the rest 0.
thresholds = {
    'BMI': 25,
    'GenHlth': 3,
    'MentHlth': 3,
    'PhysHlth': 5,
    'Age': 7
}

# Create binary columns based on thresholds
for column, threshold in thresholds.items():
    data[column + '_binary'] = (data[column] > threshold).astype(int)

# Drop the raw versions of the binarised features; Income and Education are
# dropped as well (they get no binary replacement).
drop_cols = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Income', 'Education', 'BMI']
data = data.drop(drop_cols, axis=1)

# Split the dataset into features (X) and target variable (y)
X = data.drop("Diabetes_binary", axis=1)
y = data["Diabetes_binary"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Results dictionary: {r: {'Precision': ..., 'Recall': ..., 'F1 Score': ...}}
results = {}

# Positional (iloc) indices of each class inside the training split. They do not
# depend on r, so they are computed once instead of on every iteration.
positive_indices = np.flatnonzero((y_train == 1).to_numpy())
negative_indices = np.flatnonzero((y_train == 0).to_numpy())

# Dedicated, seeded generator: the original unseeded np.random.shuffle made the
# selected positives (and therefore every reported metric) differ between runs.
rng = np.random.default_rng(SEED)

# r runs 0.9, 0.8, ..., 0.1. Iterating over integer tenths avoids the float-step
# pitfalls of np.arange and keeps the dictionary keys clean.
for tenths in range(9, 0, -1):
    r = tenths / 10

    # Select fraction r of positive class instances (floor, in exact integer arithmetic)
    n_selected = (tenths * len(positive_indices)) // 10
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Create imbalanced training dataset: the sampled positives plus every negative
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # A fresh model is built per r: release the previous iteration's Keras state so
    # it does not accumulate, then re-seed so every model starts from the same seed.
    # NOTE(review): bit-for-bit determinism may still depend on backend/hardware.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    # Two hidden ReLU layers of 64 units and a sigmoid output for the binary target.
    # An explicit Input layer replaces the `input_shape=` argument on the first Dense.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(X_train_imbalanced.shape[1],)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    model.fit(X_train_imbalanced, y_train_imbalanced, epochs=10, batch_size=32, verbose=0)

    # Predictions and recording precision, recall, and F1 score on the unchanged test split.
    # The sigmoid output has shape (n, 1); threshold at 0.5 and flatten to 1-D labels.
    # verbose=0 keeps predict from printing a progress bar on every iteration.
    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype("int32").ravel()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[r] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Show the results (one column per r, one row per metric). A bare last expression
# gives the notebook's rich table rendering, matching the Random Forest and XGBoost cells.
pd.DataFrame(results)
