> #  Import necessary libraries

In [None]:
import pandas as pd
import numpy as np                 
import matplotlib.pyplot as plt
import seaborn as sns 
import missingno as msno
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

>  # 1. Load Dataset

In [None]:
Data = pd.read_csv("Diabetes prediction.csv")

> # 2.  information about dataset

> ###    2-1 Columns headings

In [None]:
Data.columns

> ### 2-2 Total information about dataset

In [None]:
Data.info()

> ### 2-2-1. Details of dataset (transpose)

In [None]:
Data.describe().T

> ### 2-2-2. Histogram of dataset

In [None]:
Data.hist(bins = 30, figsize = (15, 15), edgecolor = "white")
plt.grid(axis = "x", alpha = 0.8)
plt.suptitle("Histogram of Data", fontsize = 20)
plt.tight_layout()
plt.show()

> ### 2-3. Checking null valuses

In [None]:
Data.isnull()

> ### 2-3-1. Count of nulls in each columns

In [None]:
Data.isnull().sum()

> ### 2-3-2. Chart of nulls in each columns

In [None]:
num_columns = len(Data.columns)
colors = plt.cm.viridis(np.linspace(0, 1, num_columns))  
msno.bar(Data, color = colors)
plt.suptitle("Missing values", fontsize = 30)
plt.tight_layout()
plt.show()

> # 3. Data Cleaning and Preparation

> ### 3-1. Deleting Unnessesary columns

In [None]:
Data.drop(columns = ["id"], inplace = True)       # Unusefull
Data.drop(columns = ["frame"], inplace = True)    # Unusefull because the dataset isn"t so big
Data.drop(columns = ["location"], inplace = True) # Unusefull because the dataset isn"t so big
Data.drop(columns = ["bp.2s"], inplace = True)    # Lots of missing values
Data.drop(columns = ["bp.2d"], inplace = True)    # Lots of missing values

> ### 3-2. Calculate range of ages and place it in a new column

In [None]:
# min_age = Data["age"].min()
# max_age = Data["age"].max()
# print(f"The range of ages is from {min_age} to {max_age}.")
# print ("---------")

B = [0, 25, 60, 100]   
L = ["Young", "Adult", "Elderly"]
Data["age_group"] = pd.cut(Data["age"], bins = B, labels = L, right = False)

> ### 3-3. Filling NaN in whole dataset with any type of data

In [None]:
for col in Data.columns:
    if Data[col].dtype == "int64" or Data[col].dtype == "float64":    # Check for numeric types
        Data[col] = Data[col].fillna(Data[col].mean())
    elif Data[col].dtype == "object":                                 # Check for string types
        Data[col] = Data[col].fillna("UNKNOWN")

> ### 3-4. Calculate BMI and place it in a new column
> 1. BMI = weight (kg) / height (m) ** 2
> 2. BMI = ( weight (lbs) / height (inches) ** 2 ) * 703

In [None]:
Data["BMI"] = ( Data["weight"] / (Data["height"] ** 2) ) * 703

In [None]:
Data

> # 4. Exploratory Data Analysis (EDA)

> ### 4-1. Histogram of dataset after cleaning and preparation step

In [None]:

Data.hist(bins = 30, figsize = (20, 15), edgecolor = "white")
plt.grid(axis = "x", alpha = 0.8)
plt.suptitle("Histogram of Data", fontsize = 20)
plt.tight_layout()
plt.show()

> ### 4-2. Boxplot of dataset

In [None]:
Row , Col = 5, 4          # Count of rows and columns for chart
fig, axes = plt.subplots(Row, Col, figsize = (15, 10))
plt.suptitle("BoxPlot of Dataset", fontsize = 20)

axes = axes.flatten()     # Flatten the axes, for easier iteration

for i, column in enumerate(Data.columns):       # Loop through numeric columns and create boxplots
    sns.boxplot(data = Data, x = column, ax = axes[i])

for j in range(len(Data.columns), len(axes)):   # Remove any remaining empty subplots
    fig.delaxes(axes[j])

plt.tight_layout()
plt.grid()
plt.show()

> ### 4-3. Pairplot of dataset with age_group filter 

In [None]:
R = sns.pairplot(Data, hue = "age_group")

> ### 4-4. Correlation of dataset
>   Correlation is for numeric data not string data

In [None]:
Num_Data = Data.copy()

# Create special dictionary for strings
gender_map = {"male" :1, "female" :2 }
age_group_map = {"Young" :1, "Adult":2, "Elderly":3}

# Using map of Pandas to convert strings to special numeric data
Num_Data["gender"] = Num_Data["gender"].map(gender_map)
Num_Data["age_group"] = Num_Data["age_group"].map(age_group_map)

# Calculate Correlation
Num_Data.corr()

> ### 4-5. Heatmap of dataset

In [None]:

plt.figure(figsize = (22, 15))
M = np.triu(Num_Data.corr())
sns.heatmap(Num_Data.corr(), annot = True, mask = M, cmap = "RdYlGn")
plt.suptitle("HeatMap of Correlation", fontsize = 20)
plt.show()

> ### 4-6. Boxplot of dataset with details

In [None]:
numerical = Num_Data.columns.tolist()

# Create subplots
fig = make_subplots(rows = len(numerical), cols = 1)

# Add box plots for each numeric column
for i, column in enumerate(numerical):
    fig.add_trace(go.Box(x = Num_Data[column], name = column), row = i + 1, col = 1)

# Update layout
fig.update_layout(title_text = "Box Plots of Numeric Columns", showlegend = False, width = 1200, height = 1200)

# Show the figure
fig.show()

> ### 4-7. Strip plot and violin plot

> ### 4-7-1. for Hemoglobin A1c

In [None]:
# Define color palettes
palette1 = ["#EB3324", "#0023F5"]             # Two colors for gender
palette2 = ["#F08650", "#741B7C", "#4d6a73"]  # Three colors for age groups

# Create subplots
fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (20, 10), dpi = 400)
fig.suptitle("Gender and Age Group vs Glyhb Distribution", fontsize = 20)

# Strip plot and violin plot for gender
sns.stripplot(x = "glyhb", y = "gender", data = Data, hue = "gender", palette = palette1, orient = "h", ax = axes[0], dodge = True, legend = False)
sns.violinplot(x =  "glyhb", y = "gender", data = Data, hue = "gender", palette = palette1, orient = "h", ax = axes[0], alpha = 0.5, legend = False)
axes[0].set_ylabel("Gender", fontsize = 17)

# Strip plot and violin plot for age group
sns.stripplot(x = "glyhb", y = "age_group", data = Data, hue = "age_group", palette = palette2, orient = "h", ax = axes[1], dodge = True, legend = False)
sns.violinplot(x =  "glyhb", y = "age_group", data = Data, hue = "age_group", palette = palette2, orient = "h", ax = axes[1], alpha = 0.5, legend = False)
axes[1].set_ylabel("Age Group", fontsize = 17)

# Show the plots
plt.tight_layout(rect = [0, 0, 1, 0.95])      # Adjust layout to make room for the main title
plt.show()

> ### 4-7-2. for Stabilized Glucose

In [None]:
# Define color palettes
palette1 = ["#EB3324", "#0023F5"]             
palette2 = ["#F08650", "#741B7C", "#4d6a73"]  

# Create subplots
fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (20, 10), dpi = 400)
fig.suptitle("Gender/ Age Group vs stab.glu distribution", fontsize = 20)

# Strip plot and violin plot for gender
sns.stripplot(x = "stab.glu", y = "gender", data = Data, hue = "gender", palette = palette1, orient = "h", ax = axes[0], dodge = True, legend = False)
sns.violinplot(x = "stab.glu", y = "gender", data = Data, hue = "gender", palette = palette1, orient = "h", ax = axes[0], alpha = 0.5, legend = False)
axes[0].set_ylabel("Gender", fontsize = 17)
axes[0].set_yticks([0, 1])  
axes[0].set_yticklabels(["Male", "Female"])  

# Strip plot and violin plot for age group
sns.stripplot(x = "stab.glu", y = "age_group", data = Data, hue = "age_group", palette = palette2, orient = "h", ax = axes[1], dodge = True, legend = False)
sns.violinplot(x = "stab.glu", y = "age_group", data = Data, hue = "age_group", palette = palette2, orient = "h", ax = axes[1], alpha = 0.5, legend = False)
axes[1].set_ylabel("Age Group", fontsize = 17)

# Show the plots
plt.tight_layout(rect = [0, 0, 1, 0.95])
plt.show()

> ### 4-8. Regression Plot for Hemoglobin A1c vs Stabilized Glucose

In [None]:
plt.figure(figsize = (10, 5))
sns.set_theme(style = "whitegrid")
sns.regplot(x = "stab.glu", y = "glyhb", data = Data, marker = "o", color = "blue", 
            scatter_kws =  {"s": 100, "alpha": 0.7, "edgecolor": "w"}, line_kws = {"color": "red", "linewidth": 2})
plt.title("Regression Plot of Stab.glu vs Glyhb")
plt.xlabel("Stab.glu (mg/dL)")
plt.ylabel("Glyhb (%)")
plt.show()

> ### 4-9. Calculate diabetes status and place it in a new column

In [None]:
def categorize_patient(row):
    glyhb = row["glyhb"]
    stab_glu = row["stab.glu"]
    
    if glyhb <=  5.7 or glyhb <=  5.7 and stab_glu <=  99:
        return "Normal"
    elif 5.7 < glyhb < 6.5 or 5.7 < glyhb < 6.5 and 99 < stab_glu < 125:
        return "Prediabetes"
    elif glyhb >=  6.5 or glyhb >=  6.5 and stab_glu >=  125:
        return "Diabetes"
    else:
        return "Unknown"
    
Data["Diabetes_Status"] = Data.apply(categorize_patient, axis = 1)

#Data
print(Data["Diabetes_Status"].value_counts())


# Pie chart
status_counts = Data["Diabetes_Status"].value_counts()
plt.figure(figsize = (6, 6))
plt.pie(status_counts , autopct = "%1.1f%%", startangle = 200)
plt.title("Distribution of Diabetes Status")
plt.axis("equal")
plt.axis("equal")
plt.legend(labels = status_counts.index)
plt.show()

> ### 4-10. Compare features with each other

> ### 4-10-1. Hemoglobin A1c vs Stabilized Glucose

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["glyhb"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)

plt.title("Hemoglobin A1c vs Stabilized Glucose", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("glyhb (%)", fontsize = 15)

# Vertical lines for stab.glu levels
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(80, 16.5, "Normal", fontsize = 11, color = "black")
plt.text(80, 15.5, "x < 99", fontsize = 11, color = "black")
plt.text(99.3, 16, "Prediabetes", fontsize = 11, color = "black")
plt.text(99.3, 15, "99 < x < 126", fontsize = 11, color = "black")
plt.text(128, 15, "Diabetes", fontsize = 11, color = "blue")
plt.text(128, 14, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for glyhb levels
plt.axhline(y = 5.7, color = "black", linestyle = "--")
plt.axhline(y = 6.4, color = "red", linestyle = "-")

# Text annotations for glyhb levels
plt.text(300, 5, "y < 5.7 : Normal", fontsize = 11, color = "black")
plt.text(300, 5.8, "5.7 < y < 6.4 : Prediabetes", fontsize = 11, color = "black")
plt.text(300, 6.9, "y > 6.4 : Diabetes", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-2. Cholesterol Ratio vs Cholesterol

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["chol"], subset["ratio"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)

plt.title("Cholesterol Ratio vs Cholesterol", fontsize = 30)
plt.xlabel("chol (mg/dl)", fontsize = 15)
plt.ylabel("ratio (Number)", fontsize = 15)

# Vertical lines for chol levels
plt.axvline(x = 199, color = "black", linestyle = "--")
plt.axvline(x = 239, color = "blue", linestyle = "-")

# Text annotations for chol levels
plt.text(178,17.5, "Desirable", fontsize = 11, color = "black")
plt.text(178, 16.5, "x < 199", fontsize = 11, color = "black")
plt.text(205,17.5, "Borderline", fontsize = 11, color = "black")
plt.text(205, 16.5, "199 < x < 239", fontsize = 11, color = "black")
plt.text(245, 17.5, "High", fontsize = 11, color = "blue")
plt.text(245, 16.5, "x > 239", fontsize = 11, color = "blue")

# Horizontal lines for ratio levels
plt.axhline(y = 3.5, color = "black", linestyle = "--")
plt.axhline(y = 4.5, color = "black", linestyle = "--")
plt.axhline(y = 5, color = "red", linestyle = "-")

# Text annotations for ratio levels
plt.text(410, 2.6, "y < 3.5 : Ideal", fontsize = 11, color = "black")
plt.text(410, 3.8, "3.5 < y < 4.5 : Good", fontsize = 11, color = "black")
plt.text(410, 4.6, "4.5 < y < 5 : Moderate risk", fontsize = 11, color = "black")
plt.text(410, 5.3, "y > 5 : High risk", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-3. Cholesterol vs BMI

In [None]:
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly" : "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each gender and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["chol"], subset["BMI"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)

plt.title("Cholesterol vs BMI", fontsize = 30)
plt.xlabel("chol (mg/dl)", fontsize = 15)
plt.ylabel("BMI (Number)", fontsize = 15)

# Vertical lines for chol levels
plt.axvline(x = 199, color = "black", linestyle = "--")
plt.axvline(x = 239, color = "blue", linestyle = "-")

# Text annotations for chol levels
plt.text(174, 55, "Desirable", fontsize = 11, color = "black")
plt.text(174, 52, "x < 199", fontsize = 11, color = "black")
plt.text(205, 55, "Borderline", fontsize = 11, color = "black")
plt.text(205, 52, "199 < x < 239", fontsize = 11, color = "black")
plt.text(245, 55, "High", fontsize = 11, color = "blue")
plt.text(245, 52, "x > 239", fontsize = 11, color = "blue")

# Horizontal lines for BMI levels
plt.axhline(y = 18.5, color = "black", linestyle = "--")
plt.axhline(y = 24.9, color = "black", linestyle = "--")
plt.axhline(y = 29.9, color = "red", linestyle = "-")

# Text annotations for BMI levels
plt.text(380, 16, "y < 18.5 : Under weight", fontsize = 11, color = "black")
plt.text(380, 21, "18.5 < y < 24.9 : Normal weight", fontsize = 11, color = "black")
plt.text(380, 27, "24.9 < y < 29.9 : Over weight", fontsize = 11, color = "black")
plt.text(380, 32, "y > 29.9 : Obesity weight", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-4. Hemoglobin A1c vs Systolic Blood Pressure

In [None]:
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly" : "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each gender and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["bp.1s"], subset["glyhb"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Hemoglobin A1c vs Systolic Blood Pressure", fontsize = 30)
plt.xlabel("bp.1s (mm Hg)", fontsize = 15)
plt.ylabel("glyhb (%)", fontsize = 15)

# Vertical lines for bp.1s levels
plt.axvline(x = 120, color = "black", linestyle = "--")
plt.axvline(x = 130, color = "black", linestyle = "--")
plt.axvline(x = 140, color = "blue", linestyle = "-")

# Text annotations for bp.1s levels
plt.text(110, 16.5, "Normal", fontsize = 11, color = "black")
plt.text(110, 15.5, "x < 120", fontsize = 11, color = "black")
plt.text(121, 15.5, "Elevated", fontsize = 11, color = "black")
plt.text(121, 14.5, "120 < x < 130", fontsize = 11, color = "black")
plt.text(131, 14, "Hypertension", fontsize = 11, color = "black")
plt.text(131, 13, "130 < x < 140", fontsize = 11, color = "black")
plt.text(141, 12.5, "emergency care", fontsize = 11, color = "blue")
plt.text(141, 11.5, "x > 140", fontsize = 11, color = "blue")

# Horizontal lines for glyhb levels
plt.axhline(y = 5.7, color = "black", linestyle = "--")
plt.axhline(y = 6.4, color = "red", linestyle = "-")

# Text annotations for glyhb levels
plt.text(233, 5, "y < 5.7 : Normal", fontsize = 11, color = "black")
plt.text(230, 5.9, "5.7 < y < 6.4 : Prediabetes", fontsize = 11, color = "black")
plt.text(227, 6.8, "y > 6.4 : Diabetes", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-5. Hemoglobin A1c vs Diastolic Blood Pressure

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["bp.1d"], subset["glyhb"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Hemoglobin A1c vs Diastolic Blood Pressure", fontsize = 30)
plt.xlabel("bp.1d (mm Hg)", fontsize = 15)
plt.ylabel("glyhb (%)", fontsize = 15)

# Vertical lines for bp.1d levels
plt.axvline(x = 80, color = "black", linestyle = "--")
plt.axvline(x = 89, color = "black", linestyle = "--")
plt.axvline(x = 99, color = "blue", linestyle = "-")

# Text annotations for bp.1d levels
plt.text(75, 15, "Normal", fontsize = 11, color = "black")
plt.text(75, 14.5, "x < 80", fontsize = 11, color = "black")
plt.text(81, 14, "Elevated", fontsize = 11, color = "black")
plt.text(81, 13.5, "80 < x < 89", fontsize = 11, color = "black")
plt.text(90, 13, "Hypertension", fontsize = 11, color = "black")
plt.text(90, 12.5, "89 < x < 99", fontsize = 11, color = "black")
plt.text(100, 12, "emergency care", fontsize = 11, color = "blue")
plt.text(100, 11.5, "x > 99", fontsize = 11, color = "blue")

# Horizontal lines for glyhb levels
plt.axhline(y = 5.7, color = "black", linestyle = "--")
plt.axhline(y = 6.4, color = "red", linestyle = "-")

# Text annotations for glyhb levels
plt.text(118, 5, "y < 5.7 : Normal", fontsize = 11, color = "black")
plt.text(115, 5.9, "5.7 < y < 6.4 : Prediabetes", fontsize = 11, color = "black")
plt.text(118, 6.8, "y > 6.4 : Diabetes", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-6. Cholesterol vs High-Density Lipoprotein

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["chol"], subset["glyhb"], marker = "o", s = 80,
                facecolors = "none", edgecolors = color, lw = 1, label = age_group)

plt.title("Cholesterol vs High-Density Lipoprotein", fontsize = 30)
plt.xlabel("chol (mg/dl)", fontsize = 15)
plt.ylabel("hdl (mg/dl)", fontsize = 15)

# Vertical lines for chol levels
plt.axvline(x = 199, color = "black", linestyle = "--")
plt.axvline(x = 239, color = "blue", linestyle = "-")

# Text annotations for chol levels
plt.text(178, 115, "Desirable  ", fontsize = 11, color = "black")
plt.text(178, 110, "x < 199", fontsize = 11, color = "black")
plt.text(205, 115, "Borderline ", fontsize = 11, color = "black")
plt.text(205, 110, "199 < x < 239", fontsize = 11, color = "black")
plt.text(245, 115, "High  ", fontsize = 11, color = "blue")
plt.text(245, 110, "x > 239", fontsize = 11, color = "blue")

# Horizontal lines for glyhb levels
plt.axhline(y = 39, color = "black", linestyle = "--")
plt.axhline(y = 59, color = "red", linestyle = "-")

# Text annotations for glyhb levels
plt.text(350, 34, "y < 39 : Low   ", fontsize = 11, color = "black")
plt.text(350, 52, "39 < y < 59 : Acceptable  ", fontsize = 11, color = "black")
plt.text(350, 63, "y > 59 : Good   ", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-7. Hemoglobin A1c vs BMI

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["glyhb"], subset["BMI"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Hemoglobin A1c vs BMI", fontsize = 30)
plt.xlabel("glyhb (%)", fontsize = 15)
plt.ylabel("BMI (Number)", fontsize = 15)

# Vertical lines for glyhb levels
plt.axvline(x = 5.7, color = "black", linestyle = "--")
plt.axvline(x = 6.4, color = "blue", linestyle = "-")

# Text annotations for glyhb levels
plt.text(5, 55, "Normal ", fontsize = 11, color = "black")
plt.text(5, 53, "x < 5.7", fontsize = 11, color = "black")
plt.text(5.8, 53, "Prediabetes ", fontsize = 11, color = "black")
plt.text(5.8, 51, "5.7 < x < 6.4", fontsize = 11, color = "black")
plt.text(6.5, 49, "Diabetes", fontsize = 11, color = "blue")
plt.text(6.5, 47, "x > 6.4", fontsize = 11, color = "blue")

# Horizontal lines for BMI levels
plt.axhline(y = 18.5, color = "black", linestyle = "--")
plt.axhline(y = 24.9, color = "black", linestyle = "--")
plt.axhline(y = 29.9, color = "red", linestyle = "-")

# Text annotations for BMI levels
plt.text(14, 16.5, "y < 18.5 : Under weight ", fontsize = 11, color = "black")
plt.text(14, 22.5, "18.5 < y < 24.9 : Normal weight ", fontsize = 11, color = "black")
plt.text(14, 28, "24.9 < y < 29.9 : Over weight ", fontsize = 11, color = "black")
plt.text(14, 32, "y > 29.9 : Obesity weight ", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-8. Stabilized Glucose vs Cholesterol

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["chol"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs Cholesterol", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("chol (mg/dl)", fontsize = 15)

# Vertical lines for stab.glu levels
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 420, "Normal ", fontsize = 11, color = "black")
plt.text(78, 405, "x < 99", fontsize = 11, color = "black")
plt.text(100, 420, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 405, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 420, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 405, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for chol levels
plt.axhline(y = 199, color = "black", linestyle = "--")
plt.axhline(y = 239, color = "red", linestyle = "-")

# Text annotations for chol levels
plt.text(350, 185, "y < 199 : Desirable ", fontsize = 11, color = "black")
plt.text(350, 215, "199 < y < 239 : Borderline ", fontsize = 11, color = "black")
plt.text(350, 245, "y > 239 : High ", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-9. Hemoglobin A1c vs Cholesterol

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["glyhb"], subset["chol"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Hemoglobin A1c vs Cholesterol", fontsize = 30)
plt.xlabel("glyhb (%)", fontsize = 15)
plt.ylabel("chol (mg/dl)", fontsize = 15)

# Vertical lines for glyhb
plt.axvline(x = 5.7, color = "black", linestyle = "--")
plt.axvline(x = 6.4, color = "blue", linestyle = "-")

# Text annotations for glyhb levels
plt.text(5, 420, "Normal ", fontsize = 11, color = "black")
plt.text(5, 405, "x < 5.7", fontsize = 11, color = "black")
plt.text(5.8, 405, "Prediabetes ", fontsize = 11, color = "black")
plt.text(5.8, 390, "5.7 < x < 6.4", fontsize = 11, color = "black")
plt.text(6.5, 380, "Diabetes", fontsize = 11, color = "blue")
plt.text(6.5, 365, "x > 6.4", fontsize = 11, color = "blue")

# Horizontal lines for chol levels
plt.axhline(y = 199, color = "black", linestyle = "--")
plt.axhline(y = 239, color = "red", linestyle = "-")

# Text annotations for chol levels
plt.text(14, 185, "y < 199 : Desirable ", fontsize = 11, color = "black")
plt.text(14, 215, "199 < y < 239 : Borderline ", fontsize = 11, color = "black")
plt.text(14, 245, "y > 239 : High ", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-10. Hemoglobin A1c vs Waist

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["glyhb"], subset["waist"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Hemoglobin A1c vs Waist", fontsize = 30)
plt.xlabel("glyhb (%)", fontsize = 15)
plt.ylabel("waist (inches)", fontsize = 15)

# Vertical lines for glyhb
plt.axvline(x = 5.7, color = "black", linestyle = "--")
plt.axvline(x = 6.4, color = "blue", linestyle = "-")

# Text annotations for glyhb levels
plt.text(5, 58, "Normal ", fontsize = 11, color = "black")
plt.text(5, 56, "x < 5.7", fontsize = 11, color = "black")
plt.text(5.8, 56, "Prediabetes ", fontsize = 11, color = "black")
plt.text(5.8, 54, "5.7 < x < 6.4", fontsize = 11, color = "black")
plt.text(6.5, 53, "Diabetes", fontsize = 11, color = "blue")
plt.text(6.5, 51, "x > 6.4", fontsize = 11, color = "blue")

# Horizontal lines for waist levels
plt.axhline(y = 35, color = "red", linestyle = "--")
plt.axhline(y = 40, color = "red", linestyle = "--")

# Text annotations for waist levels
plt.text(13, 36, "y > 35 : high risk for womens ", fontsize = 11, color = "red")
plt.text(13, 41, "y > 40 : high risk for mens", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-11. Stabilized Glucose vs Waist

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["waist"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs Waist", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("waist (inches)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 58, "Normal ", fontsize = 11, color = "black")
plt.text(78, 56, "x < 99", fontsize = 11, color = "black")
plt.text(100, 56, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 54, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 53, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 51, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for waist levels
plt.axhline(y = 35, color = "red", linestyle = "--")
plt.axhline(y = 40, color = "red", linestyle = "--")

# Text annotations for waist levels
plt.text(320, 36, "y > 35 : high risk for womens ", fontsize = 11, color = "red")
plt.text(320, 41, "y > 40 : high risk for mens", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-12. Stabilized Glucose vs Time.ppn

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["time.ppn"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs Time.ppn", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("time.ppn (inches)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 1420, "Normal ", fontsize = 11, color = "black")
plt.text(78, 1350, "x < 99", fontsize = 11, color = "black")
plt.text(100, 1420, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 1350, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 1420, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 1350, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for time.ppn levels
plt.axhline(y = 480, color = "red", linestyle = "--")

# Text annotations for time.ppn levels
plt.text(320, 500, "y > 480 : Standard duration for testing ", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-13. Stabilized Glucose vs BMI

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["BMI"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs BMI", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("BMI (Number)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 54, "Normal ", fontsize = 11, color = "black")
plt.text(78, 52, "x < 99", fontsize = 11, color = "black")
plt.text(100, 52, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 50, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 49, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 47, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for BMI levels
plt.axhline(y = 18.5, color = "black", linestyle = "--")
plt.axhline(y = 24.9, color = "black", linestyle = "--")
plt.axhline(y = 29.9, color = "red", linestyle = "-")

# Text annotations for BMI levels
plt.text(320, 16.5, "y < 18.5 : Under weight ", fontsize = 11, color = "black")
plt.text(320, 22.5, "18.5 < y < 24.9 : Normal weight ", fontsize = 11, color = "black")
plt.text(320, 28, "24.9 < y < 29.9 : Over weight ", fontsize = 11, color = "black")
plt.text(320, 32, "y > 29.9 : Obesity weight ", fontsize = 11, color = "red")


# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-14. Stabilized Glucose vs Systolic Blood Pressure

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["bp.1s"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs Systolic Blood Pressure", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("bp.1s (mm Hg)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 240, "Normal ", fontsize = 11, color = "black")
plt.text(78, 230, "x < 99", fontsize = 11, color = "black")
plt.text(100, 240, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 230, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 240, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 230, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for bp.1s levels
plt.axhline(y = 120, color = "black", linestyle = "--")
plt.axhline(y = 130, color = "black", linestyle = "--")
plt.axhline(y = 140, color = "red", linestyle = "-")

# Text annotations for bp.1s levels
plt.text(320, 112, "Normal : y < 120", fontsize = 11, color = "black")
plt.text(320, 124, "Elevated : 120 < y < 130", fontsize = 11, color = "black")
plt.text(320, 134, "Hypertension : 130 < y < 140", fontsize = 11, color = "black")
plt.text(320, 148, "emergency care : y > 140", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-15. Stabilized Glucose vs Diastolic Blood Pressure

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["bp.1d"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs Diastolic Blood Pressure", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("bp.1d (mm Hg)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 125, "Normal ", fontsize = 11, color = "black")
plt.text(78, 122, "x < 99", fontsize = 11, color = "black")
plt.text(100, 125, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 122, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 125, "Diabetes ", fontsize = 11, color = "blue")
plt.text(130, 122, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for bp.1d levels
plt.axhline(y = 80, color = "black", linestyle = "--")
plt.axhline(y = 89, color = "black", linestyle = "--")
plt.axhline(y = 99, color = "red", linestyle = "-")

# Text annotations for bp.1d levels
plt.text(320, 70, "Normal : y < 80", fontsize = 11, color = "black")
plt.text(320, 84, "Elevated : 80 < y < 89", fontsize = 11, color = "black")
plt.text(320, 94, "Hypertension : 89 < y < 99", fontsize = 11, color = "black")
plt.text(320, 104, "emergency care : y > 99", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-16. Cholesterol vs Waist

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["chol"], subset["waist"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Cholesterol vs Waist", fontsize = 30)
plt.xlabel("chol (mg/dl)", fontsize = 15)
plt.ylabel("waist (inches)", fontsize = 15)

# Vertical lines for Cols
plt.axvline(x = 199, color = "black", linestyle = "--")
plt.axvline(x = 239, color = "blue", linestyle = "-")

# Text annotations for chol levels
plt.text(178, 55, "Desirable  ", fontsize = 11, color = "black")
plt.text(178, 53, "x < 199", fontsize = 11, color = "black")
plt.text(205, 55, "Borderline ", fontsize = 11, color = "black")
plt.text(205, 53, "199 < x < 239", fontsize = 11, color = "black")
plt.text(245, 55, "High  ", fontsize = 11, color = "blue")
plt.text(245, 53, "x > 239", fontsize = 11, color = "blue")

# Horizontal lines for waist levels
plt.axhline(y = 35, color = "red", linestyle = "--")
plt.axhline(y = 40, color = "red", linestyle = "--")

# Text annotations for waist levels
plt.text(380, 36, "y > 35 : high risk for womens ", fontsize = 11, color = "red")
plt.text(380, 41, "y > 40 : high risk for mens", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-17. Hemoglobin A1c vs High-Density Lipoprotein

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["glyhb"], subset["hdl"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw =  1, label = age_group)
    
plt.title("Hemoglobin A1c vs High-Density Lipoprotein", fontsize = 30)
plt.xlabel("glyhb (%)", fontsize = 15)
plt.ylabel("hdl (mg/dl)", fontsize = 15)

# Vertical lines for glyhb
plt.axvline(x = 5.7, color = "black", linestyle = "--")
plt.axvline(x = 6.4, color = "blue", linestyle = "-")

# Text annotations for glyhb levels
plt.text(5, 110, "Normal ", fontsize = 11, color = "black")
plt.text(5, 105, "x < 5.7", fontsize = 11, color = "black")
plt.text(5.8, 105, "Prediabetes ", fontsize = 11, color = "black")
plt.text(5.8, 100, "5.7 < x < 6.4", fontsize = 11, color = "black")
plt.text(6.5, 95, "Diabetes", fontsize = 11, color = "blue")
plt.text(6.5, 90, "x > 6.4", fontsize = 11, color = "blue")

# Horizontal lines for hdl levels
plt.axhline(y = 39, color = "black", linestyle = "--")
plt.axhline(y = 59, color = "red", linestyle = "-")

# Text annotations for hdl levels
plt.text(15, 34, "y < 39 : Low   ", fontsize = 11, color = "black")
plt.text(15, 48, "39 < y < 59 : Acceptable  ", fontsize = 11, color = "black")
plt.text(15, 63, "y > 59 : Good   ", fontsize = 11, color = "red")

# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> ### 4-10-18. Stabilized Glucose vs High-Density Lipoprotein

In [None]:
# Define colors for each age group
colors = {"Young": "#F08650", "Adult": "#377D22", "Elderly": "#FF26FA"}

# Create the figure
plt.figure(figsize = (15, 5), dpi = 400)

# Loop through each age group and plot
for age_group, color in colors.items():
    subset = Data[Data["age_group"] == age_group]
    plt.scatter(subset["stab.glu"], subset["hdl"], marker = "o", s = 80,
            facecolors = "none", edgecolors = color, lw = 1, label = age_group)
    
plt.title("Stabilized Glucose vs High-Density Lipoprotein", fontsize = 30)
plt.xlabel("stab.glu (mg/dl)", fontsize = 15)
plt.ylabel("hdl (mg/dl)", fontsize = 15)

# Vertical lines for stab.glu
plt.axvline(x = 99, color = "black", linestyle = "--")
plt.axvline(x = 126, color = "blue", linestyle = "-")

# Text annotations for stab.glu levels
plt.text(78, 110, "Normal ", fontsize = 11, color = "black")
plt.text(78, 105, "x < 99", fontsize = 11, color = "black")
plt.text(100, 105, "Prediabetes", fontsize = 11, color = "black")
plt.text(100, 100, "99 < x < 126", fontsize = 11, color = "black")
plt.text(130, 95, "Diabetes", fontsize = 11, color = "blue")
plt.text(130, 90, "x > 126", fontsize = 11, color = "blue")

# Horizontal lines for hdl levels
plt.axhline(y = 39, color = "black", linestyle = "--")
plt.axhline(y = 59, color = "red", linestyle = "-")

# Text annotations for hdl levels
plt.text(320, 34, "y < 39 : Low   ", fontsize = 11, color = "black")
plt.text(320, 48, "39 < y < 59 : Acceptable", fontsize = 11, color = "black")
plt.text(320, 63, "y > 59 : Good   ", fontsize = 11, color = "red")
 
# Set tick sizes
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Add legend
plt.legend(title = "Age_Group", fontsize = 12, title_fontsize = "13")

# Show the plot
plt.show()

> # 5. Machine Learning

In [None]:
Data.dtypes

> ### 5-1. Convert category and object variables

In [None]:
LE = LabelEncoder()
Data["gender"] = LE.fit_transform(Data["gender"])
Data["Diabetes_Status"] = LE.fit_transform(Data["Diabetes_Status"])
Data["age_group"] = Data["age_group"].cat.codes

> ### 5-2. Spliting

In [None]:
X = Data.drop(columns = ["Diabetes_Status"])  # Feature variables
y = Data["Diabetes_Status"]                   # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 42, stratify = y)

> ### 5-3. Standardization and normalization

> ### 5-3-1. Models of standardization

In [None]:
scaler_standard = StandardScaler()
X_train = scaler_standard.fit_transform(X_train)
X_test = scaler_standard.fit_transform(X_test)


> ### 5-3-1-1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter = 2000, random_state = 24)

LR.fit(X_train, y_train)

y_pred_LR = LR.predict(X_test)
acc_LR = accuracy_score(y_test, y_pred_LR)

print(classification_report(y_test, y_pred_LR, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_LR * 100))

> ### 5-3-1-2. Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

SVM = SVC(max_iter = 2000, random_state = 24)

SVM.fit(X_train, y_train)

y_pred_SVM = SVM.predict(X_test)
acc_SVM = accuracy_score(y_test, y_pred_SVM)

print(classification_report(y_test, y_pred_SVM, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_SVM * 100))

> ### 5-3-1-3. K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier()

KNN.fit(X_train, y_train)

y_pred_KNN = KNN.predict(X_test)
acc_KNN = accuracy_score(y_test, y_pred_KNN)

print(classification_report(y_test, y_pred_KNN, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_KNN * 100))

> ### 5-3-1-4. Gradient Boosting Machines (GBM)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBM = GradientBoostingClassifier(random_state = 24)

GBM.fit(X_train, y_train)

y_pred_GBM = GBM.predict(X_test)
acc_GBM = accuracy_score(y_test, y_pred_GBM)

print(classification_report(y_test, y_pred_GBM, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_GBM * 100))

> ### 5-3-1-5. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(max_depth = 2000 , random_state = 24)

RFC.fit(X_train, y_train)

y_pred_RFC = RFC.predict(X_test)
acc_RFC = accuracy_score(y_test, y_pred_RFC)

print(classification_report(y_test, y_pred_RFC, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_RFC * 100))

> ### 5-3-1-6. Neural Networks (Multi-Layer Perceptron)

In [None]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier(max_iter = 2000,random_state = 24)

MLP.fit(X_train, y_train)

y_pred_MLP = MLP.predict(X_test)
acc_MLP = accuracy_score(y_test, y_pred_MLP)

print(classification_report(y_test, y_pred_MLP, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_MLP * 100))

> ### 5-3-1-7. Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

ETC = ExtraTreesClassifier(max_depth = 2000, random_state = 24)

ETC.fit(X_train, y_train)

y_pred_ETC = ETC.predict(X_test)
acc_ETC = accuracy_score(y_test, y_pred_ETC)

print(classification_report(y_test, y_pred_ETC, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_ETC * 100))

> ### 5-3-1-8. Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier(random_state = 24)

DT.fit(X_train, y_train)

y_pred_DT = DT.predict(X_test)
acc_DT = accuracy_score(y_test, y_pred_DT)

print(classification_report(y_test, y_pred_DT, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_DT * 100))

> ### 5-3-1-9. Confusion Matrix for all models

In [None]:
# Create subplots for confusion matrices
fig, axes = plt.subplots(4, 2, figsize = (12, 12))  # 4 rows, 2 columns

# Confusion Matrix for Logistic Regression
sns.heatmap(confusion_matrix(y_test, y_pred_LR), annot = True, fmt = "d", cmap = "Blues", ax = axes[0, 0])
axes[0, 0].set_title("Confusion Matrix - LR")
axes[0, 0].set_xlabel("Predicted")
axes[0, 0].set_ylabel("Actual")

# Confusion Matrix for SVM
sns.heatmap(confusion_matrix(y_test, y_pred_SVM), annot = True, fmt = "d", cmap = "Greens", ax = axes[0, 1])
axes[0, 1].set_title("Confusion Matrix - SVM")
axes[0, 1].set_xlabel("Predicted")
axes[0, 1].set_ylabel("Actual")

# Confusion Matrix for KNN
sns.heatmap(confusion_matrix(y_test, y_pred_KNN), annot = True, fmt = "d", cmap = "Blues", ax = axes[1, 0])
axes[1, 0].set_title("Confusion Matrix - KNN")
axes[1, 0].set_xlabel("Predicted")
axes[1, 0].set_ylabel("Actual")

# Confusion Matrix for GBM
sns.heatmap(confusion_matrix(y_test, y_pred_GBM), annot = True, fmt = "d", cmap = "Greens", ax = axes[1, 1])
axes[1, 1].set_title("Confusion Matrix - GBM")
axes[1, 1].set_xlabel("Predicted")
axes[1, 1].set_ylabel("Actual")

# Confusion Matrix for RFC
sns.heatmap(confusion_matrix(y_test, y_pred_RFC), annot = True, fmt = "d", cmap = "Blues", ax = axes[2, 0])
axes[2, 0].set_title("Confusion Matrix - RFC")
axes[2, 0].set_xlabel("Predicted")
axes[2, 0].set_ylabel("Actual")

# Confusion Matrix for MLP
sns.heatmap(confusion_matrix(y_test, y_pred_MLP), annot = True, fmt = "d", cmap = "Greens", ax = axes[2, 1])
axes[2, 1].set_title("Confusion Matrix - MLP")
axes[2, 1].set_xlabel("Predicted")
axes[2, 1].set_ylabel("Actual")

# Confusion Matrix for ETC
sns.heatmap(confusion_matrix(y_test, y_pred_ETC), annot = True, fmt = "d", cmap = "Blues", ax = axes[3, 0])
axes[3, 0].set_title("Confusion Matrix - ETC")
axes[3, 0].set_xlabel("Predicted")
axes[3, 0].set_ylabel("Actual")

# Confusion Matrix for DT
sns.heatmap(confusion_matrix(y_test, y_pred_DT), annot = True, fmt = "d", cmap = "Greens", ax = axes[3, 1])
axes[3,1].set_title("Confusion Matrix - DT")
axes[3,1].set_xlabel("Predicted")
axes[3,1].set_ylabel("Actual")

# Adjust layout
plt.tight_layout()
plt.show()

> ### 5-3-1-10. Conclusion
> In standard data, as it's clear, Random Forest Classifier predict more TP for each class (Diabetes Status) and had higher accuracy.

>

> ### 5-3-2. Models of Normalization

In [None]:
scaler_normal = MinMaxScaler()
X_train = scaler_normal.fit_transform(X_train)
X_test = scaler_normal.fit_transform(X_test)

> ### 5-3-2-1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LR2 = LogisticRegression(max_iter = 2000)

LR2.fit(X_train, y_train)

y_pred_LR2 = LR2.predict(X_test)
acc_LR2 = accuracy_score(y_test, y_pred_LR2)

print(classification_report(y_test, y_pred_LR2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_LR2 * 100))

> ### 5-3-2-2. Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

model_SVM2 = SVC(max_iter = 2000)

model_SVM2.fit(X_train, y_train)

y_pred_SVM2 = model_SVM2.predict(X_test)
acc_SVM2 = accuracy_score(y_test, y_pred_SVM2)

print(classification_report(y_test, y_pred_SVM2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_SVM2 * 100))

> ### 5-3-2-3. K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model_KNN2 = KNeighborsClassifier()

model_KNN2.fit(X_train, y_train)

y_pred_KNN2 = model_KNN2.predict(X_test)
acc_KNN2 = accuracy_score(y_test, y_pred_KNN2)

print(classification_report(y_test, y_pred_KNN2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_KNN2 * 100))

> ### 5-3-2-4. Gradient Boosting Machines (GBM)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model_GBM2 = GradientBoostingClassifier()

model_GBM2.fit(X_train, y_train)

y_pred_GBM2 = model_GBM2.predict(X_test)
acc_GBM2 = accuracy_score(y_test, y_pred_GBM2)

print(classification_report(y_test, y_pred_GBM2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_GBM2 * 100))

> ### 5-3-2-5. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_RFC2 = RandomForestClassifier()

model_RFC2.fit(X_train, y_train)

y_pred_RFC2 = model_RFC2.predict(X_test)
acc_RFC2 = accuracy_score(y_test, y_pred_RFC2)

print(classification_report(y_test, y_pred_RFC2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_RFC2 * 100))

> ### 5-3-2-6. Neural Networks (Multi-Layer Perceptron)

In [None]:
from sklearn.neural_network import MLPClassifier

model_MLP2 = MLPClassifier(max_iter = 2000)

model_MLP2.fit(X_train, y_train)

y_pred_MLP2 = model_MLP2.predict(X_test)
acc_MLP2 = accuracy_score(y_test, y_pred_MLP2)

print(classification_report(y_test, y_pred_MLP2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_MLP2 * 100)) 

> ### 5-3-2-7. Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

model_ETC2 = ExtraTreesClassifier()

model_ETC2.fit(X_train, y_train)

y_pred_ETC2 = model_ETC2.predict(X_test)
acc_ETC2 = accuracy_score(y_test, y_pred_ETC2)

print(classification_report(y_test, y_pred_ETC2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_ETC2 * 100))

> ### 5-3-2-8. Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

model_DT2 = DecisionTreeClassifier()

model_DT2.fit(X_train, y_train)

y_pred_DT2 = model_DT2.predict(X_test)
acc_DT2 = accuracy_score(y_test, y_pred_DT2)

print(classification_report(y_test, y_pred_DT2, zero_division = 0))
print("Accuracy Score is : {:.2f}%".format(acc_DT2 * 100))

> ### 5-3-2-9. Confusion Matrix for all models

In [None]:
# Create subplots for confusion matrices
fig, axes = plt.subplots(4, 2, figsize = (12, 12))  # 4 rows, 2 columns

# Confusion Matrix for Logistic Regression2
sns.heatmap(confusion_matrix(y_test, y_pred_LR2), annot = True, fmt = "d", cmap = "Blues", ax = axes[0, 0])
axes[0, 0].set_title("Confusion Matrix - LR2")
axes[0, 0].set_xlabel("Predicted")
axes[0, 0].set_ylabel("Actual")

# Confusion Matrix for SVM2
sns.heatmap(confusion_matrix(y_test, y_pred_SVM2), annot = True, fmt = "d", cmap = "Greens", ax = axes[0, 1])
axes[0, 1].set_title("Confusion Matrix - SVM2")
axes[0, 1].set_xlabel("Predicted")
axes[0, 1].set_ylabel("Actual")

# Confusion Matrix for KNN2
sns.heatmap(confusion_matrix(y_test, y_pred_KNN2), annot = True, fmt = "d", cmap = "Blues", ax = axes[1, 0])
axes[1, 0].set_title("Confusion Matrix - KNN2")
axes[1, 0].set_xlabel("Predicted")
axes[1, 0].set_ylabel("Actual")

# Confusion Matrix for GBM2
sns.heatmap(confusion_matrix(y_test, y_pred_GBM2), annot = True, fmt = "d", cmap = "Greens", ax = axes[1, 1])
axes[1, 1].set_title("Confusion Matrix - GBM2")
axes[1, 1].set_xlabel("Predicted")
axes[1, 1].set_ylabel("Actual")

# Confusion Matrix for RFC2
sns.heatmap(confusion_matrix(y_test, y_pred_RFC2), annot = True, fmt = "d", cmap = "Blues", ax = axes[2, 0])
axes[2, 0].set_title("Confusion Matrix - RFC2")
axes[2, 0].set_xlabel("Predicted")
axes[2, 0].set_ylabel("Actual")

# Confusion Matrix for MLP2
sns.heatmap(confusion_matrix(y_test, y_pred_MLP2), annot = True, fmt = "d", cmap = "Greens", ax = axes[2, 1])
axes[2, 1].set_title("Confusion Matrix - MLP2")
axes[2, 1].set_xlabel("Predicted")
axes[2, 1].set_ylabel("Actual")

# Confusion Matrix for ETC2
sns.heatmap(confusion_matrix(y_test, y_pred_ETC2), annot = True, fmt = "d", cmap = "Blues", ax = axes[3, 0])
axes[3, 0].set_title("Confusion Matrix - ETC2")
axes[3, 0].set_xlabel("Predicted")
axes[3, 0].set_ylabel("Actual")

# Confusion Matrix for DT2
sns.heatmap(confusion_matrix(y_test, y_pred_DT2), annot = True, fmt = "d", cmap = "Greens", ax = axes[3,1])
axes[3,1].set_title("Confusion Matrix - DT2")
axes[3,1].set_xlabel("Predicted")
axes[3,1].set_ylabel("Actual")

# Adjust layout
plt.tight_layout()
plt.show()

> ### 5-3-2-10. Conclusion
> In normal data, as it's clear, Extra Trees Classifier had higher accuracy.